Merge commit 'eee/mesa-es' into android

author: Chia-I Wu <olvaffe@gmail.com> 2009-09-15 14:16:22 +0800
committer: Chia-I Wu <olvaffe@gmail.com> 2009-09-15 14:16:22 +0800
commit: e2ba90a9cc762cf00a168f0a59d31e7dc52fc42e (patch)
tree: fe3206d7602ad935296884742980f3c4d30bd867 /src/mesa/drivers
parent: 11a4292d4eb515813b82b8d688a318adef66b3e6 (diff)
parent: b4b8800315637d9218a81c76f09df7d601710d29 (diff)
542 files changed, 62187 insertions, 31381 deletions
diff --git a/src/mesa/drivers/allegro/amesa.c b/src/mesa/drivers/allegro/amesa.c
index a9d8f62f92..0744677d2b 100644
--- a/src/mesa/drivers/allegro/amesa.c
+++ b/src/mesa/drivers/allegro/amesa.c
@@ -338,7 +338,7 @@ void AMesaDestroyBuffer(AMesaBuffer buffer)
 {
    if (buffer->Screen)     destroy_bitmap(buffer->Screen);
    if (buffer->Background) destroy_bitmap(buffer->Background);
-   _mesa_unreference_framebuffer(&buffer->GLBuffer);
+   _mesa_reference_framebuffer(&buffer->GLBuffer, NULL);
    free(buffer);
 }
 
diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index 986f751bdc..31a027e0e4 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -24,16 +24,22 @@
 
 
 #include "main/glheader.h"
+#include "main/accum.h"
 #include "main/imports.h"
 #include "main/arrayobj.h"
 #include "main/buffers.h"
+#include "main/colortab.h"
 #include "main/context.h"
+#include "main/convolve.h"
+#include "main/drawpix.h"
 #include "main/framebuffer.h"
 #include "main/mipmap.h"
 #include "main/queryobj.h"
+#include "main/rastpos.h"
 #include "main/renderbuffer.h"
 #include "main/texcompress.h"
 #include "main/texformat.h"
+#include "main/texgetimage.h"
 #include "main/teximage.h"
 #include "main/texobj.h"
 #include "main/texstore.h"
@@ -44,6 +50,9 @@
 #include "main/fbobject.h"
 #include "main/texrender.h"
 #endif
+#if FEATURE_ARB_sync
+#include "main/syncobj.h"
+#endif
 
 #include "shader/program.h"
 #include "shader/prog_execute.h"
@@ -80,12 +89,13 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
 
    /* framebuffer/image functions */
    driver->Clear = _swrast_Clear;
-   driver->Accum = _swrast_Accum;
-   driver->RasterPos = _tnl_RasterPos;
-   driver->DrawPixels = _swrast_DrawPixels;
+
+   _MESA_INIT_ACCUM_FUNCTIONS(driver, _swrast_);
+   _MESA_INIT_DRAWPIX_FUNCTIONS(driver, _swrast_);
+
+   _MESA_INIT_RASTPOS_FUNCTIONS(driver, _tnl_);
+
    driver->ReadPixels = _swrast_ReadPixels;
-   driver->CopyPixels = _swrast_CopyPixels;
-   driver->Bitmap = _swrast_Bitmap;
 
    /* Texture functions */
    driver->ChooseTextureFormat = _mesa_choose_tex_format;
@@ -125,18 +135,15 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->UpdateTexturePalette = NULL;
 
    /* imaging */
-   driver->CopyColorTable = _swrast_CopyColorTable;
-   driver->CopyColorSubTable = _swrast_CopyColorSubTable;
-   driver->CopyConvolutionFilter1D = _swrast_CopyConvolutionFilter1D;
-   driver->CopyConvolutionFilter2D = _swrast_CopyConvolutionFilter2D;
+   /* swrast does not need UpdateTexturePalette */
+#define _swrast_UpdateTexturePalette NULL
+   _MESA_INIT_COLORTABLE_FUNCTIONS(driver, _swrast_);
+   _MESA_INIT_CONVOLVE_FUNCTIONS(driver, _swrast_);
 
    /* Vertex/fragment programs */
    driver->BindProgram = NULL;
    driver->NewProgram = _mesa_new_program;
    driver->DeleteProgram = _mesa_delete_program;
-#if FEATURE_MESA_program_debug
-   driver->GetProgramRegister = _mesa_get_program_register;
-#endif /* FEATURE_MESA_program_debug */
 
    /* simple state commands */
    driver->AlphaFunc = NULL;
@@ -202,17 +209,17 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->GetDoublev = NULL;
    driver->GetFloatv = NULL;
    driver->GetIntegerv = NULL;
+   driver->GetInteger64v = NULL;
    driver->GetPointerv = NULL;
    
-#if FEATURE_ARB_vertex_buffer_object
-   driver->NewBufferObject = _mesa_new_buffer_object;
-   driver->DeleteBuffer = _mesa_delete_buffer_object;
-   driver->BindBuffer = NULL;
-   driver->BufferData = _mesa_buffer_data;
-   driver->BufferSubData = _mesa_buffer_subdata;
-   driver->GetBufferSubData = _mesa_buffer_get_subdata;
-   driver->MapBuffer = _mesa_buffer_map;
-   driver->UnmapBuffer = _mesa_buffer_unmap;
+   /* buffer objects */
+   _mesa_init_buffer_object_functions(driver);
+
+   /* query objects */
+   _mesa_init_query_object_functions(driver);
+
+#if FEATURE_ARB_sync
+   _mesa_init_sync_object_functions(driver);
 #endif
 
 #if FEATURE_EXT_framebuffer_object
@@ -227,13 +234,6 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->BlitFramebuffer = _swrast_BlitFramebuffer;
 #endif
 
-   /* query objects */
-   driver->NewQueryObject = _mesa_new_query_object;
-   driver->DeleteQuery = _mesa_delete_query;
-   driver->BeginQuery = _mesa_begin_query;
-   driver->EndQuery = _mesa_end_query;
-   driver->WaitQuery = _mesa_wait_query;
-
    /* APPLE_vertex_array_object */
    driver->NewArrayObject = _mesa_new_array_object;
    driver->DeleteArrayObject = _mesa_delete_array_object;
@@ -308,7 +308,7 @@ _mesa_init_driver_state(GLcontext *ctx)
    ctx->Driver.Enable(ctx, GL_LINE_SMOOTH, ctx->Line.SmoothFlag);
    ctx->Driver.Enable(ctx, GL_POLYGON_STIPPLE, ctx->Polygon.StippleFlag);
    ctx->Driver.Enable(ctx, GL_SCISSOR_TEST, ctx->Scissor.Enabled);
-   ctx->Driver.Enable(ctx, GL_STENCIL_TEST, ctx->Stencil.Enabled);
+   ctx->Driver.Enable(ctx, GL_STENCIL_TEST, ctx->Stencil._Enabled);
    ctx->Driver.Enable(ctx, GL_TEXTURE_1D, GL_FALSE);
    ctx->Driver.Enable(ctx, GL_TEXTURE_2D, GL_FALSE);
    ctx->Driver.Enable(ctx, GL_TEXTURE_RECTANGLE_NV, GL_FALSE);
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
new file mode 100644
index 0000000000..05909cfa30
--- /dev/null
+++ b/src/mesa/drivers/common/meta.c
@@ -0,0 +1,2039 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.6
+ *
+ * Copyright (C) 2009  VMware, Inc.  All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * Meta operations.  Some GL operations can be expressed in terms of
+ * other GL operations.  For example, glBlitFramebuffer() can be done
+ * with texture mapping and glClear() can be done with polygon rendering.
+ *
+ * \author Brian Paul
+ */
+
+
+#include "main/glheader.h"
+#include "main/mtypes.h"
+#include "main/imports.h"
+#include "main/arrayobj.h"
+#include "main/blend.h"
+#include "main/bufferobj.h"
+#include "main/buffers.h"
+#include "main/depth.h"
+#include "main/enable.h"
+#include "main/fbobject.h"
+#include "main/image.h"
+#include "main/macros.h"
+#include "main/matrix.h"
+#include "main/mipmap.h"
+#include "main/polygon.h"
+#include "main/readpix.h"
+#include "main/scissor.h"
+#include "main/shaders.h"
+#include "main/stencil.h"
+#include "main/texobj.h"
+#include "main/texenv.h"
+#include "main/teximage.h"
+#include "main/texparam.h"
+#include "main/texstate.h"
+#include "main/varray.h"
+#include "main/viewport.h"
+#include "shader/program.h"
+#include "shader/arbprogram.h"
+#include "swrast/swrast.h"
+#include "drivers/common/meta.h"
+
+
+/** Byte offset of FIELD within struct vertex, cast to a void pointer */
+#define OFFSET(FIELD) ((void *) offsetof(struct vertex, FIELD))
+
+
+/**
+ * State saved by _mesa_meta_begin() and restored by _mesa_meta_end().
+ * XXX this may be incomplete...
+ */
+struct save_state
+{
+   GLbitfield SavedState;  /**< bitmask of META_* flags */
+
+   /** META_ALPHA_TEST */
+   GLboolean AlphaEnabled;
+
+   /** META_BLEND */
+   GLboolean BlendEnabled;
+   GLboolean ColorLogicOpEnabled;
+
+   /** META_COLOR_MASK */
+   GLubyte ColorMask[4];
+
+   /** META_DEPTH_TEST */
+   struct gl_depthbuffer_attrib Depth;
+
+   /** META_FOG */
+   GLboolean Fog;  /**< only written when META_FOG is requested */
+
+   /** META_PIXEL_STORE */
+   struct gl_pixelstore_attrib Pack, Unpack;
+
+   /** META_RASTERIZATION */
+   GLenum FrontPolygonMode, BackPolygonMode;
+   GLboolean PolygonOffset;  /**< copy of ctx->Polygon.OffsetFill */
+   GLboolean PolygonSmooth;
+   GLboolean PolygonStipple;
+   GLboolean PolygonCull;
+
+   /** META_SCISSOR */
+   struct gl_scissor_attrib Scissor;
+
+   /** META_SHADER */
+   GLboolean VertexProgramEnabled;
+   struct gl_vertex_program *VertexProgram;  /**< plain pointer copy */
+   GLboolean FragmentProgramEnabled;
+   struct gl_fragment_program *FragmentProgram;  /**< plain pointer copy */
+   GLuint Shader;  /**< name of ctx->Shader.CurrentProgram, or 0 if none */
+
+   /** META_STENCIL_TEST */
+   struct gl_stencil_attrib Stencil;
+
+   /** META_TRANSFORM */
+   GLenum MatrixMode;
+   GLfloat ModelviewMatrix[16];
+   GLfloat ProjectionMatrix[16];
+   GLfloat TextureMatrix[16];  /**< top of ctx->TextureMatrixStack[0] only */
+   GLbitfield ClipPlanesEnabled;
+
+   /** META_TEXTURE */
+   GLuint ActiveUnit;
+   GLuint ClientActiveUnit;
+   /** for unit[0] only */
+   struct gl_texture_object *CurrentTexture[NUM_TEXTURE_TARGETS];
+   /** mask of TEXTURE_2D_BIT, etc */
+   GLbitfield TexEnabled[MAX_TEXTURE_UNITS];
+   GLbitfield TexGenEnabled[MAX_TEXTURE_UNITS];
+   GLuint EnvMode;  /* unit[0] only */
+
+   /** META_VERTEX (references held from meta_begin until meta_end) */
+   struct gl_array_object *ArrayObj;
+   struct gl_buffer_object *ArrayBufferObj;
+
+   /** META_VIEWPORT */
+   GLint ViewportX, ViewportY, ViewportW, ViewportH;
+   GLclampd DepthNear, DepthFar;
+
+   /** Miscellaneous (saved and disabled for every meta op, no flag needed) */
+   GLboolean Lighting;
+};
+
+
+/**
+ * Temporary texture used for glBlitFramebuffer, glDrawPixels, etc.
+ * This is currently shared by all the meta ops except glBitmap, which has
+ * its own.  We could create one for each of glDrawPixels, glCopyPixels, etc.
+ */
+struct temp_texture
+{
+   GLuint TexObj;         /**< Texture name; 0 until init_temp_texture() */
+   GLenum Target;         /**< GL_TEXTURE_2D or GL_TEXTURE_RECTANGLE */
+   GLsizei MinSize;       /**< Min texture size to allocate */
+   GLsizei MaxSize;       /**< Max possible texture size */
+   GLboolean NPOT;        /**< Non-power of two size OK? */
+   GLsizei Width, Height; /**< Current texture size */
+   GLenum IntFormat;      /**< Internal format recorded by alloc_texture() */
+   GLfloat Sright, Ttop;  /**< right, top texcoords */
+};
+
+
+/**
+ * State for glBlitFramebuffer()
+ */
+struct blit_state
+{
+   GLuint ArrayObj;  /**< deleted with _mesa_DeleteVertexArraysAPPLE() */
+   GLuint VBO;       /**< deleted with _mesa_DeleteBuffersARB() */
+   GLuint DepthFP;   /**< deleted with _mesa_DeletePrograms() */
+};
+
+
+/**
+ * State for glClear(), i.e. _mesa_meta_clear()
+ */
+struct clear_state
+{
+   GLuint ArrayObj;  /**< deleted with _mesa_DeleteVertexArraysAPPLE() */
+   GLuint VBO;       /**< deleted with _mesa_DeleteBuffersARB() */
+};
+
+
+/**
+ * State for glCopyPixels(), i.e. _mesa_meta_copy_pixels()
+ */
+struct copypix_state
+{
+   GLuint ArrayObj;  /**< deleted with _mesa_DeleteVertexArraysAPPLE() */
+   GLuint VBO;       /**< deleted with _mesa_DeleteBuffersARB() */
+};
+
+
+/**
+ * State for glDrawPixels(), i.e. _mesa_meta_draw_pixels()
+ */
+struct drawpix_state
+{
+   GLuint ArrayObj;  /**< deleted with _mesa_DeleteVertexArraysAPPLE() */
+   GLuint VBO;       /**< deleted with _mesa_DeleteBuffersARB() */
+
+   GLuint StencilFP;  /**< Fragment program for drawing stencil images */
+   GLuint DepthFP;  /**< Fragment program for drawing depth images */
+};
+
+
+/**
+ * State for glBitmap(), i.e. _mesa_meta_bitmap()
+ */
+struct bitmap_state
+{
+   GLuint ArrayObj;  /**< deleted with _mesa_DeleteVertexArraysAPPLE() */
+   GLuint VBO;       /**< deleted with _mesa_DeleteBuffersARB() */
+   struct temp_texture Tex;  /**< separate texture from other meta ops */
+};
+
+
+/**
+ * State for _mesa_meta_generate_mipmap()
+ */
+struct gen_mipmap_state
+{
+   GLuint ArrayObj;  /* NOTE(review): not released by _mesa_meta_free() */
+   GLuint VBO;       /* NOTE(review): not released by _mesa_meta_free() */
+   GLuint FBO;       /* presumably a framebuffer object name - TODO confirm */
+};
+
+
+/**
+ * All per-context meta state (ctx->Meta); see _mesa_meta_init()/free().
+ */
+struct gl_meta_state
+{
+   struct save_state Save;    /**< state saved during meta-ops */
+
+   struct temp_texture TempTex;  /**< shared by the non-bitmap meta ops */
+
+   struct blit_state Blit;    /**< For _mesa_meta_blit_framebuffer() */
+   struct clear_state Clear;  /**< For _mesa_meta_clear() */
+   struct copypix_state CopyPix;  /**< For _mesa_meta_copy_pixels() */
+   struct drawpix_state DrawPix;  /**< For _mesa_meta_draw_pixels() */
+   struct bitmap_state Bitmap;    /**< For _mesa_meta_bitmap() */
+   struct gen_mipmap_state Mipmap;    /**< For _mesa_meta_generate_mipmap() */
+};
+
+
+/**
+ * Allocate this context's meta-op state and attach it to ctx->Meta.
+ * Call exactly once, while the context is being created.
+ */
+void
+_mesa_meta_init(GLcontext *ctx)
+{
+   struct gl_meta_state *meta = CALLOC_STRUCT(gl_meta_state);
+   ASSERT(ctx->Meta == NULL);
+   ctx->Meta = meta;
+}
+
+
+/**
+ * Free context meta-op state; call once during context destruction.
+ * NOTE(review): meta->Mipmap's ArrayObj/VBO/FBO are not deleted here.
+ */
+void
+_mesa_meta_free(GLcontext *ctx)
+{
+   struct gl_meta_state *meta = ctx->Meta;
+
+   if (meta && _mesa_get_current_context() == ctx) {
+      /* The _mesa_Delete*() calls below take no context argument, so they
+       * are only issued when ctx is the current context.  Otherwise these
+       * objects should still get freed by _mesa_free_context_data(). */
+
+      /* the temporary texture */
+      _mesa_DeleteTextures(1, &meta->TempTex.TexObj);
+
+      /* glBlitFramebuffer */
+      _mesa_DeleteBuffersARB(1, & meta->Blit.VBO);
+      _mesa_DeleteVertexArraysAPPLE(1, &meta->Blit.ArrayObj);
+      _mesa_DeletePrograms(1, &meta->Blit.DepthFP);
+
+      /* glClear */
+      _mesa_DeleteBuffersARB(1, & meta->Clear.VBO);
+      _mesa_DeleteVertexArraysAPPLE(1, &meta->Clear.ArrayObj);
+
+      /* glCopyPixels */
+      _mesa_DeleteBuffersARB(1, & meta->CopyPix.VBO);
+      _mesa_DeleteVertexArraysAPPLE(1, &meta->CopyPix.ArrayObj);
+
+      /* glDrawPixels */
+      _mesa_DeleteBuffersARB(1, & meta->DrawPix.VBO);
+      _mesa_DeleteVertexArraysAPPLE(1, &meta->DrawPix.ArrayObj);
+      _mesa_DeletePrograms(1, &meta->DrawPix.DepthFP);
+      _mesa_DeletePrograms(1, &meta->DrawPix.StencilFP);
+
+      /* glBitmap */
+      _mesa_DeleteBuffersARB(1, & meta->Bitmap.VBO);
+      _mesa_DeleteVertexArraysAPPLE(1, &meta->Bitmap.ArrayObj);
+      _mesa_DeleteTextures(1, &meta->Bitmap.Tex.TexObj);
+   }
+
+   _mesa_free(ctx->Meta);
+   ctx->Meta = NULL;
+}
+
+
+/**
+ * Enter meta state.  This is like a light-weight version of glPushAttrib
+ * but it also resets most GL state back to default values.  The saved
+ * values are kept in ctx->Meta->Save for _mesa_meta_end() to restore.
+ * \param state  bitmask of META_* flags indicating which attribute groups
+ *               to save and reset to their defaults
+ */
+static void
+_mesa_meta_begin(GLcontext *ctx, GLbitfield state)
+{
+   struct save_state *save = &ctx->Meta->Save;
+
+   save->SavedState = state;
+
+   if (state & META_ALPHA_TEST) {
+      save->AlphaEnabled = ctx->Color.AlphaEnabled;
+      if (ctx->Color.AlphaEnabled)
+         _mesa_set_enable(ctx, GL_ALPHA_TEST, GL_FALSE);
+   }
+
+   if (state & META_BLEND) {
+      save->BlendEnabled = ctx->Color.BlendEnabled;
+      if (ctx->Color.BlendEnabled)
+         _mesa_set_enable(ctx, GL_BLEND, GL_FALSE);
+      save->ColorLogicOpEnabled = ctx->Color.ColorLogicOpEnabled;
+      if (ctx->Color.ColorLogicOpEnabled)
+         _mesa_set_enable(ctx, GL_COLOR_LOGIC_OP, GL_FALSE);
+   }
+
+   if (state & META_COLOR_MASK) {
+      COPY_4V(save->ColorMask, ctx->Color.ColorMask);
+      if (!ctx->Color.ColorMask[0] ||
+          !ctx->Color.ColorMask[1] ||
+          !ctx->Color.ColorMask[2] ||
+          !ctx->Color.ColorMask[3])
+         _mesa_ColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
+   }
+
+   if (state & META_DEPTH_TEST) {
+      save->Depth = ctx->Depth; /* struct copy */
+      if (ctx->Depth.Test)
+         _mesa_set_enable(ctx, GL_DEPTH_TEST, GL_FALSE);
+   }
+
+   if (state & META_FOG) {
+      save->Fog = ctx->Fog.Enabled;
+      if (ctx->Fog.Enabled)
+         _mesa_set_enable(ctx, GL_FOG, GL_FALSE);
+   }
+
+   if (state & META_PIXEL_STORE) {
+      save->Pack = ctx->Pack;
+      save->Unpack = ctx->Unpack;
+      ctx->Pack = ctx->DefaultPacking;
+      ctx->Unpack = ctx->DefaultPacking;
+   }
+
+   if (state & META_RASTERIZATION) {
+      save->FrontPolygonMode = ctx->Polygon.FrontMode;
+      save->BackPolygonMode = ctx->Polygon.BackMode;
+      save->PolygonOffset = ctx->Polygon.OffsetFill;
+      save->PolygonSmooth = ctx->Polygon.SmoothFlag;
+      save->PolygonStipple = ctx->Polygon.StippleFlag;
+      save->PolygonCull = ctx->Polygon.CullFlag;
+      _mesa_PolygonMode(GL_FRONT_AND_BACK, GL_FILL);
+      _mesa_set_enable(ctx, GL_POLYGON_OFFSET_FILL, GL_FALSE);
+      _mesa_set_enable(ctx, GL_POLYGON_SMOOTH, GL_FALSE);
+      _mesa_set_enable(ctx, GL_POLYGON_STIPPLE, GL_FALSE);
+      _mesa_set_enable(ctx, GL_CULL_FACE, GL_FALSE);
+   }
+
+   if (state & META_SCISSOR) {
+      save->Scissor = ctx->Scissor; /* struct copy */
+   }
+
+   if (state & META_SHADER) {
+      if (ctx->Extensions.ARB_vertex_program) {
+         save->VertexProgramEnabled = ctx->VertexProgram.Enabled;
+         save->VertexProgram = ctx->VertexProgram.Current;
+         _mesa_set_enable(ctx, GL_VERTEX_PROGRAM_ARB, GL_FALSE);
+      }
+
+      if (ctx->Extensions.ARB_fragment_program) {
+         save->FragmentProgramEnabled = ctx->FragmentProgram.Enabled;
+         save->FragmentProgram = ctx->FragmentProgram.Current;
+         _mesa_set_enable(ctx, GL_FRAGMENT_PROGRAM_ARB, GL_FALSE);
+      }
+
+      if (ctx->Extensions.ARB_shader_objects) {
+         save->Shader = ctx->Shader.CurrentProgram ?
+            ctx->Shader.CurrentProgram->Name : 0;
+         _mesa_UseProgramObjectARB(0);
+      }
+   }
+
+   if (state & META_STENCIL_TEST) {
+      save->Stencil = ctx->Stencil; /* struct copy */
+      if (ctx->Stencil.Enabled)
+         _mesa_set_enable(ctx, GL_STENCIL_TEST, GL_FALSE);
+      /* NOTE: other stencil state not reset */
+   }
+
+   if (state & META_TEXTURE) {
+      GLuint u, tgt;
+
+      save->ActiveUnit = ctx->Texture.CurrentUnit;
+      save->ClientActiveUnit = ctx->Array.ActiveTexture;
+      save->EnvMode = ctx->Texture.Unit[0].EnvMode;
+
+      /* Save the enable bits of every texture unit unconditionally.
+       * _mesa_meta_end() re-enables textures/texgen from these arrays, so
+       * skipping the save when nothing is currently enabled would leave
+       * stale bits from an earlier meta op behind and wrongly turn
+       * texturing back on.  Then disable all texture units.
+       */
+      for (u = 0; u < ctx->Const.MaxTextureUnits; u++) {
+         save->TexEnabled[u] = ctx->Texture.Unit[u].Enabled;
+         save->TexGenEnabled[u] = ctx->Texture.Unit[u].TexGenEnabled;
+         /* avoid redundant state changes on units with nothing enabled */
+         if (ctx->Texture.Unit[u].Enabled ||
+             ctx->Texture.Unit[u].TexGenEnabled) {
+            _mesa_ActiveTextureARB(GL_TEXTURE0 + u);
+            _mesa_set_enable(ctx, GL_TEXTURE_1D, GL_FALSE);
+            _mesa_set_enable(ctx, GL_TEXTURE_2D, GL_FALSE);
+            _mesa_set_enable(ctx, GL_TEXTURE_3D, GL_FALSE);
+            _mesa_set_enable(ctx, GL_TEXTURE_CUBE_MAP, GL_FALSE);
+            _mesa_set_enable(ctx, GL_TEXTURE_RECTANGLE, GL_FALSE);
+            _mesa_set_enable(ctx, GL_TEXTURE_GEN_S, GL_FALSE);
+            _mesa_set_enable(ctx, GL_TEXTURE_GEN_T, GL_FALSE);
+            _mesa_set_enable(ctx, GL_TEXTURE_GEN_R, GL_FALSE);
+            _mesa_set_enable(ctx, GL_TEXTURE_GEN_Q, GL_FALSE);
+         }
+      }
+
+      /* save current texture objects for unit[0] only */
+      for (tgt = 0; tgt < NUM_TEXTURE_TARGETS; tgt++) {
+         _mesa_reference_texobj(&save->CurrentTexture[tgt],
+                                ctx->Texture.Unit[0].CurrentTex[tgt]);
+      }
+
+      /* set defaults for unit[0] */
+      _mesa_ActiveTextureARB(GL_TEXTURE0);
+      _mesa_ClientActiveTextureARB(GL_TEXTURE0);
+      _mesa_TexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
+   }
+
+   if (state & META_TRANSFORM) {
+      GLuint activeTexture = ctx->Texture.CurrentUnit;
+      _mesa_memcpy(save->ModelviewMatrix, ctx->ModelviewMatrixStack.Top->m,
+                   16 * sizeof(GLfloat));
+      _mesa_memcpy(save->ProjectionMatrix, ctx->ProjectionMatrixStack.Top->m,
+                   16 * sizeof(GLfloat));
+      _mesa_memcpy(save->TextureMatrix, ctx->TextureMatrixStack[0].Top->m,
+                   16 * sizeof(GLfloat));
+      save->MatrixMode = ctx->Transform.MatrixMode;
+      /* set 1:1 vertex:pixel coordinate transform */
+      _mesa_ActiveTextureARB(GL_TEXTURE0);
+      _mesa_MatrixMode(GL_TEXTURE);
+      _mesa_LoadIdentity();
+      _mesa_ActiveTextureARB(GL_TEXTURE0 + activeTexture);
+      _mesa_MatrixMode(GL_MODELVIEW);
+      _mesa_LoadIdentity();
+      _mesa_MatrixMode(GL_PROJECTION);
+      _mesa_LoadIdentity();
+      _mesa_Ortho(0.0F, ctx->DrawBuffer->Width,
+                  0.0F, ctx->DrawBuffer->Height,
+                  -1.0F, 1.0F);
+      save->ClipPlanesEnabled = ctx->Transform.ClipPlanesEnabled;
+      if (ctx->Transform.ClipPlanesEnabled) {
+         GLuint i;
+         for (i = 0; i < ctx->Const.MaxClipPlanes; i++) {
+            _mesa_set_enable(ctx, GL_CLIP_PLANE0 + i, GL_FALSE);
+         }
+      }
+   }
+
+   if (state & META_VERTEX) {
+      /* save vertex array object state */
+      _mesa_reference_array_object(ctx, &save->ArrayObj,
+                                   ctx->Array.ArrayObj);
+      _mesa_reference_buffer_object(ctx, &save->ArrayBufferObj,
+                                    ctx->Array.ArrayBufferObj);
+      /* set some default state? */
+   }
+
+   if (state & META_VIEWPORT) {
+      /* save viewport state */
+      save->ViewportX = ctx->Viewport.X;
+      save->ViewportY = ctx->Viewport.Y;
+      save->ViewportW = ctx->Viewport.Width;
+      save->ViewportH = ctx->Viewport.Height;
+      /* set viewport to match window size */
+      if (ctx->Viewport.X != 0 ||
+          ctx->Viewport.Y != 0 ||
+          ctx->Viewport.Width != ctx->DrawBuffer->Width ||
+          ctx->Viewport.Height != ctx->DrawBuffer->Height) {
+         _mesa_set_viewport(ctx, 0, 0,
+                            ctx->DrawBuffer->Width, ctx->DrawBuffer->Height);
+      }
+      /* save depth range state */
+      save->DepthNear = ctx->Viewport.Near;
+      save->DepthFar = ctx->Viewport.Far;
+      /* set depth range to default */
+      _mesa_DepthRange(0.0, 1.0);
+   }
+
+   /* misc */
+   {
+      save->Lighting = ctx->Light.Enabled;
+      if (ctx->Light.Enabled)
+         _mesa_set_enable(ctx, GL_LIGHTING, GL_FALSE);
+   }
+}
+
+
+/**
+ * Leave meta state: restore what _mesa_meta_begin() saved (cf. glPopAttrib).
+ */
+static void
+_mesa_meta_end(GLcontext *ctx)
+{
+   struct save_state *save = &ctx->Meta->Save;
+   const GLbitfield state = save->SavedState;
+
+   if (state & META_ALPHA_TEST) {
+      if (ctx->Color.AlphaEnabled != save->AlphaEnabled)
+         _mesa_set_enable(ctx, GL_ALPHA_TEST, save->AlphaEnabled);
+   }
+
+   if (state & META_BLEND) {
+      if (ctx->Color.BlendEnabled != save->BlendEnabled)
+         _mesa_set_enable(ctx, GL_BLEND, save->BlendEnabled);
+      if (ctx->Color.ColorLogicOpEnabled != save->ColorLogicOpEnabled)
+         _mesa_set_enable(ctx, GL_COLOR_LOGIC_OP, save->ColorLogicOpEnabled);
+   }
+
+   if (state & META_COLOR_MASK) {
+      if (!TEST_EQ_4V(ctx->Color.ColorMask, save->ColorMask))
+         _mesa_ColorMask(save->ColorMask[0], save->ColorMask[1],
+                         save->ColorMask[2], save->ColorMask[3]);
+   }
+
+   if (state & META_DEPTH_TEST) {
+      if (ctx->Depth.Test != save->Depth.Test)
+         _mesa_set_enable(ctx, GL_DEPTH_TEST, save->Depth.Test);
+      _mesa_DepthFunc(save->Depth.Func);
+      _mesa_DepthMask(save->Depth.Mask);
+   }
+
+   if (state & META_FOG) {
+      _mesa_set_enable(ctx, GL_FOG, save->Fog);
+   }
+
+   if (state & META_PIXEL_STORE) {
+      ctx->Pack = save->Pack;
+      ctx->Unpack = save->Unpack;
+   }
+
+   if (state & META_RASTERIZATION) {
+      _mesa_PolygonMode(GL_FRONT, save->FrontPolygonMode);
+      _mesa_PolygonMode(GL_BACK, save->BackPolygonMode);
+      _mesa_set_enable(ctx, GL_POLYGON_STIPPLE, save->PolygonStipple);
+      _mesa_set_enable(ctx, GL_POLYGON_OFFSET_FILL, save->PolygonOffset);
+      _mesa_set_enable(ctx, GL_POLYGON_SMOOTH, save->PolygonSmooth);
+      _mesa_set_enable(ctx, GL_CULL_FACE, save->PolygonCull);
+   }
+
+   if (state & META_SCISSOR) {
+      _mesa_set_enable(ctx, GL_SCISSOR_TEST, save->Scissor.Enabled);
+      _mesa_Scissor(save->Scissor.X, save->Scissor.Y,
+                    save->Scissor.Width, save->Scissor.Height);
+   }
+
+   if (state & META_SHADER) {
+      if (ctx->Extensions.ARB_vertex_program) {
+         _mesa_set_enable(ctx, GL_VERTEX_PROGRAM_ARB,
+                          save->VertexProgramEnabled);
+         _mesa_reference_vertprog(ctx, &ctx->VertexProgram.Current,
+                                  save->VertexProgram);
+      }
+
+      if (ctx->Extensions.ARB_fragment_program) {
+         _mesa_set_enable(ctx, GL_FRAGMENT_PROGRAM_ARB,
+                          save->FragmentProgramEnabled);
+         _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current,
+                                  save->FragmentProgram);
+      }
+
+      if (ctx->Extensions.ARB_shader_objects) {
+         _mesa_UseProgramObjectARB(save->Shader);
+      }
+   }
+
+   if (state & META_STENCIL_TEST) {
+      const struct gl_stencil_attrib *stencil = &save->Stencil;
+
+      _mesa_set_enable(ctx, GL_STENCIL_TEST, stencil->Enabled);
+      _mesa_ClearStencil(stencil->Clear);
+      if (ctx->Extensions.EXT_stencil_two_side) {
+         _mesa_set_enable(ctx, GL_STENCIL_TEST_TWO_SIDE_EXT,
+                          stencil->TestTwoSide);
+         _mesa_ActiveStencilFaceEXT(stencil->ActiveFace
+                                    ? GL_BACK : GL_FRONT);
+      }
+      /* front state */
+      _mesa_StencilFuncSeparate(GL_FRONT,
+                                stencil->Function[0],
+                                stencil->Ref[0],
+                                stencil->ValueMask[0]);
+      _mesa_StencilMaskSeparate(GL_FRONT, stencil->WriteMask[0]);
+      _mesa_StencilOpSeparate(GL_FRONT, stencil->FailFunc[0],
+                              stencil->ZFailFunc[0],
+                              stencil->ZPassFunc[0]);
+      /* back state */
+      _mesa_StencilFuncSeparate(GL_BACK,
+                                stencil->Function[1],
+                                stencil->Ref[1],
+                                stencil->ValueMask[1]);
+      _mesa_StencilMaskSeparate(GL_BACK, stencil->WriteMask[1]);
+      _mesa_StencilOpSeparate(GL_BACK, stencil->FailFunc[1],
+                              stencil->ZFailFunc[1],
+                              stencil->ZPassFunc[1]);
+   }
+
+   if (state & META_TEXTURE) {
+      GLuint u, tgt;
+
+      ASSERT(ctx->Texture.CurrentUnit == 0);
+
+      /* restore texenv for unit[0] */
+      _mesa_TexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, save->EnvMode);
+
+      /* restore texture objects for unit[0] only */
+      for (tgt = 0; tgt < NUM_TEXTURE_TARGETS; tgt++) {
+         _mesa_reference_texobj(&ctx->Texture.Unit[0].CurrentTex[tgt],
+                                save->CurrentTexture[tgt]);
+      }
+
+      /* Re-enable textures, texgen */
+      for (u = 0; u < ctx->Const.MaxTextureUnits; u++) {
+         if (save->TexEnabled[u]) {
+            _mesa_ActiveTextureARB(GL_TEXTURE0 + u);
+
+            if (save->TexEnabled[u] & TEXTURE_1D_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_1D, GL_TRUE);
+            if (save->TexEnabled[u] & TEXTURE_2D_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_2D, GL_TRUE);
+            if (save->TexEnabled[u] & TEXTURE_3D_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_3D, GL_TRUE);
+            if (save->TexEnabled[u] & TEXTURE_CUBE_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_CUBE_MAP, GL_TRUE);
+            if (save->TexEnabled[u] & TEXTURE_RECT_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_RECTANGLE, GL_TRUE);
+         }
+
+         if (save->TexGenEnabled[u]) {
+            _mesa_ActiveTextureARB(GL_TEXTURE0 + u);
+
+            if (save->TexGenEnabled[u] & S_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_GEN_S, GL_TRUE);
+            if (save->TexGenEnabled[u] & T_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_GEN_T, GL_TRUE);
+            if (save->TexGenEnabled[u] & R_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_GEN_R, GL_TRUE);
+            if (save->TexGenEnabled[u] & Q_BIT)
+               _mesa_set_enable(ctx, GL_TEXTURE_GEN_Q, GL_TRUE);
+         }
+      }
+
+      /* restore current unit state */
+      _mesa_ActiveTextureARB(GL_TEXTURE0 + save->ActiveUnit);
+      _mesa_ClientActiveTextureARB(GL_TEXTURE0 + save->ClientActiveUnit);
+   }
+
+   if (state & META_TRANSFORM) {
+      GLuint activeTexture = ctx->Texture.CurrentUnit;
+      _mesa_ActiveTextureARB(GL_TEXTURE0);
+      _mesa_MatrixMode(GL_TEXTURE);
+      _mesa_LoadMatrixf(save->TextureMatrix);
+      _mesa_ActiveTextureARB(GL_TEXTURE0 + activeTexture);
+
+      _mesa_MatrixMode(GL_MODELVIEW);
+      _mesa_LoadMatrixf(save->ModelviewMatrix);
+
+      _mesa_MatrixMode(GL_PROJECTION);
+      _mesa_LoadMatrixf(save->ProjectionMatrix);
+
+      _mesa_MatrixMode(save->MatrixMode);
+
+      /* re-enable the clip planes recorded by _mesa_meta_begin() */
+      if (save->ClipPlanesEnabled) {
+         GLuint i;
+         for (i = 0; i < ctx->Const.MaxClipPlanes; i++) {
+            if (save->ClipPlanesEnabled & (1 << i)) {
+               _mesa_set_enable(ctx, GL_CLIP_PLANE0 + i, GL_TRUE);
+            }
+         }
+      }
+   }
+
+   if (state & META_VERTEX) {
+      /* restore vertex buffer object */
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, save->ArrayBufferObj->Name);
+      _mesa_reference_buffer_object(ctx, &save->ArrayBufferObj, NULL);
+
+      /* restore vertex array object */
+      _mesa_BindVertexArray(save->ArrayObj->Name);
+      _mesa_reference_array_object(ctx, &save->ArrayObj, NULL);
+   }
+
+   if (state & META_VIEWPORT) {
+      if (save->ViewportX != ctx->Viewport.X ||
+          save->ViewportY != ctx->Viewport.Y ||
+          save->ViewportW != ctx->Viewport.Width ||
+          save->ViewportH != ctx->Viewport.Height) {
+         _mesa_set_viewport(ctx, save->ViewportX, save->ViewportY,
+                            save->ViewportW, save->ViewportH);
+      }
+      _mesa_DepthRange(save->DepthNear, save->DepthFar);
+   }
+
+   /* misc: lighting is saved and disabled by every _mesa_meta_begin().
+    * Fog is NOT handled here: it is only saved under META_FOG, so it is
+    * restored by the META_FOG case above and nowhere else.
+    */
+   if (save->Lighting) {
+      _mesa_set_enable(ctx, GL_LIGHTING, GL_TRUE);
+   }
+}
+
+
+/**
+ * One-time init for a temp_texture object: choose the texture target,
+ * record the size limits, then generate the texture object and bind it
+ * to the chosen target.
+ */
+static void
+init_temp_texture(GLcontext *ctx, struct temp_texture *tex)
+{
+   const GLboolean useRect = ctx->Extensions.NV_texture_rectangle;
+
+   /* prefer a texture rectangle; otherwise use a 2D texture, which may
+    * be NPOT only if ARB_texture_non_power_of_two is supported
+    */
+   tex->Target = useRect ? GL_TEXTURE_RECTANGLE : GL_TEXTURE_2D;
+   tex->MaxSize = useRect ? ctx->Const.MaxTextureRectSize
+                          : (1 << (ctx->Const.MaxTextureLevels - 1));
+   tex->NPOT = useRect ? GL_TRUE
+                       : ctx->Extensions.ARB_texture_non_power_of_two;
+
+   tex->MinSize = 16;  /* 16 x 16 at least */
+   assert(tex->MaxSize > 0);
+
+   _mesa_GenTextures(1, &tex->TexObj);
+   _mesa_BindTexture(tex->Target, tex->TexObj);
+}
+
+
+/**
+ * Fetch the temp_texture used by the non-bitmap meta ops,
+ * initializing it on first use.
+ */
+static struct temp_texture *
+get_temp_texture(GLcontext *ctx)
+{
+   struct temp_texture *shared = &ctx->Meta->TempTex;
+
+   /* a zero TexObj is taken to mean init_temp_texture() has not run yet */
+   if (shared->TexObj == 0)
+      init_temp_texture(ctx, shared);
+
+   return shared;
+}
+
+
+/**
+ * Fetch the temp_texture reserved for _mesa_meta_bitmap(), initializing
+ * it on first use.  Bitmaps get their own texture, separate from the one
+ * the other meta ops use, to reduce texture allocation/deallocation.
+ */
+static struct temp_texture *
+get_bitmap_temp_texture(GLcontext *ctx)
+{
+   struct temp_texture *bitmapTex = &ctx->Meta->Bitmap.Tex;
+
+   /* a zero TexObj is taken to mean init_temp_texture() has not run yet */
+   if (bitmapTex->TexObj == 0)
+      init_temp_texture(ctx, bitmapTex);
+
+   return bitmapTex;
+}
+
+
+/**
+ * Work out the texture dimensions required to draw a width x height
+ * image and update the s/t texcoords used for drawing.
+ * The existing texture can be reused (glTexSubImage2D) when it is already
+ * big enough and has the requested internal format; otherwise the stored
+ * size/format are updated and a new texture (glTexImage2D) is required.
+ *
+ * \return GL_TRUE if new texture is needed, GL_FALSE otherwise
+ */
+static GLboolean
+alloc_texture(struct temp_texture *tex,
+              GLsizei width, GLsizei height, GLenum intFormat)
+{
+   const GLboolean mustRealloc = (width > tex->Width ||
+                                  height > tex->Height ||
+                                  intFormat != tex->IntFormat);
+
+   ASSERT(width <= tex->MaxSize);
+   ASSERT(height <= tex->MaxSize);
+
+   if (mustRealloc) {
+      /* start from the minimum size and grow as needed */
+      GLsizei texW = tex->MinSize;
+      GLsizei texH = tex->MinSize;
+
+      if (tex->NPOT) {
+         /* any size is OK: use the image size if it's bigger */
+         if (width > texW)
+            texW = width;
+         if (height > texH)
+            texH = height;
+      }
+      else {
+         /* double each dimension until the image fits */
+         for (; texW < width; texW *= 2)
+            ;
+         for (; texH < height; texH *= 2)
+            ;
+      }
+
+      tex->Width = texW;
+      tex->Height = texH;
+      tex->IntFormat = intFormat;
+   }
+
+   /* rectangle targets get the image size, others a fraction of tex size */
+   if (tex->Target != GL_TEXTURE_RECTANGLE) {
+      tex->Sright = (GLfloat) width / tex->Width;
+      tex->Ttop = (GLfloat) height / tex->Height;
+   }
+   else {
+      tex->Sright = (GLfloat) width;
+      tex->Ttop = (GLfloat) height;
+   }
+
+   return mustRealloc;
+}
+
+
+/**
+ * Bind the temp texture, set its filter/env state and fill it with a
+ * width x height region of the framebuffer starting at (srcX, srcY).
+ * Used by the glCopyPixels and glBlitFramebuffer meta paths.
+ *
+ * \param newTex  GL_TRUE if the texture image must be (re)defined,
+ *                GL_FALSE if the existing image can be updated in place
+ * \param intFormat  passed as the 'format' argument when an empty
+ *                   texture image is created
+ * \param filter  value used for both the min and mag texture filters
+ */
+static void
+setup_copypix_texture(struct temp_texture *tex,
+                      GLboolean newTex,
+                      GLint srcX, GLint srcY,
+                      GLsizei width, GLsizei height, GLenum intFormat,
+                      GLenum filter)
+{
+   _mesa_BindTexture(tex->Target, tex->TexObj);
+   _mesa_TexParameteri(tex->Target, GL_TEXTURE_MIN_FILTER, filter);
+   _mesa_TexParameteri(tex->Target, GL_TEXTURE_MAG_FILTER, filter);
+   _mesa_TexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
+
+   if (newTex && tex->Width == width && tex->Height == height) {
+      /* texture and image sizes match: define the texture image and
+       * load the framebuffer data in a single step
+       */
+      _mesa_CopyTexImage2D(tex->Target, 0, tex->IntFormat,
+                           srcX, srcY, width, height, 0);
+      return;
+   }
+
+   if (newTex) {
+      /* texture is larger than the image: define an empty image first */
+      _mesa_TexImage2D(tex->Target, 0, tex->IntFormat,
+                       tex->Width, tex->Height, 0,
+                       intFormat, GL_UNSIGNED_BYTE, NULL);
+   }
+
+   /* copy the framebuffer region into the lower-left of the texture */
+   _mesa_CopyTexSubImage2D(tex->Target, 0,
+                           0, 0, srcX, srcY, width, height);
+}
+
+
+/**
+ * Bind the temp texture, set nearest filtering / replace env mode and
+ * load the given user image into it.  Used by the glDrawPixels meta path
+ * (and by the depth part of the blit path).
+ *
+ * \param newTex  GL_TRUE if the texture image must be (re)defined,
+ *                GL_FALSE if the existing image can be updated in place
+ * \param texIntFormat  not referenced here; tex->IntFormat is used as the
+ *                      internal format
+ * \param format, type, pixels  description of the incoming image data
+ */
+static void
+setup_drawpix_texture(struct temp_texture *tex,
+                      GLboolean newTex,
+                      GLenum texIntFormat,
+                      GLsizei width, GLsizei height,
+                      GLenum format, GLenum type,
+                      const GLvoid *pixels)
+{
+   _mesa_BindTexture(tex->Target, tex->TexObj);
+   _mesa_TexParameteri(tex->Target, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+   _mesa_TexParameteri(tex->Target, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+   _mesa_TexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
+
+   if (newTex && tex->Width == width && tex->Height == height) {
+      /* texture and image sizes match: define the texture image and
+       * load the pixel data in a single step
+       */
+      _mesa_TexImage2D(tex->Target, 0, tex->IntFormat,
+                       tex->Width, tex->Height, 0, format, type, pixels);
+      return;
+   }
+
+   if (newTex) {
+      /* texture is larger than the image: define an empty image first */
+      _mesa_TexImage2D(tex->Target, 0, tex->IntFormat,
+                       tex->Width, tex->Height, 0, format, type, NULL);
+   }
+
+   /* load the pixel data into the lower-left of the texture */
+   _mesa_TexSubImage2D(tex->Target, 0,
+                       0, 0, width, height, format, type, pixels);
+}
+
+
+
+/**
+ * One-time creation of the fragment program used to blit depth values.
+ * The program samples the temp texture and writes the result to the
+ * fragment's depth output.  The new program ID is stored in
+ * ctx->Meta->Blit.DepthFP and the program is left bound.
+ */
+static void
+init_blit_depth_pixels(GLcontext *ctx)
+{
+   /* template: %s is the texture target keyword */
+   static const char *program =
+      "!!ARBfp1.0\n"
+      "TEX result.depth, fragment.texcoord[0], texture[0], %s; \n"
+      "END \n";
+   struct blit_state *blit = &ctx->Meta->Blit;
+   struct temp_texture *tex = get_temp_texture(ctx);
+   const char *texTarget =
+      (tex->Target == GL_TEXTURE_RECTANGLE) ? "RECT" : "2D";
+   char program2[200];
+
+   assert(blit->DepthFP == 0);
+
+   /* the expanded text ("RECT" is the longest substitution) must fit */
+   assert(strlen(program) + 4 < sizeof(program2));
+   _mesa_snprintf(program2, sizeof(program2), program, texTarget);
+
+   _mesa_GenPrograms(1, &blit->DepthFP);
+   _mesa_BindProgram(GL_FRAGMENT_PROGRAM_ARB, blit->DepthFP);
+   _mesa_ProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
+                          strlen(program2), (const GLubyte *) program2);
+}
+
+
+/**
+ * Meta implementation of ctx->Driver.BlitFramebuffer() in terms
+ * of texture mapping and polygon rendering.
+ *
+ * The source rectangle is copied into the shared temp texture and drawn
+ * as a textured quad over the destination rectangle.  Color is copied
+ * with glCopyTex[Sub]Image2D; depth is read back with glReadPixels and
+ * written through a fragment program.  Buffers that are not handled here
+ * (stencil, or depth when the temporary allocation fails) are left in
+ * 'mask' and handed to _swrast_BlitFramebuffer() at the end.
+ *
+ * A reversed source rectangle (src*1 < src*0) is implemented by swapping
+ * the matching destination edges of the quad.  The swrast fallback still
+ * receives the caller's original source coordinates, so it must also get
+ * the caller's original destination coordinates; passing the swapped ones
+ * would apply the flip twice and cancel it.
+ *
+ * \param srcX0, srcY0, srcX1, srcY1  source rectangle (may be reversed)
+ * \param dstX0, dstY0, dstX1, dstY1  destination rectangle (may be reversed)
+ * \param mask  bitmask of GL_COLOR/DEPTH/STENCIL_BUFFER_BIT to blit
+ * \param filter  min/mag texture filter used for the color blit
+ */
+void
+_mesa_meta_blit_framebuffer(GLcontext *ctx,
+                            GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                            GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                            GLbitfield mask, GLenum filter)
+{
+   struct blit_state *blit = &ctx->Meta->Blit;
+   struct temp_texture *tex = get_temp_texture(ctx);
+   const GLsizei maxTexSize = tex->MaxSize;
+   const GLint srcX = MIN2(srcX0, srcX1);
+   const GLint srcY = MIN2(srcY0, srcY1);
+   const GLint srcW = abs(srcX1 - srcX0);
+   const GLint srcH = abs(srcY1 - srcY0);
+   const GLboolean srcFlipX = srcX1 < srcX0;
+   const GLboolean srcFlipY = srcY1 < srcY0;
+   /* caller's destination rect, kept unmodified for the final fallback */
+   const GLint origDstX0 = dstX0;
+   const GLint origDstY0 = dstY0;
+   const GLint origDstX1 = dstX1;
+   const GLint origDstY1 = dstY1;
+   struct vertex {
+      GLfloat x, y, s, t;
+   };
+   struct vertex verts[4];
+   GLboolean newTex;
+
+   if (srcW > maxTexSize || srcH > maxTexSize) {
+      /* XXX avoid this fallback */
+      _swrast_BlitFramebuffer(ctx, srcX0, srcY0, srcX1, srcY1,
+                              dstX0, dstY0, dstX1, dstY1, mask, filter);
+      return;
+   }
+
+   /* The texture is always loaded from the min corner of the source
+    * rect, so a reversed source is expressed by reversing the quad's
+    * destination edges instead.
+    */
+   if (srcFlipX) {
+      GLint tmp = dstX0;
+      dstX0 = dstX1;
+      dstX1 = tmp;
+   }
+
+   if (srcFlipY) {
+      GLint tmp = dstY0;
+      dstY0 = dstY1;
+      dstY1 = tmp;
+   }
+
+   /* only scissor effects blit so save/clear all other relevant state */
+   _mesa_meta_begin(ctx, ~META_SCISSOR);
+
+   if (blit->ArrayObj == 0) {
+      /* one-time setup */
+
+      /* create vertex array object */
+      _mesa_GenVertexArrays(1, &blit->ArrayObj);
+      _mesa_BindVertexArray(blit->ArrayObj);
+
+      /* create vertex array buffer */
+      _mesa_GenBuffersARB(1, &blit->VBO);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, blit->VBO);
+      _mesa_BufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(verts),
+                          NULL, GL_DYNAMIC_DRAW_ARB);
+
+      /* setup vertex arrays */
+      _mesa_VertexPointer(2, GL_FLOAT, sizeof(struct vertex), OFFSET(x));
+      _mesa_TexCoordPointer(2, GL_FLOAT, sizeof(struct vertex), OFFSET(s));
+      _mesa_EnableClientState(GL_VERTEX_ARRAY);
+      _mesa_EnableClientState(GL_TEXTURE_COORD_ARRAY);
+   }
+   else {
+      _mesa_BindVertexArray(blit->ArrayObj);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, blit->VBO);
+   }
+
+   newTex = alloc_texture(tex, srcW, srcH, GL_RGBA);
+
+   /* vertex positions/texcoords (after texture allocation!) */
+   {
+      verts[0].x = (GLfloat) dstX0;
+      verts[0].y = (GLfloat) dstY0;
+      verts[1].x = (GLfloat) dstX1;
+      verts[1].y = (GLfloat) dstY0;
+      verts[2].x = (GLfloat) dstX1;
+      verts[2].y = (GLfloat) dstY1;
+      verts[3].x = (GLfloat) dstX0;
+      verts[3].y = (GLfloat) dstY1;
+
+      verts[0].s = 0.0F;
+      verts[0].t = 0.0F;
+      verts[1].s = tex->Sright;
+      verts[1].t = 0.0F;
+      verts[2].s = tex->Sright;
+      verts[2].t = tex->Ttop;
+      verts[3].s = 0.0F;
+      verts[3].t = tex->Ttop;
+
+      /* upload new vertex data */
+      _mesa_BufferSubDataARB(GL_ARRAY_BUFFER_ARB, 0, sizeof(verts), verts);
+   }
+
+   _mesa_set_enable(ctx, tex->Target, GL_TRUE);
+
+   if (mask & GL_COLOR_BUFFER_BIT) {
+      setup_copypix_texture(tex, newTex, srcX, srcY, srcW, srcH,
+                            GL_RGBA, filter);
+      _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+      mask &= ~GL_COLOR_BUFFER_BIT;
+   }
+
+   if (mask & GL_DEPTH_BUFFER_BIT) {
+      /* staging buffer for the depth values read from the source */
+      GLuint *tmp = (GLuint *) _mesa_malloc(srcW * srcH * sizeof(GLuint));
+      if (tmp) {
+         if (!blit->DepthFP)
+            init_blit_depth_pixels(ctx);
+
+         /* maybe change tex format here */
+         newTex = alloc_texture(tex, srcW, srcH, GL_DEPTH_COMPONENT);
+
+         _mesa_ReadPixels(srcX, srcY, srcW, srcH,
+                          GL_DEPTH_COMPONENT, GL_UNSIGNED_INT, tmp);
+
+         setup_drawpix_texture(tex, newTex, GL_DEPTH_COMPONENT, srcW, srcH,
+                               GL_DEPTH_COMPONENT, GL_UNSIGNED_INT, tmp);
+
+         /* write depth only: color writes off, depth test always passes */
+         _mesa_BindProgram(GL_FRAGMENT_PROGRAM_ARB, blit->DepthFP);
+         _mesa_set_enable(ctx, GL_FRAGMENT_PROGRAM_ARB, GL_TRUE);
+         _mesa_ColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
+         _mesa_set_enable(ctx, GL_DEPTH_TEST, GL_TRUE);
+         _mesa_DepthFunc(GL_ALWAYS);
+         _mesa_DepthMask(GL_TRUE);
+
+         _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+         mask &= ~GL_DEPTH_BUFFER_BIT;
+
+         _mesa_free(tmp);
+      }
+      /* on allocation failure the depth bit stays set in 'mask' and is
+       * handled by the swrast fallback below
+       */
+   }
+
+   if (mask & GL_STENCIL_BUFFER_BIT) {
+      /* XXX can't easily do stencil */
+   }
+
+   _mesa_set_enable(ctx, tex->Target, GL_FALSE);
+
+   _mesa_meta_end(ctx);
+
+   if (mask) {
+      /* Blit whatever is left with swrast.  Use the caller's original
+       * destination rect: the source coords passed here still carry any
+       * reversal, so the swapped dst coords would undo the flip.
+       */
+      _swrast_BlitFramebuffer(ctx, srcX0, srcY0, srcX1, srcY1,
+                              origDstX0, origDstY0, origDstX1, origDstY1,
+                              mask, filter);
+   }
+}
+
+
+/**
+ * Meta implementation of ctx->Driver.Clear() in terms of polygon rendering.
+ *
+ * A quad spanning the draw buffer's _Xmin.._Xmax / _Ymin.._Ymax bounds is
+ * drawn with the current clear color as its vertex color, at a Z value
+ * derived from ctx->Depth.Clear.  The color mask, depth test and stencil
+ * test are set up so that only the buffers named in 'buffers' are written.
+ *
+ * \param buffers  bitmask tested against BUFFER_BITS_COLOR,
+ *                 BUFFER_BIT_DEPTH and BUFFER_BIT_STENCIL
+ */
+void
+_mesa_meta_clear(GLcontext *ctx, GLbitfield buffers)
+{
+   struct clear_state *clear = &ctx->Meta->Clear;
+   struct vertex {
+      GLfloat x, y, z, r, g, b, a;  /* position followed by RGBA color */
+   };
+   struct vertex verts[4];
+   /* save all state but scissor, pixel pack/unpack */
+   GLbitfield metaSave = META_ALL - META_SCISSOR - META_PIXEL_STORE;
+
+   if (buffers & BUFFER_BITS_COLOR) {
+      /* if clearing color buffers, don't save/restore colormask */
+      metaSave -= META_COLOR_MASK;
+   }
+
+   _mesa_meta_begin(ctx, metaSave);
+
+   if (clear->ArrayObj == 0) {
+      /* one-time setup: the array object and VBO IDs are kept in
+       * ctx->Meta->Clear and re-bound on later calls (else branch)
+       */
+
+      /* create vertex array object */
+      _mesa_GenVertexArrays(1, &clear->ArrayObj);
+      _mesa_BindVertexArray(clear->ArrayObj);
+
+      /* create vertex array buffer (storage only, data uploaded below) */
+      _mesa_GenBuffersARB(1, &clear->VBO);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, clear->VBO);
+      _mesa_BufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(verts),
+                          NULL, GL_DYNAMIC_DRAW_ARB);
+
+      /* setup vertex arrays: interleaved xyz position + rgba color */
+      _mesa_VertexPointer(3, GL_FLOAT, sizeof(struct vertex), OFFSET(x));
+      _mesa_ColorPointer(4, GL_FLOAT, sizeof(struct vertex), OFFSET(r));
+      _mesa_EnableClientState(GL_VERTEX_ARRAY);
+      _mesa_EnableClientState(GL_COLOR_ARRAY);
+   }
+   else {
+      _mesa_BindVertexArray(clear->ArrayObj);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, clear->VBO);
+   }
+
+   /* GL_COLOR_BUFFER_BIT */
+   if (buffers & BUFFER_BITS_COLOR) {
+      /* leave colormask, glDrawBuffer state as-is */
+   }
+   else {
+      /* not clearing color: colormask was saved above, so it is safe
+       * to turn all color writes off for the quad
+       */
+      ASSERT(metaSave & META_COLOR_MASK);
+      _mesa_ColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
+   }
+
+   /* GL_DEPTH_BUFFER_BIT: always pass the depth test and write depth */
+   if (buffers & BUFFER_BIT_DEPTH) {
+      _mesa_set_enable(ctx, GL_DEPTH_TEST, GL_TRUE);
+      _mesa_DepthFunc(GL_ALWAYS);
+      _mesa_DepthMask(GL_TRUE);
+   }
+   else {
+      /* depth test is expected to be off at this point
+       * (presumably disabled by _mesa_meta_begin -- confirm)
+       */
+      assert(!ctx->Depth.Test);
+   }
+
+   /* GL_STENCIL_BUFFER_BIT: replace stencil with the stencil clear value,
+    * using the front-face (index 0) write mask as the function mask
+    */
+   if (buffers & BUFFER_BIT_STENCIL) {
+      _mesa_set_enable(ctx, GL_STENCIL_TEST, GL_TRUE);
+      _mesa_StencilOpSeparate(GL_FRONT_AND_BACK,
+                              GL_REPLACE, GL_REPLACE, GL_REPLACE);
+      _mesa_StencilFuncSeparate(GL_FRONT_AND_BACK, GL_ALWAYS,
+                                ctx->Stencil.Clear & 0x7fffffff,
+                                ctx->Stencil.WriteMask[0]);
+   }
+   else {
+      /* stencil test is expected to be off at this point
+       * (presumably disabled by _mesa_meta_begin -- confirm)
+       */
+      assert(!ctx->Stencil.Enabled);
+   }
+
+   /* vertex positions/colors */
+   {
+      const GLfloat x0 = (GLfloat) ctx->DrawBuffer->_Xmin;
+      const GLfloat y0 = (GLfloat) ctx->DrawBuffer->_Ymin;
+      const GLfloat x1 = (GLfloat) ctx->DrawBuffer->_Xmax;
+      const GLfloat y1 = (GLfloat) ctx->DrawBuffer->_Ymax;
+      const GLfloat z = 1.0 - 2.0 * ctx->Depth.Clear;  /* clear 0 -> z 1, clear 1 -> z -1 */
+      GLuint i;
+
+      verts[0].x = x0;
+      verts[0].y = y0;
+      verts[0].z = z;
+      verts[1].x = x1;
+      verts[1].y = y0;
+      verts[1].z = z;
+      verts[2].x = x1;
+      verts[2].y = y1;
+      verts[2].z = z;
+      verts[3].x = x0;
+      verts[3].y = y1;
+      verts[3].z = z;
+
+      /* vertex colors: every corner gets the clear color */
+      for (i = 0; i < 4; i++) {
+         verts[i].r = ctx->Color.ClearColor[0];
+         verts[i].g = ctx->Color.ClearColor[1];
+         verts[i].b = ctx->Color.ClearColor[2];
+         verts[i].a = ctx->Color.ClearColor[3];
+      }
+
+      /* upload new vertex data */
+      _mesa_BufferSubDataARB(GL_ARRAY_BUFFER_ARB, 0, sizeof(verts), verts);
+   }
+
+   /* draw quad */
+   _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+   _mesa_meta_end(ctx);
+}
+
+
+/**
+ * Meta implementation of ctx->Driver.CopyPixels() in terms
+ * of texture mapping and polygon rendering.
+ *
+ * The source region is copied into the shared temp texture and drawn as
+ * a textured quad at (dstX, dstY), scaled by the current pixel zoom.
+ * Only GL_COLOR copies without image transfer ops or fog, and no larger
+ * than the max texture size, are handled here; everything else is passed
+ * to _swrast_CopyPixels().
+ *
+ * \param srcX, srcY  lower-left corner of the source region
+ * \param width, height  size of the region in pixels
+ * \param dstX, dstY  position of the first quad vertex (texcoord 0,0)
+ * \param type  kind of copy; anything but GL_COLOR takes the fallback
+ */
+void
+_mesa_meta_copy_pixels(GLcontext *ctx, GLint srcX, GLint srcY,
+                       GLsizei width, GLsizei height,
+                       GLint dstX, GLint dstY, GLenum type)
+{
+   struct copypix_state *copypix = &ctx->Meta->CopyPix;
+   struct temp_texture *tex = get_temp_texture(ctx);
+   struct vertex {
+      GLfloat x, y, z, s, t;  /* position followed by texcoord */
+   };
+   struct vertex verts[4];
+   GLboolean newTex;
+   GLenum intFormat = GL_RGBA;
+
+   if (type != GL_COLOR ||
+       ctx->_ImageTransferState ||
+       ctx->Fog.Enabled ||
+       width > tex->MaxSize ||
+       height > tex->MaxSize) {
+      /* XXX avoid this fallback */
+      _swrast_CopyPixels(ctx, srcX, srcY, width, height, dstX, dstY, type);
+      return;
+   }
+
+   /* Most GL state applies to glCopyPixels, but there are a few things
+    * we need to override:
+    */
+   _mesa_meta_begin(ctx, (META_RASTERIZATION |
+                          META_SHADER |
+                          META_TEXTURE |
+                          META_TRANSFORM |
+                          META_VERTEX |
+                          META_VIEWPORT));
+
+   if (copypix->ArrayObj == 0) {
+      /* one-time setup: the array object and VBO IDs are kept in
+       * ctx->Meta->CopyPix and re-bound on later calls (else branch)
+       */
+
+      /* create vertex array object */
+      _mesa_GenVertexArrays(1, &copypix->ArrayObj);
+      _mesa_BindVertexArray(copypix->ArrayObj);
+
+      /* create vertex array buffer (storage only, data uploaded below) */
+      _mesa_GenBuffersARB(1, &copypix->VBO);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, copypix->VBO);
+      _mesa_BufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(verts),
+                          NULL, GL_DYNAMIC_DRAW_ARB);
+
+      /* setup vertex arrays: interleaved xyz position + st texcoord */
+      _mesa_VertexPointer(3, GL_FLOAT, sizeof(struct vertex), OFFSET(x));
+      _mesa_TexCoordPointer(2, GL_FLOAT, sizeof(struct vertex), OFFSET(s));
+      _mesa_EnableClientState(GL_VERTEX_ARRAY);
+      _mesa_EnableClientState(GL_TEXTURE_COORD_ARRAY);
+   }
+   else {
+      _mesa_BindVertexArray(copypix->ArrayObj);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, copypix->VBO);
+   }
+
+   newTex = alloc_texture(tex, width, height, intFormat);
+
+   /* vertex positions, texcoords (after texture allocation!)
+    * alloc_texture() is what sets tex->Sright / tex->Ttop used below.
+    */
+   {
+      const GLfloat dstX0 = (GLfloat) dstX;
+      const GLfloat dstY0 = (GLfloat) dstY;
+      const GLfloat dstX1 = dstX + width * ctx->Pixel.ZoomX;
+      const GLfloat dstY1 = dstY + height * ctx->Pixel.ZoomY;
+      const GLfloat z = ctx->Current.RasterPos[2];
+
+      verts[0].x = dstX0;
+      verts[0].y = dstY0;
+      verts[0].z = z;
+      verts[0].s = 0.0F;
+      verts[0].t = 0.0F;
+      verts[1].x = dstX1;
+      verts[1].y = dstY0;
+      verts[1].z = z;
+      verts[1].s = tex->Sright;
+      verts[1].t = 0.0F;
+      verts[2].x = dstX1;
+      verts[2].y = dstY1;
+      verts[2].z = z;
+      verts[2].s = tex->Sright;
+      verts[2].t = tex->Ttop;
+      verts[3].x = dstX0;
+      verts[3].y = dstY1;
+      verts[3].z = z;
+      verts[3].s = 0.0F;
+      verts[3].t = tex->Ttop;
+
+      /* upload new vertex data */
+      _mesa_BufferSubDataARB(GL_ARRAY_BUFFER_ARB, 0, sizeof(verts), verts);
+   }
+
+   /* Alloc/setup texture: copy the source region from the framebuffer */
+   setup_copypix_texture(tex, newTex, srcX, srcY, width, height,
+                         GL_RGBA, GL_NEAREST);
+
+   _mesa_set_enable(ctx, tex->Target, GL_TRUE);
+
+   /* draw textured quad */
+   _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+   _mesa_set_enable(ctx, tex->Target, GL_FALSE);
+
+   _mesa_meta_end(ctx);
+}
+
+
+
+/**
+ * Draw a glDrawPixels() image that is too large for a single texture by
+ * splitting it into tiles of at most tileSize x tileSize pixels and
+ * issuing one _mesa_meta_draw_pixels() call per tile.
+ *
+ * Each tile is selected from the caller's image purely through the
+ * unpack SkipPixels/SkipRows/RowLength parameters; 'pixels' itself is
+ * passed through unchanged.  Tile positions honor the current pixel zoom.
+ */
+static void
+tiled_draw_pixels(GLcontext *ctx,
+                  GLint tileSize,
+                  GLint x, GLint y, GLsizei width, GLsizei height,
+                  GLenum format, GLenum type,
+                  const struct gl_pixelstore_attrib *unpack,
+                  const GLvoid *pixels)
+{
+   struct gl_pixelstore_attrib tileUnpack = *unpack;
+   GLint col, row;
+
+   /* rows of a tile are still strided by the full image width */
+   if (tileUnpack.RowLength == 0)
+      tileUnpack.RowLength = width;
+
+   col = 0;
+   while (col < width) {
+      const GLint tileWidth = MIN2(tileSize, width - col);
+      const GLint tileX = (GLint) (x + col * ctx->Pixel.ZoomX);
+
+      tileUnpack.SkipPixels = unpack->SkipPixels + col;
+
+      row = 0;
+      while (row < height) {
+         const GLint tileHeight = MIN2(tileSize, height - row);
+         const GLint tileY = (GLint) (y + row * ctx->Pixel.ZoomY);
+
+         tileUnpack.SkipRows = unpack->SkipRows + row;
+
+         _mesa_meta_draw_pixels(ctx, tileX, tileY,
+                                tileWidth, tileHeight,
+                                format, type, &tileUnpack, pixels);
+
+         row += tileSize;
+      }
+
+      col += tileSize;
+   }
+}
+
+
+/**
+ * One-time creation of the fragment program used to draw stencil pixels.
+ * The new program ID is stored in ctx->Meta->DrawPix.StencilFP and the
+ * program is left bound.
+ */
+static void
+init_draw_stencil_pixels(GLcontext *ctx)
+{
+   /* The program runs once per stencil bit.  The stencil values to draw
+    * live in an 8-bit alpha texture.  For bit 'b' the program fetches
+    * the texel, tests whether that bit is set and uses KIL to discard
+    * the fragment when it is not.  The stencil test then updates the
+    * stencil buffer for the surviving fragments.
+    *
+    * Testing a bit boils down to:
+    *   if (is_odd(value / (1 << bit)))
+    *      result is one (or non-zero).
+    *   else
+    *      result is zero.
+    * The program parameter supplies three values:
+    *   parm.x = 255 / (1 << bit)
+    *   parm.y = 0.5
+    *   parm.z = 0.0
+    */
+   static const char *program =
+      "!!ARBfp1.0\n"
+      "PARAM parm = program.local[0]; \n"
+      "TEMP t; \n"
+      "TEX t, fragment.texcoord[0], texture[0], %s; \n"   /* NOTE %s here! */
+      "# t = t * 255 / bit \n"
+      "MUL t.x, t.a, parm.x; \n"
+      "# t = (int) t \n"
+      "FRC t.y, t.x; \n"
+      "SUB t.x, t.x, t.y; \n"
+      "# t = t * 0.5 \n"
+      "MUL t.x, t.x, parm.y; \n"
+      "# t = fract(t.x) \n"
+      "FRC t.x, t.x; # if t.x != 0, then the bit is set \n"
+      "# t.x = (t.x == 0 ? 1 : 0) \n"
+      "SGE t.x, -t.x, parm.z; \n"
+      "KIL -t.x; \n"
+      "# for debug only \n"
+      "#MOV result.color, t.x; \n"
+      "END \n";
+   struct drawpix_state *drawpix = &ctx->Meta->DrawPix;
+   struct temp_texture *tex = get_temp_texture(ctx);
+   const char *texTarget =
+      (tex->Target == GL_TEXTURE_RECTANGLE) ? "RECT" : "2D";
+   char program2[1000];
+
+   assert(drawpix->StencilFP == 0);
+
+   /* the expanded text ("RECT" is the longest substitution) must fit */
+   assert(strlen(program) + 4 < sizeof(program2));
+   _mesa_snprintf(program2, sizeof(program2), program, texTarget);
+
+   _mesa_GenPrograms(1, &drawpix->StencilFP);
+   _mesa_BindProgram(GL_FRAGMENT_PROGRAM_ARB, drawpix->StencilFP);
+   _mesa_ProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
+                          strlen(program2), (const GLubyte *) program2);
+}
+
+
+/**
+ * One-time creation of the fragment program used to draw depth pixels.
+ * The program writes the sampled texel to the fragment's depth output
+ * and program.local[0] to its color output.  The new program ID is
+ * stored in ctx->Meta->DrawPix.DepthFP and the program is left bound.
+ */
+static void
+init_draw_depth_pixels(GLcontext *ctx)
+{
+   /* template: %s is the texture target keyword */
+   static const char *program =
+      "!!ARBfp1.0\n"
+      "PARAM color = program.local[0]; \n"
+      "TEX result.depth, fragment.texcoord[0], texture[0], %s; \n"
+      "MOV result.color, color; \n"
+      "END \n";
+   struct drawpix_state *drawpix = &ctx->Meta->DrawPix;
+   struct temp_texture *tex = get_temp_texture(ctx);
+   const char *texTarget =
+      (tex->Target == GL_TEXTURE_RECTANGLE) ? "RECT" : "2D";
+   char program2[200];
+
+   assert(drawpix->DepthFP == 0);
+
+   /* the expanded text ("RECT" is the longest substitution) must fit */
+   assert(strlen(program) + 4 < sizeof(program2));
+   _mesa_snprintf(program2, sizeof(program2), program, texTarget);
+
+   _mesa_GenPrograms(1, &drawpix->DepthFP);
+   _mesa_BindProgram(GL_FRAGMENT_PROGRAM_ARB, drawpix->DepthFP);
+   _mesa_ProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
+                          strlen(program2), (const GLubyte *) program2);
+}
+
+
+/**
+ * Meta implementation of ctx->Driver.DrawPixels() in terms
+ * of texture mapping and polygon rendering.
+ *
+ * The image is loaded into the shared temp texture and drawn as a quad
+ * at (x, y), scaled by the current pixel zoom.  Color images are drawn
+ * directly; stencil and depth images are drawn through fragment programs.
+ * Cases that cannot be handled here (image transfer ops, fog, unsupported
+ * formats or missing extensions) are passed to _swrast_DrawPixels().
+ * Images larger than the max texture size are drawn as tiles.
+ *
+ * \param x, y  position of the first quad vertex (texcoord 0,0)
+ * \param width, height  image size in pixels
+ * \param format, type  format/type of the incoming pixel data
+ * \param unpack  pixel unpacking parameters to apply to 'pixels'
+ * \param pixels  the image data
+ */
+void
+_mesa_meta_draw_pixels(GLcontext *ctx,
+		       GLint x, GLint y, GLsizei width, GLsizei height,
+		       GLenum format, GLenum type,
+		       const struct gl_pixelstore_attrib *unpack,
+		       const GLvoid *pixels)
+{
+   struct drawpix_state *drawpix = &ctx->Meta->DrawPix;
+   struct temp_texture *tex = get_temp_texture(ctx);
+   const struct gl_pixelstore_attrib unpackSave = ctx->Unpack;
+   const GLuint origStencilMask = ctx->Stencil.WriteMask[0];
+   struct vertex {
+      GLfloat x, y, z, s, t;  /* position followed by texcoord */
+   };
+   struct vertex verts[4];
+   GLenum texIntFormat;
+   GLboolean fallback, newTex;
+   GLbitfield metaExtraSave = 0x0;
+
+   /*
+    * Determine if we can do the glDrawPixels with texture mapping.
+    * Besides the fallback decision this also picks the texture's internal
+    * format and any extra state groups to save for this format.
+    */
+   fallback = GL_FALSE;
+   if (ctx->_ImageTransferState ||
+       ctx->Fog.Enabled) {
+      fallback = GL_TRUE;
+   }
+
+   if (_mesa_is_color_format(format)) {
+      /* use more compact format when possible */
+      /* XXX disable special case for GL_LUMINANCE for now to work around
+       * apparent i965 driver bug (see bug #23670).
+       */
+      if (/*format == GL_LUMINANCE ||*/ format == GL_LUMINANCE_ALPHA)
+         texIntFormat = format;
+      else
+         texIntFormat = GL_RGBA;
+   }
+   else if (_mesa_is_stencil_format(format)) {
+      if (ctx->Extensions.ARB_fragment_program &&
+          ctx->Pixel.IndexShift == 0 &&
+          ctx->Pixel.IndexOffset == 0 &&
+          type == GL_UNSIGNED_BYTE) {
+         /* We'll store stencil as alpha.  This only works for GLubyte
+          * image data because of how incoming values are mapped to alpha
+          * in [0,1].
+          */
+         texIntFormat = GL_ALPHA;
+         metaExtraSave = (META_COLOR_MASK |
+                          META_DEPTH_TEST |
+                          META_SHADER |
+                          META_STENCIL_TEST);
+      }
+      else {
+         fallback = GL_TRUE;
+      }
+   }
+   else if (_mesa_is_depth_format(format)) {
+      if (ctx->Extensions.ARB_depth_texture &&
+          ctx->Extensions.ARB_fragment_program) {
+         texIntFormat = GL_DEPTH_COMPONENT;
+         metaExtraSave = (META_SHADER);
+      }
+      else {
+         fallback = GL_TRUE;
+      }
+   }
+   else {
+      fallback = GL_TRUE;
+   }
+
+   if (fallback) {
+      _swrast_DrawPixels(ctx, x, y, width, height,
+                         format, type, unpack, pixels);
+      return;
+   }
+
+   /*
+    * Check image size against max texture size, draw as tiles if needed.
+    * tiled_draw_pixels() calls back into this function once per tile.
+    */
+   if (width > tex->MaxSize || height > tex->MaxSize) {
+      tiled_draw_pixels(ctx, tex->MaxSize, x, y, width, height,
+                        format, type, unpack, pixels);
+      return;
+   }
+
+   /* Most GL state applies to glDrawPixels (like blending, stencil, etc),
+    * but there are a few things we need to override:
+    */
+   _mesa_meta_begin(ctx, (META_RASTERIZATION |
+                          META_SHADER |
+                          META_TEXTURE |
+                          META_TRANSFORM |
+                          META_VERTEX |
+                          META_VIEWPORT |
+                          metaExtraSave));
+
+   if (drawpix->ArrayObj == 0) {
+      /* one-time setup: the array object and VBO IDs are kept in
+       * ctx->Meta->DrawPix and re-bound on later calls (else branch)
+       */
+
+      /* create vertex array object */
+      _mesa_GenVertexArrays(1, &drawpix->ArrayObj);
+      _mesa_BindVertexArray(drawpix->ArrayObj);
+
+      /* create vertex array buffer (storage only, data uploaded below) */
+      _mesa_GenBuffersARB(1, &drawpix->VBO);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, drawpix->VBO);
+      _mesa_BufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(verts),
+                          NULL, GL_DYNAMIC_DRAW_ARB);
+
+      /* setup vertex arrays: interleaved xyz position + st texcoord */
+      _mesa_VertexPointer(3, GL_FLOAT, sizeof(struct vertex), OFFSET(x));
+      _mesa_TexCoordPointer(2, GL_FLOAT, sizeof(struct vertex), OFFSET(s));
+      _mesa_EnableClientState(GL_VERTEX_ARRAY);
+      _mesa_EnableClientState(GL_TEXTURE_COORD_ARRAY);
+   }
+   else {
+      _mesa_BindVertexArray(drawpix->ArrayObj);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, drawpix->VBO);
+   }
+
+   newTex = alloc_texture(tex, width, height, texIntFormat);
+
+   /* vertex positions, texcoords (after texture allocation!)
+    * alloc_texture() is what sets tex->Sright / tex->Ttop used below.
+    */
+   {
+      const GLfloat x0 = (GLfloat) x;
+      const GLfloat y0 = (GLfloat) y;
+      const GLfloat x1 = x + width * ctx->Pixel.ZoomX;
+      const GLfloat y1 = y + height * ctx->Pixel.ZoomY;
+      const GLfloat z = ctx->Current.RasterPos[2];
+
+      verts[0].x = x0;
+      verts[0].y = y0;
+      verts[0].z = z;
+      verts[0].s = 0.0F;
+      verts[0].t = 0.0F;
+      verts[1].x = x1;
+      verts[1].y = y0;
+      verts[1].z = z;
+      verts[1].s = tex->Sright;
+      verts[1].t = 0.0F;
+      verts[2].x = x1;
+      verts[2].y = y1;
+      verts[2].z = z;
+      verts[2].s = tex->Sright;
+      verts[2].t = tex->Ttop;
+      verts[3].x = x0;
+      verts[3].y = y1;
+      verts[3].z = z;
+      verts[3].s = 0.0F;
+      verts[3].t = tex->Ttop;
+
+      /* upload new vertex data */
+      _mesa_BufferSubDataARB(GL_ARRAY_BUFFER_ARB, 0, sizeof(verts), verts);
+   }
+
+   /* set given unpack params (the previous ctx->Unpack is restored from
+    * unpackSave before returning)
+    */
+   ctx->Unpack = *unpack;
+
+   _mesa_set_enable(ctx, tex->Target, GL_TRUE);
+
+   if (_mesa_is_stencil_format(format)) {
+      /* Drawing stencil: the image is loaded as an alpha texture and the
+       * stencil buffer is built up one bit at a time.
+       */
+      GLint bit;
+
+      if (!drawpix->StencilFP)
+         init_draw_stencil_pixels(ctx);
+
+      setup_drawpix_texture(tex, newTex, texIntFormat, width, height,
+                            GL_ALPHA, type, pixels);
+
+      _mesa_ColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
+
+      _mesa_set_enable(ctx, GL_STENCIL_TEST, GL_TRUE);
+
+      /* set all stencil bits to 0 */
+      _mesa_StencilOp(GL_REPLACE, GL_REPLACE, GL_REPLACE);
+      _mesa_StencilFunc(GL_ALWAYS, 0, 255);
+      _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+  
+      /* set stencil bits to 1 where needed */
+      _mesa_StencilOp(GL_KEEP, GL_KEEP, GL_REPLACE);
+
+      _mesa_BindProgram(GL_FRAGMENT_PROGRAM_ARB, drawpix->StencilFP);
+      _mesa_set_enable(ctx, GL_FRAGMENT_PROGRAM_ARB, GL_TRUE);
+
+      for (bit = 0; bit < ctx->Visual.stencilBits; bit++) {
+         const GLuint mask = 1 << bit;
+         if (mask & origStencilMask) {  /* skip bits the app's write mask excludes */
+            _mesa_StencilFunc(GL_ALWAYS, mask, mask);
+            _mesa_StencilMask(mask);
+
+            _mesa_ProgramLocalParameter4fARB(GL_FRAGMENT_PROGRAM_ARB, 0,
+                                             255.0 / mask, 0.5, 0.0, 0.0);
+
+            _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+         }
+      }
+   }
+   else if (_mesa_is_depth_format(format)) {
+      /* Drawing depth: the fragment program writes the texel to the
+       * fragment depth and local parameter 0 to the fragment color.
+       */
+      if (!drawpix->DepthFP)
+         init_draw_depth_pixels(ctx);
+
+      _mesa_BindProgram(GL_FRAGMENT_PROGRAM_ARB, drawpix->DepthFP);
+      _mesa_set_enable(ctx, GL_FRAGMENT_PROGRAM_ARB, GL_TRUE);
+
+      /* polygon color = current raster color */
+      _mesa_ProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 0,
+                                        ctx->Current.RasterColor);
+
+      setup_drawpix_texture(tex, newTex, texIntFormat, width, height,
+                            format, type, pixels);
+
+      _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+   }
+   else {
+      /* Drawing RGBA */
+      setup_drawpix_texture(tex, newTex, texIntFormat, width, height,
+                            format, type, pixels);
+      _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+   }
+
+   _mesa_set_enable(ctx, tex->Target, GL_FALSE);
+
+   /* restore unpack params */
+   ctx->Unpack = unpackSave;
+
+   _mesa_meta_end(ctx);
+}
+
+
+/**
+ * Do glBitmap with a alpha texture quad.  Use the alpha test to
+ * cull the 'off' bits.  If alpha test is already enabled, fall back
+ * to swrast (should be a rare case).
+ * A bitmap cache as in the gallium/mesa state tracker would
+ * improve performance a lot.
+ */
+void
+_mesa_meta_bitmap(GLcontext *ctx,
+                  GLint x, GLint y, GLsizei width, GLsizei height,
+                  const struct gl_pixelstore_attrib *unpack,
+                  const GLubyte *bitmap1)
+{
+   struct bitmap_state *bitmap = &ctx->Meta->Bitmap;
+   struct temp_texture *tex = get_bitmap_temp_texture(ctx);
+   const GLenum texIntFormat = GL_ALPHA;
+   const struct gl_pixelstore_attrib unpackSave = *unpack;
+   struct vertex {
+      GLfloat x, y, z, s, t, r, g, b, a;
+   };
+   struct vertex verts[4];
+   GLboolean newTex;
+   GLubyte *bitmap8;
+
+   /*
+    * Check if swrast fallback is needed.
+    */
+   if (ctx->_ImageTransferState ||
+       ctx->Color.AlphaEnabled ||
+       ctx->Fog.Enabled ||
+       ctx->Texture._EnabledUnits ||
+       width > tex->MaxSize ||
+       height > tex->MaxSize) {
+      _swrast_Bitmap(ctx, x, y, width, height, unpack, bitmap1);
+      return;
+   }
+
+   /* Most GL state applies to glBitmap (like blending, stencil, etc),
+    * but a there's a few things we need to override:
+    */
+   _mesa_meta_begin(ctx, (META_ALPHA_TEST |
+                          META_PIXEL_STORE |
+                          META_RASTERIZATION |
+                          META_SHADER |
+                          META_TEXTURE |
+                          META_TRANSFORM |
+                          META_VERTEX |
+                          META_VIEWPORT));
+
+   if (bitmap->ArrayObj == 0) {
+      /* one-time setup */
+
+      /* create vertex array object */
+      _mesa_GenVertexArraysAPPLE(1, &bitmap->ArrayObj);
+      _mesa_BindVertexArrayAPPLE(bitmap->ArrayObj);
+
+      /* create vertex array buffer */
+      _mesa_GenBuffersARB(1, &bitmap->VBO);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, bitmap->VBO);
+      _mesa_BufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(verts),
+                          NULL, GL_DYNAMIC_DRAW_ARB);
+
+      /* setup vertex arrays */
+      _mesa_VertexPointer(3, GL_FLOAT, sizeof(struct vertex), OFFSET(x));
+      _mesa_TexCoordPointer(2, GL_FLOAT, sizeof(struct vertex), OFFSET(s));
+      _mesa_ColorPointer(4, GL_FLOAT, sizeof(struct vertex), OFFSET(r));
+      _mesa_EnableClientState(GL_VERTEX_ARRAY);
+      _mesa_EnableClientState(GL_TEXTURE_COORD_ARRAY);
+      _mesa_EnableClientState(GL_COLOR_ARRAY);
+   }
+   else {
+      _mesa_BindVertexArray(bitmap->ArrayObj);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, bitmap->VBO);
+   }
+
+   newTex = alloc_texture(tex, width, height, texIntFormat);
+
+   /* vertex positions, texcoords, colors (after texture allocation!) */
+   {
+      const GLfloat x0 = (GLfloat) x;
+      const GLfloat y0 = (GLfloat) y;
+      const GLfloat x1 = (GLfloat) (x + width);
+      const GLfloat y1 = (GLfloat) (y + height);
+      const GLfloat z = ctx->Current.RasterPos[2];
+      GLuint i;
+
+      verts[0].x = x0;
+      verts[0].y = y0;
+      verts[0].z = z;
+      verts[0].s = 0.0F;
+      verts[0].t = 0.0F;
+      verts[1].x = x1;
+      verts[1].y = y0;
+      verts[1].z = z;
+      verts[1].s = tex->Sright;
+      verts[1].t = 0.0F;
+      verts[2].x = x1;
+      verts[2].y = y1;
+      verts[2].z = z;
+      verts[2].s = tex->Sright;
+      verts[2].t = tex->Ttop;
+      verts[3].x = x0;
+      verts[3].y = y1;
+      verts[3].z = z;
+      verts[3].s = 0.0F;
+      verts[3].t = tex->Ttop;
+
+      for (i = 0; i < 4; i++) {
+         verts[i].r = ctx->Current.RasterColor[0];
+         verts[i].g = ctx->Current.RasterColor[1];
+         verts[i].b = ctx->Current.RasterColor[2];
+         verts[i].a = ctx->Current.RasterColor[3];
+      }
+
+      /* upload new vertex data */
+      _mesa_BufferSubDataARB(GL_ARRAY_BUFFER_ARB, 0, sizeof(verts), verts);
+   }
+
+   bitmap1 = _mesa_map_pbo_source(ctx, &unpackSave, bitmap1);
+   if (!bitmap1)
+      return;
+
+   bitmap8 = (GLubyte *) _mesa_calloc(width * height);
+   if (bitmap8) {
+      _mesa_expand_bitmap(width, height, &unpackSave, bitmap1,
+                          bitmap8, width, 0xff);
+
+      _mesa_set_enable(ctx, tex->Target, GL_TRUE);
+
+      _mesa_set_enable(ctx, GL_ALPHA_TEST, GL_TRUE);
+      _mesa_AlphaFunc(GL_GREATER, 0.0);
+
+      setup_drawpix_texture(tex, newTex, texIntFormat, width, height,
+                            GL_ALPHA, GL_UNSIGNED_BYTE, bitmap8);
+
+      _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+      _mesa_set_enable(ctx, tex->Target, GL_FALSE);
+
+      _mesa_free(bitmap8);
+   }
+
+   _mesa_unmap_pbo_source(ctx, &unpackSave);
+
+   _mesa_meta_end(ctx);
+}
+
+
+/**
+ * Generate the mipmap levels of a texture by drawing each level as a
+ * linearly-filtered textured quad that samples the previous level, with
+ * the destination level attached to an FBO.
+ *
+ * Only GL_TEXTURE_2D is rendered here: the image allocation, the sampling
+ * limits and the quad's texcoords below are all specific to 2D textures.
+ * Any other target, or a context without EXT_framebuffer_object, is
+ * handed to the software path _mesa_generate_mipmap().
+ *
+ * \param ctx     the GL context
+ * \param target  texture target of \p texObj
+ * \param texObj  texture whose levels BaseLevel+1 .. MaxLevel are generated
+ */
+void
+_mesa_meta_generate_mipmap(GLcontext *ctx, GLenum target,
+                           struct gl_texture_object *texObj)
+{
+   struct gen_mipmap_state *mipmap = &ctx->Meta->Mipmap;
+   struct vertex {
+      GLfloat x, y, s, t, r;
+   };
+   struct vertex verts[4];
+   const GLuint baseLevel = texObj->BaseLevel;
+   const GLuint maxLevel = texObj->MaxLevel;
+   const GLenum minFilterSave = texObj->MinFilter;
+   const GLenum magFilterSave = texObj->MagFilter;
+   const GLuint fboSave = ctx->DrawBuffer->Name;
+   GLuint level;
+   GLuint border = 0;
+
+   /* check for fallbacks */
+   if (!ctx->Extensions.EXT_framebuffer_object ||
+       target != GL_TEXTURE_2D) {
+      _mesa_generate_mipmap(ctx, target, texObj);
+      return;
+   }
+
+   _mesa_meta_begin(ctx, META_ALL);
+
+   if (mipmap->ArrayObj == 0) {
+      /* one-time setup */
+
+      /* create vertex array object */
+      _mesa_GenVertexArraysAPPLE(1, &mipmap->ArrayObj);
+      _mesa_BindVertexArrayAPPLE(mipmap->ArrayObj);
+
+      /* create vertex array buffer */
+      _mesa_GenBuffersARB(1, &mipmap->VBO);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, mipmap->VBO);
+      _mesa_BufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(verts),
+                          NULL, GL_DYNAMIC_DRAW_ARB);
+
+      /* setup vertex arrays */
+      _mesa_VertexPointer(2, GL_FLOAT, sizeof(struct vertex), OFFSET(x));
+      _mesa_TexCoordPointer(3, GL_FLOAT, sizeof(struct vertex), OFFSET(s));
+      _mesa_EnableClientState(GL_VERTEX_ARRAY);
+      _mesa_EnableClientState(GL_TEXTURE_COORD_ARRAY);
+   }
+   else {
+      _mesa_BindVertexArray(mipmap->ArrayObj);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, mipmap->VBO);
+   }
+
+   if (!mipmap->FBO) {
+      /* one-time creation of the FBO the levels are rendered into */
+      _mesa_GenFramebuffersEXT(1, &mipmap->FBO);
+   }
+
+   _mesa_BindFramebufferEXT(GL_FRAMEBUFFER_EXT, mipmap->FBO);
+
+   _mesa_TexParameteri(target, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+   _mesa_TexParameteri(target, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+   _mesa_set_enable(ctx, target, GL_TRUE);
+
+   /* setup texcoords once (XXX what about border?): the quad samples the
+    * whole source image.  All three components are written because the
+    * texcoord array is declared with three floats per vertex.
+    */
+   verts[0].s = 0.0F;
+   verts[0].t = 0.0F;
+   verts[0].r = 0.0F;
+   verts[1].s = 1.0F;
+   verts[1].t = 0.0F;
+   verts[1].r = 0.0F;
+   verts[2].s = 1.0F;
+   verts[2].t = 1.0F;
+   verts[2].r = 0.0F;
+   verts[3].s = 0.0F;
+   verts[3].t = 1.0F;
+   verts[3].r = 0.0F;
+
+   for (level = baseLevel + 1; level <= maxLevel; level++) {
+      const struct gl_texture_image *srcImage;
+      const GLuint srcLevel = level - 1;
+      GLsizei srcWidth, srcHeight;
+      GLsizei newWidth, newHeight;
+      GLenum status;
+
+      srcImage = _mesa_select_tex_image(ctx, texObj, target, srcLevel);
+      if (!srcImage) {
+         /* no source level to downsample from */
+         break;
+      }
+      assert(srcImage->Border == 0); /* XXX we can fix this */
+
+      srcWidth = srcImage->Width - 2 * border;
+      srcHeight = srcImage->Height - 2 * border;
+
+      newWidth = MAX2(1, srcWidth / 2) + 2 * border;
+      newHeight = MAX2(1, srcHeight / 2) + 2 * border;
+
+      /* stop once halving no longer changes the size (1x1 reached) */
+      if (newWidth == srcImage->Width && newHeight == srcImage->Height) {
+         break;
+      }
+
+      /* Create empty image */
+      _mesa_TexImage2D(target, level, srcImage->InternalFormat,
+                       newWidth, newHeight, border,
+                       GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+
+      /* vertex positions */
+      {
+         verts[0].x = 0.0F;
+         verts[0].y = 0.0F;
+         verts[1].x = (GLfloat) newWidth;
+         verts[1].y = 0.0F;
+         verts[2].x = (GLfloat) newWidth;
+         verts[2].y = (GLfloat) newHeight;
+         verts[3].x = 0.0F;
+         verts[3].y = (GLfloat) newHeight;
+
+         /* upload new vertex data */
+         _mesa_BufferSubDataARB(GL_ARRAY_BUFFER_ARB, 0, sizeof(verts), verts);
+      }
+
+      /* limit sampling to src level */
+      _mesa_TexParameteri(target, GL_TEXTURE_BASE_LEVEL, srcLevel);
+      _mesa_TexParameteri(target, GL_TEXTURE_MAX_LEVEL, srcLevel);
+
+      /* Set to draw into the current level */
+      _mesa_FramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT,
+                                    GL_COLOR_ATTACHMENT0_EXT,
+                                    target,
+                                    texObj->Name,
+                                    level);
+
+      /* Choose to render to the color attachment. */
+      _mesa_DrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
+
+      status = _mesa_CheckFramebufferStatusEXT(GL_FRAMEBUFFER_EXT);
+      if (status != GL_FRAMEBUFFER_COMPLETE_EXT) {
+         /* Report and stop rather than abort()ing the whole application;
+          * the remaining levels are left ungenerated.
+          */
+         _mesa_problem(ctx, "Unexpected incomplete framebuffer in "
+                       "_mesa_meta_generate_mipmap()");
+         break;
+      }
+
+      _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+   }
+
+   _mesa_meta_end(ctx);
+
+   _mesa_TexParameteri(target, GL_TEXTURE_MIN_FILTER, minFilterSave);
+   _mesa_TexParameteri(target, GL_TEXTURE_MAG_FILTER, magFilterSave);
+
+   /* The loop narrowed BASE_LEVEL/MAX_LEVEL to each source level in turn;
+    * put the values the texture had on entry back.
+    */
+   _mesa_TexParameteri(target, GL_TEXTURE_BASE_LEVEL, baseLevel);
+   _mesa_TexParameteri(target, GL_TEXTURE_MAX_LEVEL, maxLevel);
+
+   /* restore (XXX add to meta_begin/end()?) */
+   _mesa_BindFramebufferEXT(GL_FRAMEBUFFER_EXT, fboSave);
+}
diff --git a/src/mesa/drivers/common/meta.h b/src/mesa/drivers/common/meta.h
new file mode 100644
index 0000000000..171ad27f26
--- /dev/null
+++ b/src/mesa/drivers/common/meta.h
@@ -0,0 +1,91 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.6
+ *
+ * Copyright (C) 2009  VMware, Inc.  All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef META_H
+#define META_H
+
+
+/**
+ * Flags passed to _mesa_meta_begin().
+ * Each flag is a distinct bit, so flags can be OR'ed together.
+ * XXX these flags may evolve...
+ */
+/*@{*/
+#define META_ALPHA_TEST      0x1
+#define META_BLEND           0x2  /**< includes logicop */
+#define META_COLOR_MASK      0x4
+#define META_DEPTH_TEST      0x8
+#define META_FOG            0x10
+#define META_RASTERIZATION  0x20
+#define META_SCISSOR        0x40
+#define META_SHADER         0x80
+#define META_STENCIL_TEST  0x100
+#define META_TRANSFORM     0x200 /**< modelview, projection */
+#define META_TEXTURE       0x400
+#define META_VERTEX        0x800
+#define META_VIEWPORT     0x1000
+#define META_PIXEL_STORE  0x2000
+/* Parenthesized so the macro stays a single operand wherever it expands. */
+#define META_ALL          (~0x0)  /**< every bit set */
+/*@}*/
+
+
+extern void
+_mesa_meta_init(GLcontext *ctx);
+
+extern void
+_mesa_meta_free(GLcontext *ctx);
+
+extern void
+_mesa_meta_blit_framebuffer(GLcontext *ctx,
+                            GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                            GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                            GLbitfield mask, GLenum filter);
+
+extern void
+_mesa_meta_clear(GLcontext *ctx, GLbitfield buffers);
+
+extern void
+_mesa_meta_copy_pixels(GLcontext *ctx, GLint srcx, GLint srcy,
+                       GLsizei width, GLsizei height,
+                       GLint dstx, GLint dsty, GLenum type);
+
+extern void
+_mesa_meta_draw_pixels(GLcontext *ctx,
+                       GLint x, GLint y, GLsizei width, GLsizei height,
+                       GLenum format, GLenum type,
+                       const struct gl_pixelstore_attrib *unpack,
+                       const GLvoid *pixels);
+
+/**
+ * Draw a bitmap as an alpha-tested, textured quad.
+ * Hands the request to _swrast_Bitmap() when image-transfer state, alpha
+ * test, fog or texturing is enabled, or when the bitmap is larger than
+ * the temporary texture allows.
+ */
+extern void
+_mesa_meta_bitmap(GLcontext *ctx,
+                  GLint x, GLint y, GLsizei width, GLsizei height,
+                  const struct gl_pixelstore_attrib *unpack,
+                  const GLubyte *bitmap);
+
+/**
+ * Generate mipmap levels for \p texObj by rendering.
+ * Hands the request to _mesa_generate_mipmap() when
+ * EXT_framebuffer_object is not available.
+ */
+extern void
+_mesa_meta_generate_mipmap(GLcontext *ctx, GLenum target,
+                           struct gl_texture_object *texObj);
+
+
+#endif /* META_H */
diff --git a/src/mesa/drivers/dri/Makefile b/src/mesa/drivers/dri/Makefile
index 9e49fb16f5..32db097861 100644
--- a/src/mesa/drivers/dri/Makefile
+++ b/src/mesa/drivers/dri/Makefile
@@ -6,7 +6,7 @@ include $(TOP)/configs/current
 
 
 
-default: $(TOP)/$(LIB_DIR) subdirs
+default: $(TOP)/$(LIB_DIR) subdirs dri.pc
 
 
 $(TOP)/$(LIB_DIR):
diff --git a/src/mesa/drivers/dri/Makefile.es b/src/mesa/drivers/dri/Makefile.es
new file mode 100644
index 0000000000..32cf9bd222
--- /dev/null
+++ b/src/mesa/drivers/dri/Makefile.es
@@ -0,0 +1,104 @@
+# -*-makefile-*-
+#
+# Shared rules for building a DRI driver against the OpenGL ES Mesa core
+# (libes$(ES_VERSION).a).  The including Makefile is expected to provide
+# TOP, LIBNAME, C_SOURCES, ASM_SOURCES and SYMLINKS.
+
+# ES API version to link against; may be overridden by the includer.
+ES_VERSION ?= 1
+# Make sure IN_DRI_DRIVER is not defined for the ES build.
+DRIVER_DEFINES += -UIN_DRI_DRIVER
+
+MESA_MODULES = $(TOP)/src/mesa/es/libes$(ES_VERSION).a
+
+COMMON_GALLIUM_SOURCES = \
+        ../common/utils.c \
+        ../common/vblank.c \
+        ../common/dri_util.c \
+        ../common/xmlconfig.c
+
+COMMON_SOURCES = $(COMMON_GALLIUM_SOURCES) \
+        ../../common/driverfuncs.c \
+        ../common/texmem.c \
+        ../common/drirenderbuffer.c \
+	../common/dri_metaops.c
+
+# SHARED_INCLUDES is assigned further down; that is fine because "="
+# variables are expanded when used, not when defined.
+ifeq ($(WINDOW_SYSTEM),dri)
+WINOBJ=
+WINLIB=
+INCLUDES = $(SHARED_INCLUDES) $(EXPAT_INCLUDES)
+
+OBJECTS = $(C_SOURCES:.c=.o) \
+	  $(ASM_SOURCES:.S=.o)
+
+else
+# miniglx
+WINOBJ=
+WINLIB=-L$(MESA)/src/glx/mini
+MINIGLX_INCLUDES = -I$(TOP)/src/glx/mini
+INCLUDES = $(MINIGLX_INCLUDES) \
+	   $(SHARED_INCLUDES) \
+	   $(PCIACCESS_CFLAGS)
+
+OBJECTS = $(C_SOURCES:.c=.o) \
+	  $(MINIGLX_SOURCES:.c=.o) \
+	  $(ASM_SOURCES:.S=.o)
+endif
+
+
+### Include directories
+SHARED_INCLUDES = \
+	-I. \
+	-I$(TOP)/src/mesa/drivers/dri/common \
+	-Iserver \
+	-I$(TOP)/src/mesa/es/glapi/headers-es$(ES_VERSION) \
+	-I$(TOP)/src/mesa/es \
+	-I$(TOP)/include \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/src/egl/main \
+	-I$(TOP)/src/egl/drivers/dri \
+	$(LIBDRM_CFLAGS)
+
+
+##### RULES #####
+
+.c.o:
+	$(CC) -c $(INCLUDES) $(CFLAGS) $(DRIVER_DEFINES) $< -o $@
+
+.S.o:
+	$(CC) -c $(INCLUDES) $(CFLAGS) $(DRIVER_DEFINES)  $< -o $@
+
+
+##### TARGETS #####
+
+# These targets do not name files.  "depend" is a real file and "symlinks"
+# is defined by the including Makefile, so neither is listed here.
+.PHONY: default tags clean install
+
+default: symlinks depend $(LIBNAME) $(TOP)/$(LIB_DIR)/$(LIBNAME)
+
+
+# Relink when the objects, the Mesa core, or the build rules themselves
+# (the driver's Makefile and this file) change.
+$(LIBNAME): $(OBJECTS) $(MESA_MODULES) $(PIPE_DRIVERS) $(WINOBJ) Makefile $(TOP)/src/mesa/drivers/dri/Makefile.es
+	$(MKLIB) -o $@ -noprefix -linker '$(CC)' -ldflags '$(LDFLAGS)' \
+		$(OBJECTS) $(PIPE_DRIVERS) $(MESA_MODULES) $(WINOBJ) $(DRI_LIB_DEPS)
+
+
+$(TOP)/$(LIB_DIR)/$(LIBNAME): $(LIBNAME)
+	$(INSTALL) $(LIBNAME) $(TOP)/$(LIB_DIR)
+
+
+# Regenerate the dependency file from scratch; $(MKDEP) output is discarded.
+depend: $(C_SOURCES) $(ASM_SOURCES) $(SYMLINKS)
+	@ echo "running $(MKDEP)"
+	@ rm -f depend
+	@ touch depend
+	@ $(MKDEP) $(MKDEP_OPTIONS) $(DRIVER_DEFINES) $(INCLUDES) $(C_SOURCES) \
+		$(ASM_SOURCES) > /dev/null 2>/dev/null
+
+
+# Emacs tags
+tags:
+	etags `find . -name \*.[ch]` `find ../include`
+
+
+# Remove .o and backup files
+clean:
+	-rm -f *.o */*.o *~ *.so server/*.o $(SYMLINKS)
+	-rm -f depend depend.bak
+
+
+install: $(LIBNAME)
+	$(INSTALL) -d $(DESTDIR)$(DRI_DRIVER_INSTALL_DIR)
+	$(MINSTALL) -m 755 $(LIBNAME) $(DESTDIR)$(DRI_DRIVER_INSTALL_DIR)
+
+
+# A missing dependency file is not an error.
+-include depend
diff --git a/src/mesa/drivers/dri/Makefile.template b/src/mesa/drivers/dri/Makefile.template
index 2dc3664cc6..18dbeba24a 100644
--- a/src/mesa/drivers/dri/Makefile.template
+++ b/src/mesa/drivers/dri/Makefile.template
@@ -2,13 +2,17 @@
 
 MESA_MODULES = $(TOP)/src/mesa/libmesa.a
 
-COMMON_SOURCES = \
+COMMON_GALLIUM_SOURCES = \
         ../common/utils.c \
-        ../common/texmem.c \
         ../common/vblank.c \
         ../common/dri_util.c \
-        ../common/xmlconfig.c \
-        ../common/drirenderbuffer.c 
+        ../common/xmlconfig.c
+
+COMMON_SOURCES = $(COMMON_GALLIUM_SOURCES) \
+        ../../common/driverfuncs.c \
+        ../common/texmem.c \
+        ../common/drirenderbuffer.c \
+	../common/dri_metaops.c
 
 ifeq ($(WINDOW_SYSTEM),dri)
 WINOBJ=
@@ -59,9 +63,9 @@ SHARED_INCLUDES = \
 default: symlinks depend $(LIBNAME) $(TOP)/$(LIB_DIR)/$(LIBNAME)
 
 
-$(LIBNAME): $(OBJECTS) $(MESA_MODULES) $(WINOBJ) Makefile $(TOP)/src/mesa/drivers/dri/Makefile.template
+$(LIBNAME): $(OBJECTS) $(MESA_MODULES) $(PIPE_DRIVERS) $(WINOBJ) Makefile $(TOP)/src/mesa/drivers/dri/Makefile.template
 	$(MKLIB) -o $@ -noprefix -linker '$(CC)' -ldflags '$(LDFLAGS)' \
-		$(OBJECTS) $(MESA_MODULES) $(WINOBJ) $(DRI_LIB_DEPS)
+		$(OBJECTS) $(PIPE_DRIVERS) $(MESA_MODULES) $(WINOBJ) $(DRI_LIB_DEPS)
 
 
 $(TOP)/$(LIB_DIR)/$(LIBNAME): $(LIBNAME)
@@ -69,9 +73,11 @@ $(TOP)/$(LIB_DIR)/$(LIBNAME): $(LIBNAME)
 
 
 depend: $(C_SOURCES) $(ASM_SOURCES) $(SYMLINKS)
-	touch depend
-	$(MKDEP) $(MKDEP_OPTIONS) $(DRIVER_DEFINES) $(INCLUDES) $(C_SOURCES) \
-		$(ASM_SOURCES)
+	@ echo "running $(MKDEP)"
+	@ rm -f depend
+	@ touch depend
+	@ $(MKDEP) $(MKDEP_OPTIONS) $(DRIVER_DEFINES) $(INCLUDES) $(C_SOURCES) \
+		$(ASM_SOURCES) > /dev/null 2>/dev/null
 
 
 # Emacs tags
@@ -87,7 +93,7 @@ clean:
 
 install: $(LIBNAME)
 	$(INSTALL) -d $(DESTDIR)$(DRI_DRIVER_INSTALL_DIR)
-	$(INSTALL) -m 755 $(LIBNAME) $(DESTDIR)$(DRI_DRIVER_INSTALL_DIR)
+	$(MINSTALL) -m 755 $(LIBNAME) $(DESTDIR)$(DRI_DRIVER_INSTALL_DIR)
 
 
 -include depend
diff --git a/src/mesa/drivers/dri/common/.gitignore b/src/mesa/drivers/dri/common/.gitignore
new file mode 100644
index 0000000000..1edeb79fd1
--- /dev/null
+++ b/src/mesa/drivers/dri/common/.gitignore
@@ -0,0 +1 @@
+*.os
diff --git a/src/mesa/drivers/dri/common/dri_metaops.c b/src/mesa/drivers/dri/common/dri_metaops.c
new file mode 100644
index 0000000000..c7bea07dc9
--- /dev/null
+++ b/src/mesa/drivers/dri/common/dri_metaops.c
@@ -0,0 +1,298 @@
+/**************************************************************************
+ *
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2009 Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "main/arrayobj.h"
+#include "main/attrib.h"
+#include "main/blend.h"
+#include "main/bufferobj.h"
+#include "main/buffers.h"
+#include "main/depth.h"
+#include "main/enable.h"
+#include "main/matrix.h"
+#include "main/macros.h"
+#include "main/polygon.h"
+#include "main/shaders.h"
+#include "main/stencil.h"
+#include "main/texstate.h"
+#include "main/varray.h"
+#include "main/viewport.h"
+#include "shader/arbprogram.h"
+#include "shader/program.h"
+#include "dri_metaops.h"
+
+/**
+ * Save the current viewport and matrix mode in \p meta, then set up a
+ * viewport covering the whole draw buffer, an orthographic projection
+ * spanning 0..Width by 0..Height, and an identity modelview matrix.
+ *
+ * The projection and modelview matrices are pushed here and popped again
+ * by meta_restore_transform().
+ */
+void
+meta_set_passthrough_transform(struct dri_metaops *meta)
+{
+   GLcontext *ctx = meta->ctx;
+
+   /* remember what meta_restore_transform() has to put back */
+   meta->saved_vp_x = ctx->Viewport.X;
+   meta->saved_vp_y = ctx->Viewport.Y;
+   meta->saved_vp_width = ctx->Viewport.Width;
+   meta->saved_vp_height = ctx->Viewport.Height;
+   meta->saved_matrix_mode = ctx->Transform.MatrixMode;
+
+   /* internal_viewport_call is raised only around the _mesa_Viewport()
+    * call -- presumably so the driver's viewport hook can tell this apart
+    * from an application call; confirm in the driver.
+    */
+   meta->internal_viewport_call = GL_TRUE;
+   _mesa_Viewport(0, 0, ctx->DrawBuffer->Width, ctx->DrawBuffer->Height);
+   meta->internal_viewport_call = GL_FALSE;
+
+   _mesa_MatrixMode(GL_PROJECTION);
+   _mesa_PushMatrix();
+   _mesa_LoadIdentity();
+   _mesa_Ortho(0, ctx->DrawBuffer->Width, 0, ctx->DrawBuffer->Height, 1, -1);
+
+   _mesa_MatrixMode(GL_MODELVIEW);
+   _mesa_PushMatrix();
+   _mesa_LoadIdentity();
+}
+
+/**
+ * Undo meta_set_passthrough_transform(): pop the projection and modelview
+ * matrices, then reinstate the saved matrix mode and viewport.
+ */
+void
+meta_restore_transform(struct dri_metaops *meta)
+{
+   _mesa_MatrixMode(GL_PROJECTION);
+   _mesa_PopMatrix();
+   _mesa_MatrixMode(GL_MODELVIEW);
+   _mesa_PopMatrix();
+
+   _mesa_MatrixMode(meta->saved_matrix_mode);
+
+   /* flag the viewport call as internal, as on the set side */
+   meta->internal_viewport_call = GL_TRUE;
+   _mesa_Viewport(meta->saved_vp_x, meta->saved_vp_y,
+		  meta->saved_vp_width, meta->saved_vp_height);
+   meta->internal_viewport_call = GL_FALSE;
+}
+
+
+/**
+ * Bind and enable a vertex program for the pixel path that transforms the
+ * position by the modelview-projection matrix and passes the first
+ * texcoord and the color through unchanged.
+ *
+ * The previously current vertex program and the enable flag are stored in
+ * \p meta for meta_restore_vertex_program().  The passthrough program is
+ * built on first use and cached in meta->passthrough_vp.
+ */
+void
+meta_set_passthrough_vertex_program(struct dri_metaops *meta)
+{
+   GLcontext *ctx = meta->ctx;
+   static const char *vp =
+      "!!ARBvp1.0\n"
+      "TEMP vertexClip;\n"
+      "DP4 vertexClip.x, state.matrix.mvp.row[0], vertex.position;\n"
+      "DP4 vertexClip.y, state.matrix.mvp.row[1], vertex.position;\n"
+      "DP4 vertexClip.z, state.matrix.mvp.row[2], vertex.position;\n"
+      "DP4 vertexClip.w, state.matrix.mvp.row[3], vertex.position;\n"
+      "MOV result.position, vertexClip;\n"
+      "MOV result.texcoord[0], vertex.texcoord[0];\n"
+      "MOV result.color, vertex.color;\n"
+      "END\n";
+
+   /* set/restore calls must be paired: the restore resets saved_vp to NULL */
+   assert(meta->saved_vp == NULL);
+
+   _mesa_reference_vertprog(ctx, &meta->saved_vp,
+			    ctx->VertexProgram.Current);
+   if (meta->passthrough_vp == NULL) {
+      /* First use: compile the program under a temporary name, keep a
+       * pointer to the resulting object, then delete the name again.
+       * Presumably the pointer stored in passthrough_vp holds a reference
+       * that keeps the object alive -- confirm in _mesa_reference_vertprog.
+       */
+      GLuint prog_name;
+      _mesa_GenPrograms(1, &prog_name);
+      _mesa_BindProgram(GL_VERTEX_PROGRAM_ARB, prog_name);
+      _mesa_ProgramStringARB(GL_VERTEX_PROGRAM_ARB,
+			     GL_PROGRAM_FORMAT_ASCII_ARB,
+			     strlen(vp), (const GLubyte *)vp);
+      _mesa_reference_vertprog(ctx, &meta->passthrough_vp,
+			       ctx->VertexProgram.Current);
+      _mesa_DeletePrograms(1, &prog_name);
+   }
+
+   /* make the cached program current and tell the driver about it */
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
+   _mesa_reference_vertprog(ctx, &ctx->VertexProgram.Current,
+			    meta->passthrough_vp);
+   ctx->Driver.BindProgram(ctx, GL_VERTEX_PROGRAM_ARB,
+			   &meta->passthrough_vp->Base);
+
+   meta->saved_vp_enable = ctx->VertexProgram.Enabled;
+   _mesa_Enable(GL_VERTEX_PROGRAM_ARB);
+}
+
+/**
+ * Restores the previous vertex program after
+ * meta_set_passthrough_vertex_program(): rebinds the saved program,
+ * clears meta->saved_vp, and disables GL_VERTEX_PROGRAM_ARB again if it
+ * was disabled before.
+ */
+void
+meta_restore_vertex_program(struct dri_metaops *meta)
+{
+   GLcontext *ctx = meta->ctx;
+
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
+   _mesa_reference_vertprog(ctx, &ctx->VertexProgram.Current,
+			    meta->saved_vp);
+   /* reset saved_vp so the next set call's assert passes */
+   _mesa_reference_vertprog(ctx, &meta->saved_vp, NULL);
+   ctx->Driver.BindProgram(ctx, GL_VERTEX_PROGRAM_ARB,
+			   &ctx->VertexProgram.Current->Base);
+
+   /* the set side enabled it unconditionally */
+   if (!meta->saved_vp_enable)
+      _mesa_Disable(GL_VERTEX_PROGRAM_ARB);
+}
+
+/**
+ * Binds the given program string to GL_FRAGMENT_PROGRAM_ARB, caching the
+ * program object.
+ *
+ * The previously current fragment program and the enable flag are stored
+ * in \p meta for meta_restore_fragment_program().
+ *
+ * \param meta         metaops state holding the saved program
+ * \param prog         in/out cache slot; when *prog is NULL the program is
+ *                     compiled from \p prog_string and stored here
+ * \param prog_string  program text; only read when *prog is NULL
+ */
+void
+meta_set_fragment_program(struct dri_metaops *meta,
+			  struct gl_fragment_program **prog,
+			  const char *prog_string)
+{
+   GLcontext *ctx = meta->ctx;
+   /* set/restore calls must be paired: the restore resets saved_fp to NULL */
+   assert(meta->saved_fp == NULL);
+
+   _mesa_reference_fragprog(ctx, &meta->saved_fp,
+			    ctx->FragmentProgram.Current);
+   if (*prog == NULL) {
+      /* first use of this cache slot: compile under a temporary name */
+      GLuint prog_name;
+      _mesa_GenPrograms(1, &prog_name);
+      _mesa_BindProgram(GL_FRAGMENT_PROGRAM_ARB, prog_name);
+      _mesa_ProgramStringARB(GL_FRAGMENT_PROGRAM_ARB,
+			     GL_PROGRAM_FORMAT_ASCII_ARB,
+			     strlen(prog_string), (const GLubyte *)prog_string);
+      _mesa_reference_fragprog(ctx, prog, ctx->FragmentProgram.Current);
+      /* Note that DeletePrograms unbinds the program on us */
+      _mesa_DeletePrograms(1, &prog_name);
+   }
+
+   /* make the cached program current and tell the driver about it */
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
+   _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current, *prog);
+   ctx->Driver.BindProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, &((*prog)->Base));
+
+   meta->saved_fp_enable = ctx->FragmentProgram.Enabled;
+   _mesa_Enable(GL_FRAGMENT_PROGRAM_ARB);
+}
+
+/**
+ * Restores the previous fragment program after
+ * meta_set_fragment_program(): rebinds the saved program, clears
+ * meta->saved_fp, and disables GL_FRAGMENT_PROGRAM_ARB again if it was
+ * disabled before.
+ */
+void
+meta_restore_fragment_program(struct dri_metaops *meta)
+{
+   GLcontext *ctx = meta->ctx;
+
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
+   _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current,
+			    meta->saved_fp);
+   /* reset saved_fp so the next set call's assert passes */
+   _mesa_reference_fragprog(ctx, &meta->saved_fp, NULL);
+   ctx->Driver.BindProgram(ctx, GL_FRAGMENT_PROGRAM_ARB,
+			   &ctx->FragmentProgram.Current->Base);
+
+   /* the set side enabled it unconditionally */
+   if (!meta->saved_fp_enable)
+      _mesa_Disable(GL_FRAGMENT_PROGRAM_ARB);
+}
+
+/* Texcoords mapping the full [0,1] x [0,1] range onto a four-vertex quad. */
+static const float default_texcoords[4][2] = { { 0.0, 0.0 },
+					       { 1.0, 0.0 },
+					       { 1.0, 1.0 },
+					       { 0.0, 1.0 } };
+
+/**
+ * Point texture-coordinate array 0 at a cached VBO holding
+ * default_texcoords and enable it.
+ *
+ * The previous array-buffer binding, the unit-0 texcoord array state and
+ * the client active texture unit are stored in \p meta so that
+ * meta_restore_texcoords() can put them back.
+ */
+void
+meta_set_default_texrect(struct dri_metaops *meta)
+{
+   GLcontext *ctx = meta->ctx;
+   struct gl_client_array *old_texcoord_array;
+
+   /* Save the *client* active texture unit: that is the state
+    * _mesa_ClientActiveTextureARB() changes below and the one
+    * meta_restore_texcoords() re-selects.  The server-side
+    * ctx->Texture.CurrentUnit is separate state and is not modified here,
+    * so saving it would restore the wrong unit whenever the two differ.
+    */
+   meta->saved_active_texture = ctx->Array.ActiveTexture;
+   if (meta->saved_array_vbo == NULL) {
+      _mesa_reference_buffer_object(ctx, &meta->saved_array_vbo,
+				    ctx->Array.ArrayBufferObj);
+   }
+
+   /* snapshot the application's unit-0 texcoord array */
+   old_texcoord_array = &ctx->Array.ArrayObj->TexCoord[0];
+   meta->saved_texcoord_type = old_texcoord_array->Type;
+   meta->saved_texcoord_size = old_texcoord_array->Size;
+   meta->saved_texcoord_stride = old_texcoord_array->Stride;
+   meta->saved_texcoord_enable = old_texcoord_array->Enabled;
+   meta->saved_texcoord_ptr = old_texcoord_array->Ptr;
+   _mesa_reference_buffer_object(ctx, &meta->saved_texcoord_vbo,
+				 old_texcoord_array->BufferObj);
+
+   _mesa_ClientActiveTextureARB(GL_TEXTURE0);
+
+   if (meta->texcoord_vbo == NULL) {
+      /* first use: create and fill the VBO, then cache the object */
+      GLuint vbo_name;
+
+      _mesa_GenBuffersARB(1, &vbo_name);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, vbo_name);
+      _mesa_BufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(default_texcoords),
+			  default_texcoords, GL_STATIC_DRAW_ARB);
+      _mesa_reference_buffer_object(ctx, &meta->texcoord_vbo,
+				    ctx->Array.ArrayBufferObj);
+   } else {
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB,
+			  meta->texcoord_vbo->Name);
+   }
+   /* NULL is an offset into the bound VBO, not a client pointer */
+   _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), NULL);
+
+   _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
+}
+
+/**
+ * Undo meta_set_default_texrect(): reinstate the saved texcoord array
+ * (buffer binding, size, type, stride, pointer, enable flag), re-select
+ * the saved client active texture unit, and rebind the saved array
+ * buffer.  The saved buffer-object pointers in \p meta are reset to NULL.
+ */
+void
+meta_restore_texcoords(struct dri_metaops *meta)
+{
+   GLcontext *ctx = meta->ctx;
+
+   /* Restore the old TexCoordPointer */
+   /* the array's buffer has to be bound before the pointer is re-specified */
+   if (meta->saved_texcoord_vbo) {
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB,
+			  meta->saved_texcoord_vbo->Name);
+      _mesa_reference_buffer_object(ctx, &meta->saved_texcoord_vbo, NULL);
+   } else {
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
+   }
+
+   _mesa_TexCoordPointer(meta->saved_texcoord_size,
+			 meta->saved_texcoord_type,
+			 meta->saved_texcoord_stride,
+			 meta->saved_texcoord_ptr);
+   /* the set side enabled the array unconditionally */
+   if (!meta->saved_texcoord_enable)
+      _mesa_Disable(GL_TEXTURE_COORD_ARRAY);
+
+   _mesa_ClientActiveTextureARB(GL_TEXTURE0 +
+				meta->saved_active_texture);
+
+   /* finally put back the array-buffer binding that was current on entry */
+   if (meta->saved_array_vbo) {
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB,
+			  meta->saved_array_vbo->Name);
+      _mesa_reference_buffer_object(ctx, &meta->saved_array_vbo, NULL);
+   } else {
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
+   }
+}
+
+
+/**
+ * Attach \p meta to a GL context.
+ * Only the ctx field is written; the other fields are left as the caller
+ * provided them.  NOTE(review): the NULL checks in the set functions
+ * assume the struct was zeroed beforehand -- confirm at the call sites.
+ */
+void meta_init_metaops(GLcontext *ctx, struct dri_metaops *meta)
+{
+   meta->ctx = ctx;
+}
+
+/**
+ * Tear down \p meta.  Currently a no-op.
+ * NOTE(review): the objects cached in the struct (passthrough_vp,
+ * texcoord_vbo, cached fragment programs) are not released here --
+ * confirm whether they are released elsewhere.
+ */
+void meta_destroy_metaops(struct dri_metaops *meta)
+{
+
+}
diff --git a/src/mesa/drivers/dri/common/dri_metaops.h b/src/mesa/drivers/dri/common/dri_metaops.h
new file mode 100644
index 0000000000..2487145326
--- /dev/null
+++ b/src/mesa/drivers/dri/common/dri_metaops.h
@@ -0,0 +1,81 @@
+/**************************************************************************
+ *
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2009 Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef DRI_METAOPS_H
+#define DRI_METAOPS_H
+
+
+/**
+ * State shared by the meta_* helpers: cached GL objects plus the
+ * application state saved by each meta_set_* call for the matching
+ * meta_restore_* call.
+ */
+struct dri_metaops {
+    /** context these helpers operate on; set by meta_init_metaops() */
+    GLcontext *ctx;
+    /** GL_TRUE only while a meta helper is inside _mesa_Viewport() */
+    GLboolean internal_viewport_call;
+    /** presumably a cache slot for meta_set_fragment_program() -- confirm */
+    struct gl_fragment_program *bitmap_fp;
+    /** cached program from meta_set_passthrough_vertex_program() */
+    struct gl_vertex_program *passthrough_vp;
+    /** cached VBO holding the default quad texcoords */
+    struct gl_buffer_object *texcoord_vbo;
+    
+    /** fragment program and enable flag saved by meta_set_fragment_program() */
+    struct gl_fragment_program *saved_fp;
+    GLboolean saved_fp_enable;
+    /** vertex program and enable flag saved by
+     *  meta_set_passthrough_vertex_program() */
+    struct gl_vertex_program *saved_vp;
+    GLboolean saved_vp_enable;
+
+    /** presumably a cache slot for meta_set_fragment_program() -- confirm */
+    struct gl_fragment_program *tex2d_fp;
+    
+    /** texcoord array 0 state and buffer bindings saved by
+     *  meta_set_default_texrect() */
+    GLboolean saved_texcoord_enable;
+    struct gl_buffer_object *saved_array_vbo, *saved_texcoord_vbo;
+    GLenum saved_texcoord_type;
+    GLsizei saved_texcoord_size, saved_texcoord_stride;
+    const void *saved_texcoord_ptr;
+    /** unit index re-selected as client active texture on restore */
+    int saved_active_texture;
+
+    /** viewport and matrix mode saved by meta_set_passthrough_transform() */
+    GLint saved_vp_x, saved_vp_y;
+    GLsizei saved_vp_width, saved_vp_height;
+    GLenum saved_matrix_mode;
+};
+
+
+/* Each meta_set_* call below is undone by the meta_restore_* call that
+ * follows it in this list.
+ */
+
+void meta_set_passthrough_transform(struct dri_metaops *meta);
+
+void meta_restore_transform(struct dri_metaops *meta);
+
+void meta_set_passthrough_vertex_program(struct dri_metaops *meta);
+
+void meta_restore_vertex_program(struct dri_metaops *meta);
+
+void meta_set_fragment_program(struct dri_metaops *meta,
+			  struct gl_fragment_program **prog,
+			  const char *prog_string);
+
+void meta_restore_fragment_program(struct dri_metaops *meta);
+
+void meta_set_default_texrect(struct dri_metaops *meta);
+
+void meta_restore_texcoords(struct dri_metaops *meta);
+
+void meta_init_metaops(GLcontext *ctx, struct dri_metaops *meta);
+void meta_destroy_metaops(struct dri_metaops *meta);
+
+#endif
diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index ae79055405..e48e10d7c0 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -1,4 +1,3 @@
-/* $XFree86: xc/lib/GL/dri/dri_util.c,v 1.7 2003/04/28 17:01:25 dawes Exp $ */
 /**
  * \file dri_util.c
  * DRI utility functions.
@@ -37,6 +36,9 @@
 typedef GLboolean ( * PFNGLXGETMSCRATEOMLPROC) (__DRIdrawable *drawable, int32_t *numerator, int32_t *denominator);
 #endif
 
+static void dri_get_drawable(__DRIdrawable *pdp);
+static void dri_put_drawable(__DRIdrawable *pdp);
+
 /**
  * This is just a token extension used to signal that the driver
  * supports setting a read drawable.
@@ -59,7 +61,7 @@ __driUtilMessage(const char *f, ...)
     va_list args;
 
     if (getenv("LIBGL_DEBUG")) {
-        fprintf(stderr, "libGL error: \n");
+        fprintf(stderr, "libGL: ");
         va_start(args, f);
         vfprintf(stderr, f, args);
         va_end(args);
@@ -119,6 +121,9 @@ static int driUnbindContext(__DRIcontext *pcp)
     pdp = pcp->driDrawablePriv;
     prp = pcp->driReadablePriv;
 
+    /* already unbound */
+    if (!pdp && !prp)
+      return GL_TRUE;
     /* Let driver unbind drawable from context */
     (*psp->DriverAPI.UnbindContext)(pcp);
 
@@ -127,7 +132,7 @@ static int driUnbindContext(__DRIcontext *pcp)
 	return GL_FALSE;
     }
 
-    pdp->refcount--;
+    dri_put_drawable(pdp);
 
     if (prp != pdp) {
         if (prp->refcount == 0) {
@@ -135,7 +140,7 @@ static int driUnbindContext(__DRIcontext *pcp)
 	    return GL_FALSE;
 	}
 
-	prp->refcount--;
+    	dri_put_drawable(prp);
     }
 
 
@@ -143,9 +148,10 @@ static int driUnbindContext(__DRIcontext *pcp)
      * window we can determine the last context bound to the window and
      * use that context's lock. (BrianP, 2-Dec-2000)
      */
+    pcp->driDrawablePriv = pcp->driReadablePriv = NULL;
+
 #if 0
     /* Unbind the drawable */
-    pcp->driDrawablePriv = NULL;
     pdp->driContextPriv = &psp->dummyContextPriv;
 #endif
 
@@ -163,21 +169,18 @@ static int driBindContext(__DRIcontext *pcp,
 {
     __DRIscreenPrivate *psp = pcp->driScreenPriv;
 
-    /*
-    ** Assume error checking is done properly in glXMakeCurrent before
-    ** calling driBindContext.
-    */
-
-    if (pcp == NULL || pdp == None || prp == None)
-	return GL_FALSE;
-
     /* Bind the drawable to the context */
-    pcp->driDrawablePriv = pdp;
-    pcp->driReadablePriv = prp;
-    pdp->driContextPriv = pcp;
-    pdp->refcount++;
-    if ( pdp != prp ) {
-	prp->refcount++;
+
+    if (pcp) {
+	pcp->driDrawablePriv = pdp;
+	pcp->driReadablePriv = prp;
+	if (pdp) {
+	    pdp->driContextPriv = pcp;
+    	    dri_get_drawable(pdp);
+	}
+	if ( prp && pdp != prp ) {
+    	    dri_get_drawable(prp);
+	}
     }
 
     /*
@@ -186,23 +189,21 @@ static int driBindContext(__DRIcontext *pcp,
     */
 
     if (!psp->dri2.enabled) {
-	if (!pdp->pStamp || *pdp->pStamp != pdp->lastStamp) {
+	if (pdp && !pdp->pStamp) {
 	    DRM_SPINLOCK(&psp->pSAREA->drawable_lock, psp->drawLockID);
 	    __driUtilUpdateDrawableInfo(pdp);
 	    DRM_SPINUNLOCK(&psp->pSAREA->drawable_lock, psp->drawLockID);
 	}
-	
-	if ((pdp != prp) && (!prp->pStamp || *prp->pStamp != prp->lastStamp)) {
+	if (prp && pdp != prp && !prp->pStamp) {
 	    DRM_SPINLOCK(&psp->pSAREA->drawable_lock, psp->drawLockID);
 	    __driUtilUpdateDrawableInfo(prp);
 	    DRM_SPINUNLOCK(&psp->pSAREA->drawable_lock, psp->drawLockID);
-	}
+        }
     }
 
     /* Call device-specific MakeCurrent */
-    (*psp->DriverAPI.MakeCurrent)(pcp, pdp, prp);
 
-    return GL_TRUE;
+    return (*psp->DriverAPI.MakeCurrent)(pcp, pdp, prp);
 }
 
 /*@}*/
@@ -316,12 +317,12 @@ static void driSwapBuffers(__DRIdrawable *dPriv)
     __DRIscreen *psp = dPriv->driScreenPriv;
     drm_clip_rect_t *rects;
     int i;
-    
-    if (!dPriv->numClipRects)
-        return;
 
     psp->DriverAPI.SwapBuffers(dPriv);
 
+    if (!dPriv->numClipRects)
+        return;
+
     rects = _mesa_malloc(sizeof(*rects) * dPriv->numClipRects);
 
     if (!rects)
@@ -435,7 +436,7 @@ driCreateNewDrawable(__DRIscreen *psp, const __DRIconfig *config,
 
     pdp->loaderPrivate = data;
     pdp->hHWDrawable = hwDrawable;
-    pdp->refcount = 0;
+    pdp->refcount = 1;
     pdp->pStamp = NULL;
     pdp->lastStamp = 0;
     pdp->index = 0;
@@ -488,12 +489,19 @@ dri2CreateNewDrawable(__DRIscreen *screen,
     return pdraw;
 }
 
-
-static void
-driDestroyDrawable(__DRIdrawable *pdp)
+static void dri_get_drawable(__DRIdrawable *pdp)
+{
+    pdp->refcount++;	/* take a reference; released by dri_put_drawable() */
+}
+	
+static void dri_put_drawable(__DRIdrawable *pdp)
 {
     __DRIscreenPrivate *psp;
 
+    /* NULL-safe: drop one reference; tear down only when the last one goes. */
+    if (!pdp || --pdp->refcount)
+	return;
+
     if (pdp) {
 	psp = pdp->driScreenPriv;
         (*psp->DriverAPI.DestroyBuffer)(pdp);
@@ -509,6 +517,12 @@ driDestroyDrawable(__DRIdrawable *pdp)
     }
 }
 
+static void
+driDestroyDrawable(__DRIdrawable *pdp)
+{
+    dri_put_drawable(pdp);	/* drops a reference; teardown happens in dri_put_drawable() at refcount 0 */
+}
+
 /*@}*/
 
 
@@ -764,7 +778,7 @@ dri2CreateNewScreen(int scrn, int fd,
     if (driDriverAPI.InitScreen2 == NULL)
         return NULL;
 
-    psp = _mesa_malloc(sizeof(*psp));
+    psp = _mesa_calloc(sizeof(*psp));
     if (!psp)
 	return NULL;
 
diff --git a/src/mesa/drivers/dri/common/dri_util.h b/src/mesa/drivers/dri/common/dri_util.h
index c6781f1c7a..c95a5c8299 100644
--- a/src/mesa/drivers/dri/common/dri_util.h
+++ b/src/mesa/drivers/dri/common/dri_util.h
@@ -1,25 +1,3 @@
-/* $XFree86: xc/lib/GL/dri/dri_util.h,v 1.1 2002/02/22 21:32:52 dawes Exp $ */
-/**
- * \file dri_util.h
- * DRI utility functions definitions.
- *
- * This module acts as glue between GLX and the actual hardware driver.  A DRI
- * driver doesn't really \e have to use any of this - it's optional.  But, some
- * useful stuff is done here that otherwise would have to be duplicated in most
- * drivers.
- * 
- * Basically, these utility functions take care of some of the dirty details of
- * screen initialization, context creation, context binding, DRM setup, etc.
- *
- * These functions are compiled into each DRI driver so libGL.so knows nothing
- * about them.
- *
- * \sa dri_util.c.
- * 
- * \author Kevin E. Martin <kevin@precisioninsight.com>
- * \author Brian Paul <brian@precisioninsight.com>
- */
-
 /*
  * Copyright 1998-1999 Precision Insight, Inc., Cedar Park, Texas.
  * All Rights Reserved.
@@ -45,6 +23,26 @@
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
+/**
+ * \file dri_util.h
+ * DRI utility functions definitions.
+ *
+ * This module acts as glue between GLX and the actual hardware driver.  A DRI
+ * driver doesn't really \e have to use any of this - it's optional.  But, some
+ * useful stuff is done here that otherwise would have to be duplicated in most
+ * drivers.
+ * 
+ * Basically, these utility functions take care of some of the dirty details of
+ * screen initialization, context creation, context binding, DRM setup, etc.
+ *
+ * These functions are compiled into each DRI driver so libGL.so knows nothing
+ * about them.
+ *
+ * \sa dri_util.c.
+ * 
+ * \author Kevin E. Martin <kevin@precisioninsight.com>
+ * \author Brian Paul <brian@precisioninsight.com>
+ */
 
 #ifndef _DRI_UTIL_H_
 #define _DRI_UTIL_H_
diff --git a/src/mesa/drivers/dri/common/drirenderbuffer.c b/src/mesa/drivers/dri/common/drirenderbuffer.c
index b99bf2033b..15af99136c 100644
--- a/src/mesa/drivers/dri/common/drirenderbuffer.c
+++ b/src/mesa/drivers/dri/common/drirenderbuffer.c
@@ -209,6 +209,8 @@ driUpdateFramebufferSize(GLcontext *ctx, const __DRIdrawablePrivate *dPriv)
    struct gl_framebuffer *fb = (struct gl_framebuffer *) dPriv->driverPrivate;
    if (fb && (dPriv->w != fb->Width || dPriv->h != fb->Height)) {
       ctx->Driver.ResizeBuffers(ctx, fb, dPriv->w, dPriv->h);
+      /* if the driver needs the hw lock for ResizeBuffers, the drawable
+         might have changed again by now */
       assert(fb->Width == dPriv->w);
       assert(fb->Height == dPriv->h);
    }
diff --git a/src/mesa/drivers/dri/common/extension_helper.h b/src/mesa/drivers/dri/common/extension_helper.h
index b977ebf015..49e7278adb 100644
--- a/src/mesa/drivers/dri/common/extension_helper.h
+++ b/src/mesa/drivers/dri/common/extension_helper.h
@@ -32,8 +32,10 @@
 # define NULL 0
 #endif
 
+#if defined(IN_DRI_DRIVER)
+
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char UniformMatrix3fvARB_names[] = 
+static const char UniformMatrix3fvARB_names[] =
     "iiip\0" /* Parameter signature */
     "glUniformMatrix3fv\0"
     "glUniformMatrix3fvARB\0"
@@ -41,7 +43,7 @@ static const char UniformMatrix3fvARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3) || defined(need_GL_ARB_multisample)
-static const char SampleCoverageARB_names[] = 
+static const char SampleCoverageARB_names[] =
     "fi\0" /* Parameter signature */
     "glSampleCoverage\0"
     "glSampleCoverageARB\0"
@@ -49,7 +51,7 @@ static const char SampleCoverageARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_convolution)
-static const char ConvolutionFilter1D_names[] = 
+static const char ConvolutionFilter1D_names[] =
     "iiiiip\0" /* Parameter signature */
     "glConvolutionFilter1D\0"
     "glConvolutionFilter1DEXT\0"
@@ -57,7 +59,7 @@ static const char ConvolutionFilter1D_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_occlusion_query)
-static const char BeginQueryARB_names[] = 
+static const char BeginQueryARB_names[] =
     "ii\0" /* Parameter signature */
     "glBeginQuery\0"
     "glBeginQueryARB\0"
@@ -65,7 +67,7 @@ static const char BeginQueryARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_NV_point_sprite)
-static const char PointParameteriNV_names[] = 
+static const char PointParameteriNV_names[] =
     "ii\0" /* Parameter signature */
     "glPointParameteri\0"
     "glPointParameteriNV\0"
@@ -73,14 +75,14 @@ static const char PointParameteriNV_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0)
-static const char GetProgramiv_names[] = 
+static const char GetProgramiv_names[] =
     "iip\0" /* Parameter signature */
     "glGetProgramiv\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord3sARB_names[] = 
+static const char MultiTexCoord3sARB_names[] =
     "iiii\0" /* Parameter signature */
     "glMultiTexCoord3s\0"
     "glMultiTexCoord3sARB\0"
@@ -88,7 +90,7 @@ static const char MultiTexCoord3sARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColor3iEXT_names[] = 
+static const char SecondaryColor3iEXT_names[] =
     "iii\0" /* Parameter signature */
     "glSecondaryColor3i\0"
     "glSecondaryColor3iEXT\0"
@@ -96,7 +98,7 @@ static const char SecondaryColor3iEXT_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_window_pos) || defined(need_GL_MESA_window_pos)
-static const char WindowPos3fMESA_names[] = 
+static const char WindowPos3fMESA_names[] =
     "fff\0" /* Parameter signature */
     "glWindowPos3f\0"
     "glWindowPos3fARB\0"
@@ -105,43 +107,44 @@ static const char WindowPos3fMESA_names[] =
 #endif
 
 #if defined(need_GL_SGIS_pixel_texture)
-static const char PixelTexGenParameterfvSGIS_names[] = 
+static const char PixelTexGenParameterfvSGIS_names[] =
     "ip\0" /* Parameter signature */
     "glPixelTexGenParameterfvSGIS\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char ActiveTextureARB_names[] = 
+static const char ActiveTextureARB_names[] =
     "i\0" /* Parameter signature */
     "glActiveTexture\0"
     "glActiveTextureARB\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_blit)
-static const char BlitFramebufferEXT_names[] = 
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_blit)
+static const char BlitFramebufferEXT_names[] =
     "iiiiiiiiii\0" /* Parameter signature */
+    "glBlitFramebuffer\0"
     "glBlitFramebufferEXT\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib4ubvNV_names[] = 
+static const char VertexAttrib4ubvNV_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4ubvNV\0"
     "";
 #endif
 
 #if defined(need_GL_NV_fragment_program)
-static const char GetProgramNamedParameterdvNV_names[] = 
+static const char GetProgramNamedParameterdvNV_names[] =
     "iipp\0" /* Parameter signature */
     "glGetProgramNamedParameterdvNV\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_histogram)
-static const char Histogram_names[] = 
+static const char Histogram_names[] =
     "iiii\0" /* Parameter signature */
     "glHistogram\0"
     "glHistogramEXT\0"
@@ -149,14 +152,14 @@ static const char Histogram_names[] =
 #endif
 
 #if defined(need_GL_SGIS_texture4D)
-static const char TexImage4DSGIS_names[] = 
+static const char TexImage4DSGIS_names[] =
     "iiiiiiiiiip\0" /* Parameter signature */
     "glTexImage4DSGIS\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_window_pos) || defined(need_GL_MESA_window_pos)
-static const char WindowPos2dvMESA_names[] = 
+static const char WindowPos2dvMESA_names[] =
     "p\0" /* Parameter signature */
     "glWindowPos2dv\0"
     "glWindowPos2dvARB\0"
@@ -165,14 +168,14 @@ static const char WindowPos2dvMESA_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char ReplacementCodeuiColor3fVertex3fvSUN_names[] = 
+static const char ReplacementCodeuiColor3fVertex3fvSUN_names[] =
     "ppp\0" /* Parameter signature */
     "glReplacementCodeuiColor3fVertex3fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_EXT_blend_equation_separate) || defined(need_GL_ATI_blend_equation_separate)
-static const char BlendEquationSeparateEXT_names[] = 
+static const char BlendEquationSeparateEXT_names[] =
     "ii\0" /* Parameter signature */
     "glBlendEquationSeparate\0"
     "glBlendEquationSeparateEXT\0"
@@ -181,14 +184,14 @@ static const char BlendEquationSeparateEXT_names[] =
 #endif
 
 #if defined(need_GL_SGIX_list_priority)
-static const char ListParameterfSGIX_names[] = 
+static const char ListParameterfSGIX_names[] =
     "iif\0" /* Parameter signature */
     "glListParameterfSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColor3bEXT_names[] = 
+static const char SecondaryColor3bEXT_names[] =
     "iii\0" /* Parameter signature */
     "glSecondaryColor3b\0"
     "glSecondaryColor3bEXT\0"
@@ -196,21 +199,21 @@ static const char SecondaryColor3bEXT_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char TexCoord4fColor4fNormal3fVertex4fvSUN_names[] = 
+static const char TexCoord4fColor4fNormal3fVertex4fvSUN_names[] =
     "pppp\0" /* Parameter signature */
     "glTexCoord4fColor4fNormal3fVertex4fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib4svNV_names[] = 
+static const char VertexAttrib4svNV_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4svNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_vertex_buffer_object)
-static const char GetBufferSubDataARB_names[] = 
+static const char GetBufferSubDataARB_names[] =
     "iiip\0" /* Parameter signature */
     "glGetBufferSubData\0"
     "glGetBufferSubDataARB\0"
@@ -218,7 +221,7 @@ static const char GetBufferSubDataARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_vertex_buffer_object)
-static const char BufferSubDataARB_names[] = 
+static const char BufferSubDataARB_names[] =
     "iiip\0" /* Parameter signature */
     "glBufferSubData\0"
     "glBufferSubDataARB\0"
@@ -226,21 +229,21 @@ static const char BufferSubDataARB_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char TexCoord2fColor4ubVertex3fvSUN_names[] = 
+static const char TexCoord2fColor4ubVertex3fvSUN_names[] =
     "ppp\0" /* Parameter signature */
     "glTexCoord2fColor4ubVertex3fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0)
-static const char AttachShader_names[] = 
+static const char AttachShader_names[] =
     "ii\0" /* Parameter signature */
     "glAttachShader\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib2fARB_names[] = 
+static const char VertexAttrib2fARB_names[] =
     "iff\0" /* Parameter signature */
     "glVertexAttrib2f\0"
     "glVertexAttrib2fARB\0"
@@ -248,14 +251,14 @@ static const char VertexAttrib2fARB_names[] =
 #endif
 
 #if defined(need_GL_MESA_shader_debug)
-static const char GetDebugLogLengthMESA_names[] = 
+static const char GetDebugLogLengthMESA_names[] =
     "iii\0" /* Parameter signature */
     "glGetDebugLogLengthMESA\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib3fARB_names[] = 
+static const char VertexAttrib3fARB_names[] =
     "ifff\0" /* Parameter signature */
     "glVertexAttrib3f\0"
     "glVertexAttrib3fARB\0"
@@ -263,7 +266,7 @@ static const char VertexAttrib3fARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_occlusion_query)
-static const char GetQueryivARB_names[] = 
+static const char GetQueryivARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetQueryiv\0"
     "glGetQueryivARB\0"
@@ -271,7 +274,7 @@ static const char GetQueryivARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_texture3D)
-static const char TexImage3D_names[] = 
+static const char TexImage3D_names[] =
     "iiiiiiiiip\0" /* Parameter signature */
     "glTexImage3D\0"
     "glTexImage3DEXT\0"
@@ -279,14 +282,14 @@ static const char TexImage3D_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char ReplacementCodeuiVertex3fvSUN_names[] = 
+static const char ReplacementCodeuiVertex3fvSUN_names[] =
     "pp\0" /* Parameter signature */
     "glReplacementCodeuiVertex3fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_occlusion_query)
-static const char GetQueryObjectivARB_names[] = 
+static const char GetQueryObjectivARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetQueryObjectiv\0"
     "glGetQueryObjectivARB\0"
@@ -294,14 +297,14 @@ static const char GetQueryObjectivARB_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char ReplacementCodeuiTexCoord2fVertex3fvSUN_names[] = 
+static const char ReplacementCodeuiTexCoord2fVertex3fvSUN_names[] =
     "ppp\0" /* Parameter signature */
     "glReplacementCodeuiTexCoord2fVertex3fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3) || defined(need_GL_ARB_texture_compression)
-static const char CompressedTexSubImage2DARB_names[] = 
+static const char CompressedTexSubImage2DARB_names[] =
     "iiiiiiiip\0" /* Parameter signature */
     "glCompressedTexSubImage2D\0"
     "glCompressedTexSubImage2DARB\0"
@@ -309,14 +312,21 @@ static const char CompressedTexSubImage2DARB_names[] =
 #endif
 
 #if defined(need_GL_NV_register_combiners)
-static const char CombinerOutputNV_names[] = 
+static const char CombinerOutputNV_names[] =
     "iiiiiiiiii\0" /* Parameter signature */
     "glCombinerOutputNV\0"
     "";
 #endif
 
+#if defined(need_GL_NV_vertex_program)
+static const char VertexAttribs3fvNV_names[] =
+    "iip\0" /* Parameter signature */
+    "glVertexAttribs3fvNV\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char Uniform2fARB_names[] = 
+static const char Uniform2fARB_names[] =
     "iff\0" /* Parameter signature */
     "glUniform2f\0"
     "glUniform2fARB\0"
@@ -324,7 +334,7 @@ static const char Uniform2fARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib1svARB_names[] = 
+static const char VertexAttrib1svARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib1sv\0"
     "glVertexAttrib1svARB\0"
@@ -332,14 +342,14 @@ static const char VertexAttrib1svARB_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttribs1dvNV_names[] = 
+static const char VertexAttribs1dvNV_names[] =
     "iip\0" /* Parameter signature */
     "glVertexAttribs1dvNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char Uniform2ivARB_names[] = 
+static const char Uniform2ivARB_names[] =
     "iip\0" /* Parameter signature */
     "glUniform2iv\0"
     "glUniform2ivARB\0"
@@ -347,28 +357,28 @@ static const char Uniform2ivARB_names[] =
 #endif
 
 #if defined(need_GL_HP_image_transform)
-static const char GetImageTransformParameterfvHP_names[] = 
+static const char GetImageTransformParameterfvHP_names[] =
     "iip\0" /* Parameter signature */
     "glGetImageTransformParameterfvHP\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_blend)
-static const char WeightubvARB_names[] = 
+static const char WeightubvARB_names[] =
     "ip\0" /* Parameter signature */
     "glWeightubvARB\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib1fvNV_names[] = 
+static const char VertexAttrib1fvNV_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib1fvNV\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_convolution)
-static const char CopyConvolutionFilter1D_names[] = 
+static const char CopyConvolutionFilter1D_names[] =
     "iiiii\0" /* Parameter signature */
     "glCopyConvolutionFilter1D\0"
     "glCopyConvolutionFilter1DEXT\0"
@@ -376,21 +386,28 @@ static const char CopyConvolutionFilter1D_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char ReplacementCodeuiNormal3fVertex3fSUN_names[] = 
+static const char ReplacementCodeuiNormal3fVertex3fSUN_names[] =
     "iffffff\0" /* Parameter signature */
     "glReplacementCodeuiNormal3fVertex3fSUN\0"
     "";
 #endif
 
+#if defined(need_GL_ARB_sync)
+static const char DeleteSync_names[] =
+    "i\0" /* Parameter signature */
+    "glDeleteSync\0"
+    "";
+#endif
+
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char FragmentMaterialfvSGIX_names[] = 
+static const char FragmentMaterialfvSGIX_names[] =
     "iip\0" /* Parameter signature */
     "glFragmentMaterialfvSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_blend_color)
-static const char BlendColor_names[] = 
+static const char BlendColor_names[] =
     "ffff\0" /* Parameter signature */
     "glBlendColor\0"
     "glBlendColorEXT\0"
@@ -398,57 +415,59 @@ static const char BlendColor_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char UniformMatrix4fvARB_names[] = 
+static const char UniformMatrix4fvARB_names[] =
     "iiip\0" /* Parameter signature */
     "glUniformMatrix4fv\0"
     "glUniformMatrix4fvARB\0"
     "";
 #endif
 
-#if defined(need_GL_APPLE_vertex_array_object)
-static const char DeleteVertexArraysAPPLE_names[] = 
+#if defined(need_GL_ARB_vertex_array_object) || defined(need_GL_APPLE_vertex_array_object)
+static const char DeleteVertexArraysAPPLE_names[] =
     "ip\0" /* Parameter signature */
+    "glDeleteVertexArrays\0"
     "glDeleteVertexArraysAPPLE\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_instruments)
-static const char ReadInstrumentsSGIX_names[] = 
+static const char ReadInstrumentsSGIX_names[] =
     "i\0" /* Parameter signature */
     "glReadInstrumentsSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_1)
-static const char UniformMatrix2x4fv_names[] = 
+static const char UniformMatrix2x4fv_names[] =
     "iiip\0" /* Parameter signature */
     "glUniformMatrix2x4fv\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char Color4ubVertex3fvSUN_names[] = 
+static const char Color4ubVertex3fvSUN_names[] =
     "pp\0" /* Parameter signature */
     "glColor4ubVertex3fvSUN\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_texture_array)
-static const char FramebufferTextureLayerEXT_names[] = 
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_texture_array)
+static const char FramebufferTextureLayerEXT_names[] =
     "iiiii\0" /* Parameter signature */
+    "glFramebufferTextureLayer\0"
     "glFramebufferTextureLayerEXT\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_list_priority)
-static const char GetListParameterivSGIX_names[] = 
+static const char GetListParameterfvSGIX_names[] =
     "iip\0" /* Parameter signature */
-    "glGetListParameterivSGIX\0"
+    "glGetListParameterfvSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4NusvARB_names[] = 
+static const char VertexAttrib4NusvARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4Nusv\0"
     "glVertexAttrib4NusvARB\0"
@@ -456,35 +475,35 @@ static const char VertexAttrib4NusvARB_names[] =
 #endif
 
 #if defined(need_GL_MESA_window_pos)
-static const char WindowPos4svMESA_names[] = 
+static const char WindowPos4svMESA_names[] =
     "p\0" /* Parameter signature */
     "glWindowPos4svMESA\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_shader_objects)
-static const char CreateProgramObjectARB_names[] = 
+static const char CreateProgramObjectARB_names[] =
     "\0" /* Parameter signature */
     "glCreateProgramObjectARB\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char FragmentLightModelivSGIX_names[] = 
+static const char FragmentLightModelivSGIX_names[] =
     "ip\0" /* Parameter signature */
     "glFragmentLightModelivSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_1)
-static const char UniformMatrix4x3fv_names[] = 
+static const char UniformMatrix4x3fv_names[] =
     "iiip\0" /* Parameter signature */
     "glUniformMatrix4x3fv\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_texture_object)
-static const char PrioritizeTextures_names[] = 
+static const char PrioritizeTextures_names[] =
     "ipp\0" /* Parameter signature */
     "glPrioritizeTextures\0"
     "glPrioritizeTexturesEXT\0"
@@ -492,28 +511,28 @@ static const char PrioritizeTextures_names[] =
 #endif
 
 #if defined(need_GL_SGIX_async)
-static const char AsyncMarkerSGIX_names[] = 
+static const char AsyncMarkerSGIX_names[] =
     "i\0" /* Parameter signature */
     "glAsyncMarkerSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_global_alpha)
-static const char GlobalAlphaFactorubSUN_names[] = 
+static const char GlobalAlphaFactorubSUN_names[] =
     "i\0" /* Parameter signature */
     "glGlobalAlphaFactorubSUN\0"
     "";
 #endif
 
 #if defined(need_GL_MESA_shader_debug)
-static const char ClearDebugLogMESA_names[] = 
+static const char ClearDebugLogMESA_names[] =
     "iii\0" /* Parameter signature */
     "glClearDebugLogMESA\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_histogram)
-static const char ResetHistogram_names[] = 
+static const char ResetHistogram_names[] =
     "i\0" /* Parameter signature */
     "glResetHistogram\0"
     "glResetHistogramEXT\0"
@@ -521,14 +540,14 @@ static const char ResetHistogram_names[] =
 #endif
 
 #if defined(need_GL_NV_fragment_program)
-static const char GetProgramNamedParameterfvNV_names[] = 
+static const char GetProgramNamedParameterfvNV_names[] =
     "iipp\0" /* Parameter signature */
     "glGetProgramNamedParameterfvNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_point_parameters) || defined(need_GL_EXT_point_parameters) || defined(need_GL_SGIS_point_parameters)
-static const char PointParameterfEXT_names[] = 
+static const char PointParameterfEXT_names[] =
     "if\0" /* Parameter signature */
     "glPointParameterf\0"
     "glPointParameterfARB\0"
@@ -538,35 +557,42 @@ static const char PointParameterfEXT_names[] =
 #endif
 
 #if defined(need_GL_SGIX_polynomial_ffd)
-static const char LoadIdentityDeformationMapSGIX_names[] = 
+static const char LoadIdentityDeformationMapSGIX_names[] =
     "i\0" /* Parameter signature */
     "glLoadIdentityDeformationMapSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_NV_fence)
-static const char GenFencesNV_names[] = 
+static const char GenFencesNV_names[] =
     "ip\0" /* Parameter signature */
     "glGenFencesNV\0"
     "";
 #endif
 
 #if defined(need_GL_HP_image_transform)
-static const char ImageTransformParameterfHP_names[] = 
+static const char ImageTransformParameterfHP_names[] =
     "iif\0" /* Parameter signature */
     "glImageTransformParameterfHP\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_matrix_palette)
-static const char MatrixIndexusvARB_names[] = 
+static const char MatrixIndexusvARB_names[] =
     "ip\0" /* Parameter signature */
     "glMatrixIndexusvARB\0"
     "";
 #endif
 
+#if defined(need_GL_ARB_draw_elements_base_vertex)
+static const char DrawElementsBaseVertex_names[] =
+    "iiipi\0" /* Parameter signature */
+    "glDrawElementsBaseVertex\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char DisableVertexAttribArrayARB_names[] = 
+static const char DisableVertexAttribArrayARB_names[] =
     "i\0" /* Parameter signature */
     "glDisableVertexAttribArray\0"
     "glDisableVertexAttribArrayARB\0"
@@ -574,21 +600,21 @@ static const char DisableVertexAttribArrayARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0)
-static const char StencilMaskSeparate_names[] = 
+static const char StencilMaskSeparate_names[] =
     "ii\0" /* Parameter signature */
     "glStencilMaskSeparate\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_program)
-static const char ProgramLocalParameter4dARB_names[] = 
+static const char ProgramLocalParameter4dARB_names[] =
     "iidddd\0" /* Parameter signature */
     "glProgramLocalParameter4dARB\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3) || defined(need_GL_ARB_texture_compression)
-static const char CompressedTexImage3DARB_names[] = 
+static const char CompressedTexImage3DARB_names[] =
     "iiiiiiiip\0" /* Parameter signature */
     "glCompressedTexImage3D\0"
     "glCompressedTexImage3DARB\0"
@@ -596,7 +622,7 @@ static const char CompressedTexImage3DARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_convolution)
-static const char GetConvolutionParameteriv_names[] = 
+static const char GetConvolutionParameteriv_names[] =
     "iip\0" /* Parameter signature */
     "glGetConvolutionParameteriv\0"
     "glGetConvolutionParameterivEXT\0"
@@ -604,7 +630,7 @@ static const char GetConvolutionParameteriv_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib1fARB_names[] = 
+static const char VertexAttrib1fARB_names[] =
     "if\0" /* Parameter signature */
     "glVertexAttrib1f\0"
     "glVertexAttrib1fARB\0"
@@ -612,14 +638,14 @@ static const char VertexAttrib1fARB_names[] =
 #endif
 
 #if defined(need_GL_NV_fence)
-static const char TestFenceNV_names[] = 
+static const char TestFenceNV_names[] =
     "i\0" /* Parameter signature */
     "glTestFenceNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord1fvARB_names[] = 
+static const char MultiTexCoord1fvARB_names[] =
     "ip\0" /* Parameter signature */
     "glMultiTexCoord1fv\0"
     "glMultiTexCoord1fvARB\0"
@@ -627,56 +653,56 @@ static const char MultiTexCoord1fvARB_names[] =
 #endif
 
 #if defined(need_GL_ATI_fragment_shader)
-static const char ColorFragmentOp2ATI_names[] = 
+static const char ColorFragmentOp2ATI_names[] =
     "iiiiiiiiii\0" /* Parameter signature */
     "glColorFragmentOp2ATI\0"
     "";
 #endif
 
 #if defined(need_GL_IBM_vertex_array_lists)
-static const char SecondaryColorPointerListIBM_names[] = 
+static const char SecondaryColorPointerListIBM_names[] =
     "iiipi\0" /* Parameter signature */
     "glSecondaryColorPointerListIBM\0"
     "";
 #endif
 
 #if defined(need_GL_SGIS_pixel_texture)
-static const char GetPixelTexGenParameterivSGIS_names[] = 
+static const char GetPixelTexGenParameterivSGIS_names[] =
     "ip\0" /* Parameter signature */
     "glGetPixelTexGenParameterivSGIS\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib4fNV_names[] = 
+static const char VertexAttrib4fNV_names[] =
     "iffff\0" /* Parameter signature */
     "glVertexAttrib4fNV\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_triangle_list)
-static const char ReplacementCodeubSUN_names[] = 
+static const char ReplacementCodeubSUN_names[] =
     "i\0" /* Parameter signature */
     "glReplacementCodeubSUN\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_async)
-static const char FinishAsyncSGIX_names[] = 
+static const char FinishAsyncSGIX_names[] =
     "p\0" /* Parameter signature */
     "glFinishAsyncSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_MESA_shader_debug)
-static const char GetDebugLogMESA_names[] = 
+static const char GetDebugLogMESA_names[] =
     "iiiipp\0" /* Parameter signature */
     "glGetDebugLogMESA\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_fog_coord)
-static const char FogCoorddEXT_names[] = 
+static const char FogCoorddEXT_names[] =
     "d\0" /* Parameter signature */
     "glFogCoordd\0"
     "glFogCoorddEXT\0"
@@ -684,14 +710,14 @@ static const char FogCoorddEXT_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char Color4ubVertex3fSUN_names[] = 
+static const char Color4ubVertex3fSUN_names[] =
     "iiiifff\0" /* Parameter signature */
     "glColor4ubVertex3fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_fog_coord)
-static const char FogCoordfEXT_names[] = 
+static const char FogCoordfEXT_names[] =
     "f\0" /* Parameter signature */
     "glFogCoordf\0"
     "glFogCoordfEXT\0"
@@ -699,35 +725,35 @@ static const char FogCoordfEXT_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char TexCoord2fVertex3fSUN_names[] = 
+static const char TexCoord2fVertex3fSUN_names[] =
     "fffff\0" /* Parameter signature */
     "glTexCoord2fVertex3fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_global_alpha)
-static const char GlobalAlphaFactoriSUN_names[] = 
+static const char GlobalAlphaFactoriSUN_names[] =
     "i\0" /* Parameter signature */
     "glGlobalAlphaFactoriSUN\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib2dNV_names[] = 
+static const char VertexAttrib2dNV_names[] =
     "idd\0" /* Parameter signature */
     "glVertexAttrib2dNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0)
-static const char GetProgramInfoLog_names[] = 
+static const char GetProgramInfoLog_names[] =
     "iipp\0" /* Parameter signature */
     "glGetProgramInfoLog\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4NbvARB_names[] = 
+static const char VertexAttrib4NbvARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4Nbv\0"
     "glVertexAttrib4NbvARB\0"
@@ -735,7 +761,7 @@ static const char VertexAttrib4NbvARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_shader)
-static const char GetActiveAttribARB_names[] = 
+static const char GetActiveAttribARB_names[] =
     "iiipppp\0" /* Parameter signature */
     "glGetActiveAttrib\0"
     "glGetActiveAttribARB\0"
@@ -743,77 +769,91 @@ static const char GetActiveAttribARB_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib4ubNV_names[] = 
+static const char VertexAttrib4ubNV_names[] =
     "iiiii\0" /* Parameter signature */
     "glVertexAttrib4ubNV\0"
     "";
 #endif
 
+#if defined(need_GL_APPLE_texture_range)
+static const char TextureRangeAPPLE_names[] =
+    "iip\0" /* Parameter signature */
+    "glTextureRangeAPPLE\0"
+    "";
+#endif
+
 #if defined(need_GL_SUN_vertex)
-static const char TexCoord2fColor4fNormal3fVertex3fSUN_names[] = 
+static const char TexCoord2fColor4fNormal3fVertex3fSUN_names[] =
     "ffffffffffff\0" /* Parameter signature */
     "glTexCoord2fColor4fNormal3fVertex3fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_NV_register_combiners)
-static const char CombinerParameterfvNV_names[] = 
+static const char CombinerParameterfvNV_names[] =
     "ip\0" /* Parameter signature */
     "glCombinerParameterfvNV\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttribs3dvNV_names[] = 
+static const char VertexAttribs3dvNV_names[] =
     "iip\0" /* Parameter signature */
     "glVertexAttribs3dvNV\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttribs4fvNV_names[] = 
+static const char VertexAttribs4fvNV_names[] =
     "iip\0" /* Parameter signature */
     "glVertexAttribs4fvNV\0"
     "";
 #endif
 
+#if defined(need_GL_NV_vertex_array_range)
+static const char VertexArrayRangeNV_names[] =
+    "ip\0" /* Parameter signature */
+    "glVertexArrayRangeNV\0"
+    "";
+#endif
+
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char FragmentLightiSGIX_names[] = 
+static const char FragmentLightiSGIX_names[] =
     "iii\0" /* Parameter signature */
     "glFragmentLightiSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_polygon_offset)
-static const char PolygonOffsetEXT_names[] = 
+static const char PolygonOffsetEXT_names[] =
     "ff\0" /* Parameter signature */
     "glPolygonOffsetEXT\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_async)
-static const char PollAsyncSGIX_names[] = 
+static const char PollAsyncSGIX_names[] =
     "p\0" /* Parameter signature */
     "glPollAsyncSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_ATI_fragment_shader)
-static const char DeleteFragmentShaderATI_names[] = 
+static const char DeleteFragmentShaderATI_names[] =
     "i\0" /* Parameter signature */
     "glDeleteFragmentShaderATI\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char TexCoord2fNormal3fVertex3fvSUN_names[] = 
+static const char TexCoord2fNormal3fVertex3fvSUN_names[] =
     "ppp\0" /* Parameter signature */
     "glTexCoord2fNormal3fVertex3fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3) || defined(need_GL_ARB_transpose_matrix)
-static const char MultTransposeMatrixdARB_names[] = 
+static const char MultTransposeMatrixdARB_names[] =
     "p\0" /* Parameter signature */
     "glMultTransposeMatrixd\0"
     "glMultTransposeMatrixdARB\0"
@@ -821,7 +861,7 @@ static const char MultTransposeMatrixdARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_window_pos) || defined(need_GL_MESA_window_pos)
-static const char WindowPos2svMESA_names[] = 
+static const char WindowPos2svMESA_names[] =
     "p\0" /* Parameter signature */
     "glWindowPos2sv\0"
     "glWindowPos2svARB\0"
@@ -830,7 +870,7 @@ static const char WindowPos2svMESA_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3) || defined(need_GL_ARB_texture_compression)
-static const char CompressedTexImage1DARB_names[] = 
+static const char CompressedTexImage1DARB_names[] =
     "iiiiiip\0" /* Parameter signature */
     "glCompressedTexImage1D\0"
     "glCompressedTexImage1DARB\0"
@@ -838,35 +878,35 @@ static const char CompressedTexImage1DARB_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib2sNV_names[] = 
+static const char VertexAttrib2sNV_names[] =
     "iii\0" /* Parameter signature */
     "glVertexAttrib2sNV\0"
     "";
 #endif
 
 #if defined(need_GL_IBM_vertex_array_lists)
-static const char NormalPointerListIBM_names[] = 
+static const char NormalPointerListIBM_names[] =
     "iipi\0" /* Parameter signature */
     "glNormalPointerListIBM\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_vertex_array)
-static const char IndexPointerEXT_names[] = 
+static const char IndexPointerEXT_names[] =
     "iiip\0" /* Parameter signature */
     "glIndexPointerEXT\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_vertex_array)
-static const char NormalPointerEXT_names[] = 
+static const char NormalPointerEXT_names[] =
     "iiip\0" /* Parameter signature */
     "glNormalPointerEXT\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord3dARB_names[] = 
+static const char MultiTexCoord3dARB_names[] =
     "iddd\0" /* Parameter signature */
     "glMultiTexCoord3d\0"
     "glMultiTexCoord3dARB\0"
@@ -874,7 +914,7 @@ static const char MultiTexCoord3dARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord2iARB_names[] = 
+static const char MultiTexCoord2iARB_names[] =
     "iii\0" /* Parameter signature */
     "glMultiTexCoord2i\0"
     "glMultiTexCoord2iARB\0"
@@ -882,14 +922,14 @@ static const char MultiTexCoord2iARB_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char ReplacementCodeuiTexCoord2fNormal3fVertex3fSUN_names[] = 
+static const char ReplacementCodeuiTexCoord2fNormal3fVertex3fSUN_names[] =
     "iffffffff\0" /* Parameter signature */
     "glReplacementCodeuiTexCoord2fNormal3fVertex3fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord2svARB_names[] = 
+static const char MultiTexCoord2svARB_names[] =
     "ip\0" /* Parameter signature */
     "glMultiTexCoord2sv\0"
     "glMultiTexCoord2svARB\0"
@@ -897,14 +937,14 @@ static const char MultiTexCoord2svARB_names[] =
 #endif
 
 #if defined(need_GL_SUN_triangle_list)
-static const char ReplacementCodeubvSUN_names[] = 
+static const char ReplacementCodeubvSUN_names[] =
     "p\0" /* Parameter signature */
     "glReplacementCodeubvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char Uniform3iARB_names[] = 
+static const char Uniform3iARB_names[] =
     "iiii\0" /* Parameter signature */
     "glUniform3i\0"
     "glUniform3iARB\0"
@@ -912,77 +952,85 @@ static const char Uniform3iARB_names[] =
 #endif
 
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char GetFragmentMaterialfvSGIX_names[] = 
+static const char GetFragmentMaterialfvSGIX_names[] =
     "iip\0" /* Parameter signature */
     "glGetFragmentMaterialfvSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0)
-static const char GetShaderInfoLog_names[] = 
+static const char GetShaderInfoLog_names[] =
     "iipp\0" /* Parameter signature */
     "glGetShaderInfoLog\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_blend)
-static const char WeightivARB_names[] = 
+static const char WeightivARB_names[] =
     "ip\0" /* Parameter signature */
     "glWeightivARB\0"
     "";
 #endif
 
+#if defined(need_GL_SGIX_instruments)
+static const char PollInstrumentsSGIX_names[] =
+    "p\0" /* Parameter signature */
+    "glPollInstrumentsSGIX\0"
+    "";
+#endif
+
 #if defined(need_GL_SUN_global_alpha)
-static const char GlobalAlphaFactordSUN_names[] = 
+static const char GlobalAlphaFactordSUN_names[] =
     "d\0" /* Parameter signature */
     "glGlobalAlphaFactordSUN\0"
     "";
 #endif
 
-#if defined(need_GL_NV_vertex_program)
-static const char VertexAttribs3fvNV_names[] = 
+#if defined(need_GL_NV_register_combiners)
+static const char GetFinalCombinerInputParameterfvNV_names[] =
     "iip\0" /* Parameter signature */
-    "glVertexAttribs3fvNV\0"
+    "glGetFinalCombinerInputParameterfvNV\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char GenerateMipmapEXT_names[] = 
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char GenerateMipmapEXT_names[] =
     "i\0" /* Parameter signature */
+    "glGenerateMipmap\0"
     "glGenerateMipmapEXT\0"
     "";
 #endif
 
 #if defined(need_GL_ATI_fragment_shader)
-static const char SetFragmentShaderConstantATI_names[] = 
+static const char SetFragmentShaderConstantATI_names[] =
     "ip\0" /* Parameter signature */
     "glSetFragmentShaderConstantATI\0"
     "";
 #endif
 
 #if defined(need_GL_NV_evaluators)
-static const char GetMapAttribParameterivNV_names[] = 
+static const char GetMapAttribParameterivNV_names[] =
     "iiip\0" /* Parameter signature */
     "glGetMapAttribParameterivNV\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_shader_objects)
-static const char CreateShaderObjectARB_names[] = 
+static const char CreateShaderObjectARB_names[] =
     "i\0" /* Parameter signature */
     "glCreateShaderObjectARB\0"
     "";
 #endif
 
 #if defined(need_GL_SGIS_sharpen_texture)
-static const char GetSharpenTexFuncSGIS_names[] = 
+static const char GetSharpenTexFuncSGIS_names[] =
     "ip\0" /* Parameter signature */
     "glGetSharpenTexFuncSGIS\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_vertex_buffer_object)
-static const char BufferDataARB_names[] = 
+static const char BufferDataARB_names[] =
     "iipi\0" /* Parameter signature */
     "glBufferData\0"
     "glBufferDataARB\0"
@@ -990,42 +1038,42 @@ static const char BufferDataARB_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_array_range)
-static const char FlushVertexArrayRangeNV_names[] = 
+static const char FlushVertexArrayRangeNV_names[] =
     "\0" /* Parameter signature */
     "glFlushVertexArrayRangeNV\0"
     "";
 #endif
 
 #if defined(need_GL_ATI_fragment_shader)
-static const char SampleMapATI_names[] = 
+static const char SampleMapATI_names[] =
     "iii\0" /* Parameter signature */
     "glSampleMapATI\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_vertex_array)
-static const char VertexPointerEXT_names[] = 
+static const char VertexPointerEXT_names[] =
     "iiiip\0" /* Parameter signature */
     "glVertexPointerEXT\0"
     "";
 #endif
 
 #if defined(need_GL_SGIS_texture_filter4)
-static const char GetTexFilterFuncSGIS_names[] = 
+static const char GetTexFilterFuncSGIS_names[] =
     "iip\0" /* Parameter signature */
     "glGetTexFilterFuncSGIS\0"
     "";
 #endif
 
 #if defined(need_GL_NV_register_combiners)
-static const char GetCombinerOutputParameterfvNV_names[] = 
+static const char GetCombinerOutputParameterfvNV_names[] =
     "iiip\0" /* Parameter signature */
     "glGetCombinerOutputParameterfvNV\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_subtexture)
-static const char TexSubImage1D_names[] = 
+static const char TexSubImage1D_names[] =
     "iiiiiip\0" /* Parameter signature */
     "glTexSubImage1D\0"
     "glTexSubImage1DEXT\0"
@@ -1033,36 +1081,43 @@ static const char TexSubImage1D_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib1sARB_names[] = 
+static const char VertexAttrib1sARB_names[] =
     "ii\0" /* Parameter signature */
     "glVertexAttrib1s\0"
     "glVertexAttrib1sARB\0"
     "";
 #endif
 
+#if defined(need_GL_ARB_sync)
+static const char FenceSync_names[] =
+    "ii\0" /* Parameter signature */
+    "glFenceSync\0"
+    "";
+#endif
+
 #if defined(need_GL_NV_register_combiners)
-static const char FinalCombinerInputNV_names[] = 
+static const char FinalCombinerInputNV_names[] =
     "iiii\0" /* Parameter signature */
     "glFinalCombinerInputNV\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_flush_raster)
-static const char FlushRasterSGIX_names[] = 
+static const char FlushRasterSGIX_names[] =
     "\0" /* Parameter signature */
     "glFlushRasterSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char ReplacementCodeuiTexCoord2fVertex3fSUN_names[] = 
+static const char ReplacementCodeuiTexCoord2fVertex3fSUN_names[] =
     "ifffff\0" /* Parameter signature */
     "glReplacementCodeuiTexCoord2fVertex3fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char Uniform1fARB_names[] = 
+static const char Uniform1fARB_names[] =
     "if\0" /* Parameter signature */
     "glUniform1f\0"
     "glUniform1fARB\0"
@@ -1070,22 +1125,15 @@ static const char Uniform1fARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_texture_object)
-static const char AreTexturesResident_names[] = 
+static const char AreTexturesResident_names[] =
     "ipp\0" /* Parameter signature */
     "glAreTexturesResident\0"
     "glAreTexturesResidentEXT\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char IsRenderbufferEXT_names[] = 
-    "i\0" /* Parameter signature */
-    "glIsRenderbufferEXT\0"
-    "";
-#endif
-
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ATI_separate_stencil)
-static const char StencilOpSeparate_names[] = 
+static const char StencilOpSeparate_names[] =
     "iiii\0" /* Parameter signature */
     "glStencilOpSeparate\0"
     "glStencilOpSeparateATI\0"
@@ -1093,7 +1141,7 @@ static const char StencilOpSeparate_names[] =
 #endif
 
 #if defined(need_GL_SGI_color_table)
-static const char ColorTableParameteriv_names[] = 
+static const char ColorTableParameteriv_names[] =
     "iip\0" /* Parameter signature */
     "glColorTableParameteriv\0"
     "glColorTableParameterivSGI\0"
@@ -1101,14 +1149,14 @@ static const char ColorTableParameteriv_names[] =
 #endif
 
 #if defined(need_GL_IBM_vertex_array_lists)
-static const char FogCoordPointerListIBM_names[] = 
+static const char FogCoordPointerListIBM_names[] =
     "iipi\0" /* Parameter signature */
     "glFogCoordPointerListIBM\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_window_pos) || defined(need_GL_MESA_window_pos)
-static const char WindowPos3dMESA_names[] = 
+static const char WindowPos3dMESA_names[] =
     "ddd\0" /* Parameter signature */
     "glWindowPos3d\0"
     "glWindowPos3dARB\0"
@@ -1117,7 +1165,7 @@ static const char WindowPos3dMESA_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_point_parameters) || defined(need_GL_EXT_point_parameters) || defined(need_GL_SGIS_point_parameters)
-static const char PointParameterfvEXT_names[] = 
+static const char PointParameterfvEXT_names[] =
     "ip\0" /* Parameter signature */
     "glPointParameterfv\0"
     "glPointParameterfvARB\0"
@@ -1127,7 +1175,7 @@ static const char PointParameterfvEXT_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_window_pos) || defined(need_GL_MESA_window_pos)
-static const char WindowPos2fvMESA_names[] = 
+static const char WindowPos2fvMESA_names[] =
     "p\0" /* Parameter signature */
     "glWindowPos2fv\0"
     "glWindowPos2fvARB\0"
@@ -1136,7 +1184,7 @@ static const char WindowPos2fvMESA_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColor3bvEXT_names[] = 
+static const char SecondaryColor3bvEXT_names[] =
     "p\0" /* Parameter signature */
     "glSecondaryColor3bv\0"
     "glSecondaryColor3bvEXT\0"
@@ -1144,63 +1192,64 @@ static const char SecondaryColor3bvEXT_names[] =
 #endif
 
 #if defined(need_GL_IBM_vertex_array_lists)
-static const char VertexPointerListIBM_names[] = 
+static const char VertexPointerListIBM_names[] =
     "iiipi\0" /* Parameter signature */
     "glVertexPointerListIBM\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_program)
-static const char GetProgramLocalParameterfvARB_names[] = 
+static const char GetProgramLocalParameterfvARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetProgramLocalParameterfvARB\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char FragmentMaterialfSGIX_names[] = 
+static const char FragmentMaterialfSGIX_names[] =
     "iif\0" /* Parameter signature */
     "glFragmentMaterialfSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char TexCoord2fNormal3fVertex3fSUN_names[] = 
+static const char TexCoord2fNormal3fVertex3fSUN_names[] =
     "ffffffff\0" /* Parameter signature */
     "glTexCoord2fNormal3fVertex3fSUN\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char RenderbufferStorageEXT_names[] = 
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char RenderbufferStorageEXT_names[] =
     "iiii\0" /* Parameter signature */
+    "glRenderbufferStorage\0"
     "glRenderbufferStorageEXT\0"
     "";
 #endif
 
 #if defined(need_GL_NV_fence)
-static const char IsFenceNV_names[] = 
+static const char IsFenceNV_names[] =
     "i\0" /* Parameter signature */
     "glIsFenceNV\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_shader_objects)
-static const char AttachObjectARB_names[] = 
+static const char AttachObjectARB_names[] =
     "ii\0" /* Parameter signature */
     "glAttachObjectARB\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char GetFragmentLightivSGIX_names[] = 
+static const char GetFragmentLightivSGIX_names[] =
     "iip\0" /* Parameter signature */
     "glGetFragmentLightivSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char UniformMatrix2fvARB_names[] = 
+static const char UniformMatrix2fvARB_names[] =
     "iiip\0" /* Parameter signature */
     "glUniformMatrix2fv\0"
     "glUniformMatrix2fvARB\0"
@@ -1208,7 +1257,7 @@ static const char UniformMatrix2fvARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord2fARB_names[] = 
+static const char MultiTexCoord2fARB_names[] =
     "iff\0" /* Parameter signature */
     "glMultiTexCoord2f\0"
     "glMultiTexCoord2fARB\0"
@@ -1216,7 +1265,7 @@ static const char MultiTexCoord2fARB_names[] =
 #endif
 
 #if defined(need_GL_SGI_color_table) || defined(need_GL_EXT_paletted_texture)
-static const char ColorTable_names[] = 
+static const char ColorTable_names[] =
     "iiiiip\0" /* Parameter signature */
     "glColorTable\0"
     "glColorTableSGI\0"
@@ -1225,14 +1274,14 @@ static const char ColorTable_names[] =
 #endif
 
 #if defined(need_GL_NV_evaluators)
-static const char MapControlPointsNV_names[] = 
+static const char MapControlPointsNV_names[] =
     "iiiiiiiip\0" /* Parameter signature */
     "glMapControlPointsNV\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_convolution)
-static const char ConvolutionFilter2D_names[] = 
+static const char ConvolutionFilter2D_names[] =
     "iiiiiip\0" /* Parameter signature */
     "glConvolutionFilter2D\0"
     "glConvolutionFilter2DEXT\0"
@@ -1240,14 +1289,14 @@ static const char ConvolutionFilter2D_names[] =
 #endif
 
 #if defined(need_GL_NV_evaluators)
-static const char MapParameterfvNV_names[] = 
+static const char MapParameterfvNV_names[] =
     "iip\0" /* Parameter signature */
     "glMapParameterfvNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib3dvARB_names[] = 
+static const char VertexAttrib3dvARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib3dv\0"
     "glVertexAttrib3dvARB\0"
@@ -1255,14 +1304,14 @@ static const char VertexAttrib3dvARB_names[] =
 #endif
 
 #if defined(need_GL_PGI_misc_hints)
-static const char HintPGI_names[] = 
+static const char HintPGI_names[] =
     "ii\0" /* Parameter signature */
     "glHintPGI\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_convolution)
-static const char ConvolutionParameteriv_names[] = 
+static const char ConvolutionParameteriv_names[] =
     "iip\0" /* Parameter signature */
     "glConvolutionParameteriv\0"
     "glConvolutionParameterivEXT\0"
@@ -1270,28 +1319,28 @@ static const char ConvolutionParameteriv_names[] =
 #endif
 
 #if defined(need_GL_EXT_cull_vertex)
-static const char CullParameterdvEXT_names[] = 
+static const char CullParameterdvEXT_names[] =
     "ip\0" /* Parameter signature */
     "glCullParameterdvEXT\0"
     "";
 #endif
 
 #if defined(need_GL_NV_fragment_program)
-static const char ProgramNamedParameter4fNV_names[] = 
+static const char ProgramNamedParameter4fNV_names[] =
     "iipffff\0" /* Parameter signature */
     "glProgramNamedParameter4fNV\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char Color3fVertex3fSUN_names[] = 
+static const char Color3fVertex3fSUN_names[] =
     "ffffff\0" /* Parameter signature */
     "glColor3fVertex3fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_program) || defined(need_GL_NV_vertex_program)
-static const char ProgramEnvParameter4fvARB_names[] = 
+static const char ProgramEnvParameter4fvARB_names[] =
     "iip\0" /* Parameter signature */
     "glProgramEnvParameter4fvARB\0"
     "glProgramParameter4fvNV\0"
@@ -1299,14 +1348,14 @@ static const char ProgramEnvParameter4fvARB_names[] =
 #endif
 
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char FragmentLightModeliSGIX_names[] = 
+static const char FragmentLightModeliSGIX_names[] =
     "ii\0" /* Parameter signature */
     "glFragmentLightModeliSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_convolution)
-static const char ConvolutionParameterfv_names[] = 
+static const char ConvolutionParameterfv_names[] =
     "iip\0" /* Parameter signature */
     "glConvolutionParameterfv\0"
     "glConvolutionParameterfvEXT\0"
@@ -1314,35 +1363,42 @@ static const char ConvolutionParameterfv_names[] =
 #endif
 
 #if defined(need_GL_3DFX_tbuffer)
-static const char TbufferMask3DFX_names[] = 
+static const char TbufferMask3DFX_names[] =
     "i\0" /* Parameter signature */
     "glTbufferMask3DFX\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char LoadProgramNV_names[] = 
+static const char LoadProgramNV_names[] =
     "iiip\0" /* Parameter signature */
     "glLoadProgramNV\0"
     "";
 #endif
 
+#if defined(need_GL_ARB_sync)
+static const char WaitSync_names[] =
+    "iii\0" /* Parameter signature */
+    "glWaitSync\0"
+    "";
+#endif
+
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib4fvNV_names[] = 
+static const char VertexAttrib4fvNV_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4fvNV\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_shader_objects)
-static const char GetAttachedObjectsARB_names[] = 
+static const char GetAttachedObjectsARB_names[] =
     "iipp\0" /* Parameter signature */
     "glGetAttachedObjectsARB\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char Uniform3fvARB_names[] = 
+static const char Uniform3fvARB_names[] =
     "iip\0" /* Parameter signature */
     "glUniform3fv\0"
     "glUniform3fvARB\0"
@@ -1350,7 +1406,7 @@ static const char Uniform3fvARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_draw_range_elements)
-static const char DrawRangeElements_names[] = 
+static const char DrawRangeElements_names[] =
     "iiiiip\0" /* Parameter signature */
     "glDrawRangeElements\0"
     "glDrawRangeElementsEXT\0"
@@ -1358,35 +1414,36 @@ static const char DrawRangeElements_names[] =
 #endif
 
 #if defined(need_GL_SGIX_sprite)
-static const char SpriteParameterfvSGIX_names[] = 
+static const char SpriteParameterfvSGIX_names[] =
     "ip\0" /* Parameter signature */
     "glSpriteParameterfvSGIX\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char CheckFramebufferStatusEXT_names[] = 
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char CheckFramebufferStatusEXT_names[] =
     "i\0" /* Parameter signature */
+    "glCheckFramebufferStatus\0"
     "glCheckFramebufferStatusEXT\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_global_alpha)
-static const char GlobalAlphaFactoruiSUN_names[] = 
+static const char GlobalAlphaFactoruiSUN_names[] =
     "i\0" /* Parameter signature */
     "glGlobalAlphaFactoruiSUN\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_shader_objects)
-static const char GetHandleARB_names[] = 
+static const char GetHandleARB_names[] =
     "i\0" /* Parameter signature */
     "glGetHandleARB\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char GetVertexAttribivARB_names[] = 
+static const char GetVertexAttribivARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetVertexAttribiv\0"
     "glGetVertexAttribivARB\0"
@@ -1394,21 +1451,21 @@ static const char GetVertexAttribivARB_names[] =
 #endif
 
 #if defined(need_GL_NV_register_combiners)
-static const char GetCombinerInputParameterfvNV_names[] = 
+static const char GetCombinerInputParameterfvNV_names[] =
     "iiiip\0" /* Parameter signature */
     "glGetCombinerInputParameterfvNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0)
-static const char CreateProgram_names[] = 
+static const char CreateProgram_names[] =
     "\0" /* Parameter signature */
     "glCreateProgram\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3) || defined(need_GL_ARB_transpose_matrix)
-static const char LoadTransposeMatrixdARB_names[] = 
+static const char LoadTransposeMatrixdARB_names[] =
     "p\0" /* Parameter signature */
     "glLoadTransposeMatrixd\0"
     "glLoadTransposeMatrixdARB\0"
@@ -1416,7 +1473,7 @@ static const char LoadTransposeMatrixdARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_histogram)
-static const char GetMinmax_names[] = 
+static const char GetMinmax_names[] =
     "iiiip\0" /* Parameter signature */
     "glGetMinmax\0"
     "glGetMinmaxEXT\0"
@@ -1424,14 +1481,14 @@ static const char GetMinmax_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0)
-static const char StencilFuncSeparate_names[] = 
+static const char StencilFuncSeparate_names[] =
     "iiii\0" /* Parameter signature */
     "glStencilFuncSeparate\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColor3sEXT_names[] = 
+static const char SecondaryColor3sEXT_names[] =
     "iii\0" /* Parameter signature */
     "glSecondaryColor3s\0"
     "glSecondaryColor3sEXT\0"
@@ -1439,28 +1496,28 @@ static const char SecondaryColor3sEXT_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char Color3fVertex3fvSUN_names[] = 
+static const char Color3fVertex3fvSUN_names[] =
     "pp\0" /* Parameter signature */
     "glColor3fVertex3fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_global_alpha)
-static const char GlobalAlphaFactorbSUN_names[] = 
+static const char GlobalAlphaFactorbSUN_names[] =
     "i\0" /* Parameter signature */
     "glGlobalAlphaFactorbSUN\0"
     "";
 #endif
 
 #if defined(need_GL_HP_image_transform)
-static const char ImageTransformParameterfvHP_names[] = 
+static const char ImageTransformParameterfvHP_names[] =
     "iip\0" /* Parameter signature */
     "glImageTransformParameterfvHP\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4ivARB_names[] = 
+static const char VertexAttrib4ivARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4iv\0"
     "glVertexAttrib4ivARB\0"
@@ -1468,28 +1525,28 @@ static const char VertexAttrib4ivARB_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib3fNV_names[] = 
+static const char VertexAttrib3fNV_names[] =
     "ifff\0" /* Parameter signature */
     "glVertexAttrib3fNV\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttribs2dvNV_names[] = 
+static const char VertexAttribs2dvNV_names[] =
     "iip\0" /* Parameter signature */
     "glVertexAttribs2dvNV\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_timer_query)
-static const char GetQueryObjectui64vEXT_names[] = 
+static const char GetQueryObjectui64vEXT_names[] =
     "iip\0" /* Parameter signature */
     "glGetQueryObjectui64vEXT\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord3fvARB_names[] = 
+static const char MultiTexCoord3fvARB_names[] =
     "ip\0" /* Parameter signature */
     "glMultiTexCoord3fv\0"
     "glMultiTexCoord3fvARB\0"
@@ -1497,7 +1554,7 @@ static const char MultiTexCoord3fvARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColor3dEXT_names[] = 
+static const char SecondaryColor3dEXT_names[] =
     "ddd\0" /* Parameter signature */
     "glSecondaryColor3d\0"
     "glSecondaryColor3dEXT\0"
@@ -1505,42 +1562,42 @@ static const char SecondaryColor3dEXT_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char GetProgramParameterfvNV_names[] = 
+static const char GetProgramParameterfvNV_names[] =
     "iiip\0" /* Parameter signature */
     "glGetProgramParameterfvNV\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char TangentPointerEXT_names[] = 
+static const char TangentPointerEXT_names[] =
     "iip\0" /* Parameter signature */
     "glTangentPointerEXT\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char Color4fNormal3fVertex3fvSUN_names[] = 
+static const char Color4fNormal3fVertex3fvSUN_names[] =
     "ppp\0" /* Parameter signature */
     "glColor4fNormal3fVertex3fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_instruments)
-static const char GetInstrumentsSGIX_names[] = 
+static const char GetInstrumentsSGIX_names[] =
     "\0" /* Parameter signature */
     "glGetInstrumentsSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_NV_evaluators)
-static const char EvalMapsNV_names[] = 
+static const char EvalMapsNV_names[] =
     "ii\0" /* Parameter signature */
     "glEvalMapsNV\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_subtexture)
-static const char TexSubImage2D_names[] = 
+static const char TexSubImage2D_names[] =
     "iiiiiiiip\0" /* Parameter signature */
     "glTexSubImage2D\0"
     "glTexSubImage2DEXT\0"
@@ -1548,28 +1605,28 @@ static const char TexSubImage2D_names[] =
 #endif
 
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char FragmentLightivSGIX_names[] = 
+static const char FragmentLightivSGIX_names[] =
     "iip\0" /* Parameter signature */
     "glFragmentLightivSGIX\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char DeleteRenderbuffersEXT_names[] = 
-    "ip\0" /* Parameter signature */
-    "glDeleteRenderbuffersEXT\0"
+#if defined(need_GL_APPLE_texture_range)
+static const char GetTexParameterPointervAPPLE_names[] =
+    "iip\0" /* Parameter signature */
+    "glGetTexParameterPointervAPPLE\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_pixel_transform)
-static const char PixelTransformParameterfvEXT_names[] = 
+static const char PixelTransformParameterfvEXT_names[] =
     "iip\0" /* Parameter signature */
     "glPixelTransformParameterfvEXT\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4bvARB_names[] = 
+static const char VertexAttrib4bvARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4bv\0"
     "glVertexAttrib4bvARB\0"
@@ -1577,14 +1634,14 @@ static const char VertexAttrib4bvARB_names[] =
 #endif
 
 #if defined(need_GL_ATI_fragment_shader)
-static const char AlphaFragmentOp2ATI_names[] = 
+static const char AlphaFragmentOp2ATI_names[] =
     "iiiiiiiii\0" /* Parameter signature */
     "glAlphaFragmentOp2ATI\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord4sARB_names[] = 
+static const char MultiTexCoord4sARB_names[] =
     "iiiii\0" /* Parameter signature */
     "glMultiTexCoord4s\0"
     "glMultiTexCoord4sARB\0"
@@ -1592,28 +1649,28 @@ static const char MultiTexCoord4sARB_names[] =
 #endif
 
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char GetFragmentMaterialivSGIX_names[] = 
+static const char GetFragmentMaterialivSGIX_names[] =
     "iip\0" /* Parameter signature */
     "glGetFragmentMaterialivSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_MESA_window_pos)
-static const char WindowPos4dMESA_names[] = 
+static const char WindowPos4dMESA_names[] =
     "dddd\0" /* Parameter signature */
     "glWindowPos4dMESA\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_blend)
-static const char WeightPointerARB_names[] = 
+static const char WeightPointerARB_names[] =
     "iiip\0" /* Parameter signature */
     "glWeightPointerARB\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_window_pos) || defined(need_GL_MESA_window_pos)
-static const char WindowPos2dMESA_names[] = 
+static const char WindowPos2dMESA_names[] =
     "dd\0" /* Parameter signature */
     "glWindowPos2d\0"
     "glWindowPos2dARB\0"
@@ -1621,15 +1678,16 @@ static const char WindowPos2dMESA_names[] =
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char FramebufferTexture3DEXT_names[] = 
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char FramebufferTexture3DEXT_names[] =
     "iiiiii\0" /* Parameter signature */
+    "glFramebufferTexture3D\0"
     "glFramebufferTexture3DEXT\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_blend_minmax)
-static const char BlendEquation_names[] = 
+static const char BlendEquation_names[] =
     "i\0" /* Parameter signature */
     "glBlendEquation\0"
     "glBlendEquationEXT\0"
@@ -1637,14 +1695,14 @@ static const char BlendEquation_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib3dNV_names[] = 
+static const char VertexAttrib3dNV_names[] =
     "iddd\0" /* Parameter signature */
     "glVertexAttrib3dNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib3dARB_names[] = 
+static const char VertexAttrib3dARB_names[] =
     "iddd\0" /* Parameter signature */
     "glVertexAttrib3d\0"
     "glVertexAttrib3dARB\0"
@@ -1652,14 +1710,14 @@ static const char VertexAttrib3dARB_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char ReplacementCodeuiTexCoord2fColor4fNormal3fVertex3fvSUN_names[] = 
+static const char ReplacementCodeuiTexCoord2fColor4fNormal3fVertex3fvSUN_names[] =
     "ppppp\0" /* Parameter signature */
     "glReplacementCodeuiTexCoord2fColor4fNormal3fVertex3fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4fARB_names[] = 
+static const char VertexAttrib4fARB_names[] =
     "iffff\0" /* Parameter signature */
     "glVertexAttrib4f\0"
     "glVertexAttrib4fARB\0"
@@ -1667,21 +1725,22 @@ static const char VertexAttrib4fARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_index_func)
-static const char IndexFuncEXT_names[] = 
+static const char IndexFuncEXT_names[] =
     "if\0" /* Parameter signature */
     "glIndexFuncEXT\0"
     "";
 #endif
 
-#if defined(need_GL_SGIX_list_priority)
-static const char GetListParameterfvSGIX_names[] = 
-    "iip\0" /* Parameter signature */
-    "glGetListParameterfvSGIX\0"
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char FramebufferTexture2DEXT_names[] =
+    "iiiii\0" /* Parameter signature */
+    "glFramebufferTexture2D\0"
+    "glFramebufferTexture2DEXT\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord2dvARB_names[] = 
+static const char MultiTexCoord2dvARB_names[] =
     "ip\0" /* Parameter signature */
     "glMultiTexCoord2dv\0"
     "glMultiTexCoord2dvARB\0"
@@ -1689,21 +1748,21 @@ static const char MultiTexCoord2dvARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_cull_vertex)
-static const char CullParameterfvEXT_names[] = 
+static const char CullParameterfvEXT_names[] =
     "ip\0" /* Parameter signature */
     "glCullParameterfvEXT\0"
     "";
 #endif
 
 #if defined(need_GL_NV_fragment_program)
-static const char ProgramNamedParameter4fvNV_names[] = 
+static const char ProgramNamedParameter4fvNV_names[] =
     "iipp\0" /* Parameter signature */
     "glProgramNamedParameter4fvNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColorPointerEXT_names[] = 
+static const char SecondaryColorPointerEXT_names[] =
     "iiip\0" /* Parameter signature */
     "glSecondaryColorPointer\0"
     "glSecondaryColorPointerEXT\0"
@@ -1711,7 +1770,7 @@ static const char SecondaryColorPointerEXT_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4fvARB_names[] = 
+static const char VertexAttrib4fvARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4fv\0"
     "glVertexAttrib4fvARB\0"
@@ -1719,14 +1778,14 @@ static const char VertexAttrib4fvARB_names[] =
 #endif
 
 #if defined(need_GL_IBM_vertex_array_lists)
-static const char ColorPointerListIBM_names[] = 
+static const char ColorPointerListIBM_names[] =
     "iiipi\0" /* Parameter signature */
     "glColorPointerListIBM\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char GetActiveUniformARB_names[] = 
+static const char GetActiveUniformARB_names[] =
     "iiipppp\0" /* Parameter signature */
     "glGetActiveUniform\0"
     "glGetActiveUniformARB\0"
@@ -1734,14 +1793,14 @@ static const char GetActiveUniformARB_names[] =
 #endif
 
 #if defined(need_GL_HP_image_transform)
-static const char ImageTransformParameteriHP_names[] = 
+static const char ImageTransformParameteriHP_names[] =
     "iii\0" /* Parameter signature */
     "glImageTransformParameteriHP\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord1svARB_names[] = 
+static const char MultiTexCoord1svARB_names[] =
     "ip\0" /* Parameter signature */
     "glMultiTexCoord1sv\0"
     "glMultiTexCoord1svARB\0"
@@ -1749,7 +1808,7 @@ static const char MultiTexCoord1svARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_occlusion_query)
-static const char EndQueryARB_names[] = 
+static const char EndQueryARB_names[] =
     "i\0" /* Parameter signature */
     "glEndQuery\0"
     "glEndQueryARB\0"
@@ -1757,42 +1816,35 @@ static const char EndQueryARB_names[] =
 #endif
 
 #if defined(need_GL_NV_fence)
-static const char DeleteFencesNV_names[] = 
+static const char DeleteFencesNV_names[] =
     "ip\0" /* Parameter signature */
     "glDeleteFencesNV\0"
     "";
 #endif
 
-#if defined(need_GL_SGIX_polynomial_ffd)
-static const char DeformationMap3dSGIX_names[] = 
-    "iddiiddiiddiip\0" /* Parameter signature */
-    "glDeformationMap3dSGIX\0"
-    "";
-#endif
-
 #if defined(need_GL_VERSION_2_0)
-static const char IsShader_names[] = 
+static const char IsShader_names[] =
     "i\0" /* Parameter signature */
     "glIsShader\0"
     "";
 #endif
 
 #if defined(need_GL_HP_image_transform)
-static const char GetImageTransformParameterivHP_names[] = 
+static const char GetImageTransformParameterivHP_names[] =
     "iip\0" /* Parameter signature */
     "glGetImageTransformParameterivHP\0"
     "";
 #endif
 
 #if defined(need_GL_MESA_window_pos)
-static const char WindowPos4ivMESA_names[] = 
+static const char WindowPos4ivMESA_names[] =
     "p\0" /* Parameter signature */
     "glWindowPos4ivMESA\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord3svARB_names[] = 
+static const char MultiTexCoord3svARB_names[] =
     "ip\0" /* Parameter signature */
     "glMultiTexCoord3sv\0"
     "glMultiTexCoord3svARB\0"
@@ -1800,7 +1852,7 @@ static const char MultiTexCoord3svARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord4iARB_names[] = 
+static const char MultiTexCoord4iARB_names[] =
     "iiiii\0" /* Parameter signature */
     "glMultiTexCoord4i\0"
     "glMultiTexCoord4iARB\0"
@@ -1808,21 +1860,21 @@ static const char MultiTexCoord4iARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Binormal3ivEXT_names[] = 
+static const char Binormal3ivEXT_names[] =
     "p\0" /* Parameter signature */
     "glBinormal3ivEXT\0"
     "";
 #endif
 
 #if defined(need_GL_MESA_resize_buffers)
-static const char ResizeBuffersMESA_names[] = 
+static const char ResizeBuffersMESA_names[] =
     "\0" /* Parameter signature */
     "glResizeBuffersMESA\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char GetUniformivARB_names[] = 
+static const char GetUniformivARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetUniformiv\0"
     "glGetUniformivARB\0"
@@ -1830,28 +1882,28 @@ static const char GetUniformivARB_names[] =
 #endif
 
 #if defined(need_GL_SGIS_pixel_texture)
-static const char PixelTexGenParameteriSGIS_names[] = 
+static const char PixelTexGenParameteriSGIS_names[] =
     "ii\0" /* Parameter signature */
     "glPixelTexGenParameteriSGIS\0"
     "";
 #endif
 
 #if defined(need_GL_INTEL_parallel_arrays)
-static const char VertexPointervINTEL_names[] = 
+static const char VertexPointervINTEL_names[] =
     "iip\0" /* Parameter signature */
     "glVertexPointervINTEL\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char ReplacementCodeuiColor4fNormal3fVertex3fvSUN_names[] = 
+static const char ReplacementCodeuiColor4fNormal3fVertex3fvSUN_names[] =
     "pppp\0" /* Parameter signature */
     "glReplacementCodeuiColor4fNormal3fVertex3fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColor3uiEXT_names[] = 
+static const char SecondaryColor3uiEXT_names[] =
     "iii\0" /* Parameter signature */
     "glSecondaryColor3ui\0"
     "glSecondaryColor3uiEXT\0"
@@ -1859,14 +1911,14 @@ static const char SecondaryColor3uiEXT_names[] =
 #endif
 
 #if defined(need_GL_SGIX_instruments)
-static const char StartInstrumentsSGIX_names[] = 
+static const char StartInstrumentsSGIX_names[] =
     "\0" /* Parameter signature */
     "glStartInstrumentsSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColor3usvEXT_names[] = 
+static const char SecondaryColor3usvEXT_names[] =
     "p\0" /* Parameter signature */
     "glSecondaryColor3usv\0"
     "glSecondaryColor3usvEXT\0"
@@ -1874,42 +1926,49 @@ static const char SecondaryColor3usvEXT_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib2fvNV_names[] = 
+static const char VertexAttrib2fvNV_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib2fvNV\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_program)
-static const char ProgramLocalParameter4dvARB_names[] = 
+static const char ProgramLocalParameter4dvARB_names[] =
     "iip\0" /* Parameter signature */
     "glProgramLocalParameter4dvARB\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_matrix_palette)
-static const char MatrixIndexuivARB_names[] = 
+static const char MatrixIndexuivARB_names[] =
     "ip\0" /* Parameter signature */
     "glMatrixIndexuivARB\0"
     "";
 #endif
 
+#if defined(need_GL_ARB_framebuffer_object)
+static const char RenderbufferStorageMultisample_names[] =
+    "iiiii\0" /* Parameter signature */
+    "glRenderbufferStorageMultisample\0"
+    "";
+#endif
+
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Tangent3sEXT_names[] = 
+static const char Tangent3sEXT_names[] =
     "iii\0" /* Parameter signature */
     "glTangent3sEXT\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_global_alpha)
-static const char GlobalAlphaFactorfSUN_names[] = 
+static const char GlobalAlphaFactorfSUN_names[] =
     "f\0" /* Parameter signature */
     "glGlobalAlphaFactorfSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord3iARB_names[] = 
+static const char MultiTexCoord3iARB_names[] =
     "iiii\0" /* Parameter signature */
     "glMultiTexCoord3i\0"
     "glMultiTexCoord3iARB\0"
@@ -1917,49 +1976,50 @@ static const char MultiTexCoord3iARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0)
-static const char IsProgram_names[] = 
+static const char IsProgram_names[] =
     "i\0" /* Parameter signature */
     "glIsProgram\0"
     "";
 #endif
 
 #if defined(need_GL_IBM_vertex_array_lists)
-static const char TexCoordPointerListIBM_names[] = 
+static const char TexCoordPointerListIBM_names[] =
     "iiipi\0" /* Parameter signature */
     "glTexCoordPointerListIBM\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_global_alpha)
-static const char GlobalAlphaFactorusSUN_names[] = 
+static const char GlobalAlphaFactorusSUN_names[] =
     "i\0" /* Parameter signature */
     "glGlobalAlphaFactorusSUN\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib2dvNV_names[] = 
+static const char VertexAttrib2dvNV_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib2dvNV\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char FramebufferRenderbufferEXT_names[] = 
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char FramebufferRenderbufferEXT_names[] =
     "iiii\0" /* Parameter signature */
+    "glFramebufferRenderbuffer\0"
     "glFramebufferRenderbufferEXT\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib1dvNV_names[] = 
+static const char VertexAttrib1dvNV_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib1dvNV\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_texture_object)
-static const char GenTextures_names[] = 
+static const char GenTextures_names[] =
     "ip\0" /* Parameter signature */
     "glGenTextures\0"
     "glGenTexturesEXT\0"
@@ -1967,63 +2027,71 @@ static const char GenTextures_names[] =
 #endif
 
 #if defined(need_GL_NV_fence)
-static const char SetFenceNV_names[] = 
+static const char SetFenceNV_names[] =
     "ii\0" /* Parameter signature */
     "glSetFenceNV\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char FramebufferTexture1DEXT_names[] = 
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char FramebufferTexture1DEXT_names[] =
     "iiiii\0" /* Parameter signature */
+    "glFramebufferTexture1D\0"
     "glFramebufferTexture1DEXT\0"
     "";
 #endif
 
 #if defined(need_GL_NV_register_combiners)
-static const char GetCombinerOutputParameterivNV_names[] = 
+static const char GetCombinerOutputParameterivNV_names[] =
     "iiip\0" /* Parameter signature */
     "glGetCombinerOutputParameterivNV\0"
     "";
 #endif
 
+#if defined(need_GL_IBM_multimode_draw_arrays)
+static const char MultiModeDrawArraysIBM_names[] =
+    "pppii\0" /* Parameter signature */
+    "glMultiModeDrawArraysIBM\0"
+    "";
+#endif
+
 #if defined(need_GL_SGIS_pixel_texture)
-static const char PixelTexGenParameterivSGIS_names[] = 
+static const char PixelTexGenParameterivSGIS_names[] =
     "ip\0" /* Parameter signature */
     "glPixelTexGenParameterivSGIS\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_texture_perturb_normal)
-static const char TextureNormalEXT_names[] = 
+static const char TextureNormalEXT_names[] =
     "i\0" /* Parameter signature */
     "glTextureNormalEXT\0"
     "";
 #endif
 
 #if defined(need_GL_IBM_vertex_array_lists)
-static const char IndexPointerListIBM_names[] = 
+static const char IndexPointerListIBM_names[] =
     "iipi\0" /* Parameter signature */
     "glIndexPointerListIBM\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_blend)
-static const char WeightfvARB_names[] = 
+static const char WeightfvARB_names[] =
     "ip\0" /* Parameter signature */
     "glWeightfvARB\0"
     "";
 #endif
 
 #if defined(need_GL_MESA_window_pos)
-static const char WindowPos4fMESA_names[] = 
+static const char WindowPos4fMESA_names[] =
     "ffff\0" /* Parameter signature */
     "glWindowPos4fMESA\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_window_pos) || defined(need_GL_MESA_window_pos)
-static const char WindowPos3dvMESA_names[] = 
+static const char WindowPos3dvMESA_names[] =
     "p\0" /* Parameter signature */
     "glWindowPos3dv\0"
     "glWindowPos3dvARB\0"
@@ -2032,14 +2100,14 @@ static const char WindowPos3dvMESA_names[] =
 #endif
 
 #if defined(need_GL_EXT_timer_query)
-static const char GetQueryObjecti64vEXT_names[] = 
+static const char GetQueryObjecti64vEXT_names[] =
     "iip\0" /* Parameter signature */
     "glGetQueryObjecti64vEXT\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord1dARB_names[] = 
+static const char MultiTexCoord1dARB_names[] =
     "id\0" /* Parameter signature */
     "glMultiTexCoord1d\0"
     "glMultiTexCoord1dARB\0"
@@ -2047,7 +2115,7 @@ static const char MultiTexCoord1dARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_NV_point_sprite)
-static const char PointParameterivNV_names[] = 
+static const char PointParameterivNV_names[] =
     "ip\0" /* Parameter signature */
     "glPointParameteriv\0"
     "glPointParameterivNV\0"
@@ -2055,15 +2123,22 @@ static const char PointParameterivNV_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char Uniform2fvARB_names[] = 
+static const char Uniform2fvARB_names[] =
     "iip\0" /* Parameter signature */
     "glUniform2fv\0"
     "glUniform2fvARB\0"
     "";
 #endif
 
+#if defined(need_GL_APPLE_flush_buffer_range)
+static const char BufferParameteriAPPLE_names[] =
+    "iii\0" /* Parameter signature */
+    "glBufferParameteriAPPLE\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord3dvARB_names[] = 
+static const char MultiTexCoord3dvARB_names[] =
     "ip\0" /* Parameter signature */
     "glMultiTexCoord3dv\0"
     "glMultiTexCoord3dvARB\0"
@@ -2071,56 +2146,64 @@ static const char MultiTexCoord3dvARB_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char ReplacementCodeuiTexCoord2fNormal3fVertex3fvSUN_names[] = 
+static const char ReplacementCodeuiTexCoord2fNormal3fVertex3fvSUN_names[] =
     "pppp\0" /* Parameter signature */
     "glReplacementCodeuiTexCoord2fNormal3fVertex3fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_shader_objects)
-static const char DeleteObjectARB_names[] = 
+static const char DeleteObjectARB_names[] =
     "i\0" /* Parameter signature */
     "glDeleteObjectARB\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_matrix_palette)
-static const char MatrixIndexPointerARB_names[] = 
+static const char MatrixIndexPointerARB_names[] =
     "iiip\0" /* Parameter signature */
     "glMatrixIndexPointerARB\0"
     "";
 #endif
 
 #if defined(need_GL_NV_fragment_program)
-static const char ProgramNamedParameter4dvNV_names[] = 
+static const char ProgramNamedParameter4dvNV_names[] =
     "iipp\0" /* Parameter signature */
     "glProgramNamedParameter4dvNV\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Tangent3fvEXT_names[] = 
+static const char Tangent3fvEXT_names[] =
     "p\0" /* Parameter signature */
     "glTangent3fvEXT\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char BindFramebufferEXT_names[] = 
+#if defined(need_GL_ARB_vertex_array_object)
+static const char GenVertexArrays_names[] =
+    "ip\0" /* Parameter signature */
+    "glGenVertexArrays\0"
+    "";
+#endif
+
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char BindFramebufferEXT_names[] =
     "ii\0" /* Parameter signature */
+    "glBindFramebuffer\0"
     "glBindFramebufferEXT\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_reference_plane)
-static const char ReferencePlaneSGIX_names[] = 
+static const char ReferencePlaneSGIX_names[] =
     "p\0" /* Parameter signature */
     "glReferencePlaneSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char ValidateProgramARB_names[] = 
+static const char ValidateProgramARB_names[] =
     "i\0" /* Parameter signature */
     "glValidateProgram\0"
     "glValidateProgramARB\0"
@@ -2128,21 +2211,21 @@ static const char ValidateProgramARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_compiled_vertex_array)
-static const char UnlockArraysEXT_names[] = 
+static const char UnlockArraysEXT_names[] =
     "\0" /* Parameter signature */
     "glUnlockArraysEXT\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char TexCoord2fColor3fVertex3fSUN_names[] = 
+static const char TexCoord2fColor3fVertex3fSUN_names[] =
     "ffffffff\0" /* Parameter signature */
     "glTexCoord2fColor3fVertex3fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_window_pos) || defined(need_GL_MESA_window_pos)
-static const char WindowPos3fvMESA_names[] = 
+static const char WindowPos3fvMESA_names[] =
     "p\0" /* Parameter signature */
     "glWindowPos3fv\0"
     "glWindowPos3fvARB\0"
@@ -2151,14 +2234,14 @@ static const char WindowPos3fvMESA_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib1svNV_names[] = 
+static const char VertexAttrib1svNV_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib1svNV\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_copy_texture)
-static const char CopyTexSubImage3D_names[] = 
+static const char CopyTexSubImage3D_names[] =
     "iiiiiiiii\0" /* Parameter signature */
     "glCopyTexSubImage3D\0"
     "glCopyTexSubImage3DEXT\0"
@@ -2166,22 +2249,29 @@ static const char CopyTexSubImage3D_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib2dARB_names[] = 
+static const char VertexAttrib2dARB_names[] =
     "idd\0" /* Parameter signature */
     "glVertexAttrib2d\0"
     "glVertexAttrib2dARB\0"
     "";
 #endif
 
+#if defined(need_GL_ARB_sync)
+static const char GetInteger64v_names[] =
+    "ip\0" /* Parameter signature */
+    "glGetInteger64v\0"
+    "";
+#endif
+
 #if defined(need_GL_SGIS_texture_color_mask)
-static const char TextureColorMaskSGIS_names[] = 
+static const char TextureColorMaskSGIS_names[] =
     "iiii\0" /* Parameter signature */
     "glTextureColorMaskSGIS\0"
     "";
 #endif
 
 #if defined(need_GL_SGI_color_table) || defined(need_GL_EXT_paletted_texture)
-static const char GetColorTable_names[] = 
+static const char GetColorTable_names[] =
     "iiip\0" /* Parameter signature */
     "glGetColorTable\0"
     "glGetColorTableSGI\0"
@@ -2190,7 +2280,7 @@ static const char GetColorTable_names[] =
 #endif
 
 #if defined(need_GL_SGI_color_table)
-static const char CopyColorTable_names[] = 
+static const char CopyColorTable_names[] =
     "iiiii\0" /* Parameter signature */
     "glCopyColorTable\0"
     "glCopyColorTableSGI\0"
@@ -2198,7 +2288,7 @@ static const char CopyColorTable_names[] =
 #endif
 
 #if defined(need_GL_EXT_histogram)
-static const char GetHistogramParameterfv_names[] = 
+static const char GetHistogramParameterfv_names[] =
     "iip\0" /* Parameter signature */
     "glGetHistogramParameterfv\0"
     "glGetHistogramParameterfvEXT\0"
@@ -2206,21 +2296,21 @@ static const char GetHistogramParameterfv_names[] =
 #endif
 
 #if defined(need_GL_INTEL_parallel_arrays)
-static const char ColorPointervINTEL_names[] = 
+static const char ColorPointervINTEL_names[] =
     "iip\0" /* Parameter signature */
     "glColorPointervINTEL\0"
     "";
 #endif
 
 #if defined(need_GL_ATI_fragment_shader)
-static const char AlphaFragmentOp1ATI_names[] = 
+static const char AlphaFragmentOp1ATI_names[] =
     "iiiiii\0" /* Parameter signature */
     "glAlphaFragmentOp1ATI\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord3ivARB_names[] = 
+static const char MultiTexCoord3ivARB_names[] =
     "ip\0" /* Parameter signature */
     "glMultiTexCoord3iv\0"
     "glMultiTexCoord3ivARB\0"
@@ -2228,7 +2318,7 @@ static const char MultiTexCoord3ivARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord2sARB_names[] = 
+static const char MultiTexCoord2sARB_names[] =
     "iii\0" /* Parameter signature */
     "glMultiTexCoord2s\0"
     "glMultiTexCoord2sARB\0"
@@ -2236,7 +2326,7 @@ static const char MultiTexCoord2sARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib1dvARB_names[] = 
+static const char VertexAttrib1dvARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib1dv\0"
     "glVertexAttrib1dvARB\0"
@@ -2244,7 +2334,7 @@ static const char VertexAttrib1dvARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_texture_object)
-static const char DeleteTextures_names[] = 
+static const char DeleteTextures_names[] =
     "ip\0" /* Parameter signature */
     "glDeleteTextures\0"
     "glDeleteTexturesEXT\0"
@@ -2252,49 +2342,49 @@ static const char DeleteTextures_names[] =
 #endif
 
 #if defined(need_GL_EXT_vertex_array)
-static const char TexCoordPointerEXT_names[] = 
+static const char TexCoordPointerEXT_names[] =
     "iiiip\0" /* Parameter signature */
     "glTexCoordPointerEXT\0"
     "";
 #endif
 
 #if defined(need_GL_SGIS_texture4D)
-static const char TexSubImage4DSGIS_names[] = 
+static const char TexSubImage4DSGIS_names[] =
     "iiiiiiiiiiiip\0" /* Parameter signature */
     "glTexSubImage4DSGIS\0"
     "";
 #endif
 
 #if defined(need_GL_NV_register_combiners2)
-static const char CombinerStageParameterfvNV_names[] = 
+static const char CombinerStageParameterfvNV_names[] =
     "iip\0" /* Parameter signature */
     "glCombinerStageParameterfvNV\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_instruments)
-static const char StopInstrumentsSGIX_names[] = 
+static const char StopInstrumentsSGIX_names[] =
     "i\0" /* Parameter signature */
     "glStopInstrumentsSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char TexCoord4fColor4fNormal3fVertex4fSUN_names[] = 
+static const char TexCoord4fColor4fNormal3fVertex4fSUN_names[] =
     "fffffffffffffff\0" /* Parameter signature */
     "glTexCoord4fColor4fNormal3fVertex4fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_polynomial_ffd)
-static const char DeformSGIX_names[] = 
+static const char DeformSGIX_names[] =
     "i\0" /* Parameter signature */
     "glDeformSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char GetVertexAttribfvARB_names[] = 
+static const char GetVertexAttribfvARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetVertexAttribfv\0"
     "glGetVertexAttribfvARB\0"
@@ -2302,7 +2392,7 @@ static const char GetVertexAttribfvARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColor3ivEXT_names[] = 
+static const char SecondaryColor3ivEXT_names[] =
     "p\0" /* Parameter signature */
     "glSecondaryColor3iv\0"
     "glSecondaryColor3ivEXT\0"
@@ -2310,49 +2400,49 @@ static const char SecondaryColor3ivEXT_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_1)
-static const char UniformMatrix4x2fv_names[] = 
+static const char UniformMatrix4x2fv_names[] =
     "iiip\0" /* Parameter signature */
     "glUniformMatrix4x2fv\0"
     "";
 #endif
 
 #if defined(need_GL_SGIS_detail_texture)
-static const char GetDetailTexFuncSGIS_names[] = 
+static const char GetDetailTexFuncSGIS_names[] =
     "ip\0" /* Parameter signature */
     "glGetDetailTexFuncSGIS\0"
     "";
 #endif
 
 #if defined(need_GL_NV_register_combiners2)
-static const char GetCombinerStageParameterfvNV_names[] = 
+static const char GetCombinerStageParameterfvNV_names[] =
     "iip\0" /* Parameter signature */
     "glGetCombinerStageParameterfvNV\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_coordinate_frame)
-static const char Binormal3fEXT_names[] = 
-    "fff\0" /* Parameter signature */
-    "glBinormal3fEXT\0"
+#if defined(need_GL_ARB_vertex_array_object)
+static const char BindVertexArray_names[] =
+    "i\0" /* Parameter signature */
+    "glBindVertexArray\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char Color4ubVertex2fvSUN_names[] = 
+static const char Color4ubVertex2fvSUN_names[] =
     "pp\0" /* Parameter signature */
     "glColor4ubVertex2fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_SGIS_texture_filter4)
-static const char TexFilterFuncSGIS_names[] = 
+static const char TexFilterFuncSGIS_names[] =
     "iiip\0" /* Parameter signature */
     "glTexFilterFuncSGIS\0"
     "";
 #endif
 
 #if defined(need_GL_SGIS_multisample) || defined(need_GL_EXT_multisample)
-static const char SampleMaskSGIS_names[] = 
+static const char SampleMaskSGIS_names[] =
     "fi\0" /* Parameter signature */
     "glSampleMaskSGIS\0"
     "glSampleMaskEXT\0"
@@ -2360,7 +2450,7 @@ static const char SampleMaskSGIS_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_shader)
-static const char GetAttribLocationARB_names[] = 
+static const char GetAttribLocationARB_names[] =
     "ip\0" /* Parameter signature */
     "glGetAttribLocation\0"
     "glGetAttribLocationARB\0"
@@ -2368,7 +2458,7 @@ static const char GetAttribLocationARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4ubvARB_names[] = 
+static const char VertexAttrib4ubvARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4ubv\0"
     "glVertexAttrib4ubvARB\0"
@@ -2376,21 +2466,21 @@ static const char VertexAttrib4ubvARB_names[] =
 #endif
 
 #if defined(need_GL_SGIS_detail_texture)
-static const char DetailTexFuncSGIS_names[] = 
+static const char DetailTexFuncSGIS_names[] =
     "iip\0" /* Parameter signature */
     "glDetailTexFuncSGIS\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char Normal3fVertex3fSUN_names[] = 
+static const char Normal3fVertex3fSUN_names[] =
     "ffffff\0" /* Parameter signature */
     "glNormal3fVertex3fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_copy_texture)
-static const char CopyTexImage2D_names[] = 
+static const char CopyTexImage2D_names[] =
     "iiiiiiii\0" /* Parameter signature */
     "glCopyTexImage2D\0"
     "glCopyTexImage2DEXT\0"
@@ -2398,7 +2488,7 @@ static const char CopyTexImage2D_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_vertex_buffer_object)
-static const char GetBufferPointervARB_names[] = 
+static const char GetBufferPointervARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetBufferPointerv\0"
     "glGetBufferPointervARB\0"
@@ -2406,7 +2496,7 @@ static const char GetBufferPointervARB_names[] =
 #endif
 
 #if defined(need_GL_ARB_vertex_program) || defined(need_GL_NV_vertex_program)
-static const char ProgramEnvParameter4fARB_names[] = 
+static const char ProgramEnvParameter4fARB_names[] =
     "iiffff\0" /* Parameter signature */
     "glProgramEnvParameter4fARB\0"
     "glProgramParameter4fNV\0"
@@ -2414,7 +2504,7 @@ static const char ProgramEnvParameter4fARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char Uniform3ivARB_names[] = 
+static const char Uniform3ivARB_names[] =
     "iip\0" /* Parameter signature */
     "glUniform3iv\0"
     "glUniform3ivARB\0"
@@ -2422,21 +2512,21 @@ static const char Uniform3ivARB_names[] =
 #endif
 
 #if defined(need_GL_NV_fence)
-static const char GetFenceivNV_names[] = 
+static const char GetFenceivNV_names[] =
     "iip\0" /* Parameter signature */
     "glGetFenceivNV\0"
     "";
 #endif
 
 #if defined(need_GL_MESA_window_pos)
-static const char WindowPos4dvMESA_names[] = 
+static const char WindowPos4dvMESA_names[] =
     "p\0" /* Parameter signature */
     "glWindowPos4dvMESA\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_color_subtable)
-static const char ColorSubTable_names[] = 
+static const char ColorSubTable_names[] =
     "iiiiip\0" /* Parameter signature */
     "glColorSubTable\0"
     "glColorSubTableEXT\0"
@@ -2444,7 +2534,7 @@ static const char ColorSubTable_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord4ivARB_names[] = 
+static const char MultiTexCoord4ivARB_names[] =
     "ip\0" /* Parameter signature */
     "glMultiTexCoord4iv\0"
     "glMultiTexCoord4ivARB\0"
@@ -2452,21 +2542,21 @@ static const char MultiTexCoord4ivARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_gpu_program_parameters)
-static const char ProgramLocalParameters4fvEXT_names[] = 
+static const char ProgramLocalParameters4fvEXT_names[] =
     "iiip\0" /* Parameter signature */
     "glProgramLocalParameters4fvEXT\0"
     "";
 #endif
 
 #if defined(need_GL_NV_evaluators)
-static const char GetMapAttribParameterfvNV_names[] = 
+static const char GetMapAttribParameterfvNV_names[] =
     "iiip\0" /* Parameter signature */
     "glGetMapAttribParameterfvNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4sARB_names[] = 
+static const char VertexAttrib4sARB_names[] =
     "iiiii\0" /* Parameter signature */
     "glVertexAttrib4s\0"
     "glVertexAttrib4sARB\0"
@@ -2474,7 +2564,7 @@ static const char VertexAttrib4sARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_occlusion_query)
-static const char GetQueryObjectuivARB_names[] = 
+static const char GetQueryObjectuivARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetQueryObjectuiv\0"
     "glGetQueryObjectuivARB\0"
@@ -2482,21 +2572,22 @@ static const char GetQueryObjectuivARB_names[] =
 #endif
 
 #if defined(need_GL_NV_evaluators)
-static const char MapParameterivNV_names[] = 
+static const char MapParameterivNV_names[] =
     "iip\0" /* Parameter signature */
     "glMapParameterivNV\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char GenRenderbuffersEXT_names[] = 
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char GenRenderbuffersEXT_names[] =
     "ip\0" /* Parameter signature */
+    "glGenRenderbuffers\0"
     "glGenRenderbuffersEXT\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib2dvARB_names[] = 
+static const char VertexAttrib2dvARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib2dv\0"
     "glVertexAttrib2dvARB\0"
@@ -2504,28 +2595,28 @@ static const char VertexAttrib2dvARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_vertex_array)
-static const char EdgeFlagPointerEXT_names[] = 
+static const char EdgeFlagPointerEXT_names[] =
     "iip\0" /* Parameter signature */
     "glEdgeFlagPointerEXT\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttribs2svNV_names[] = 
+static const char VertexAttribs2svNV_names[] =
     "iip\0" /* Parameter signature */
     "glVertexAttribs2svNV\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_blend)
-static const char WeightbvARB_names[] = 
+static const char WeightbvARB_names[] =
     "ip\0" /* Parameter signature */
     "glWeightbvARB\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib2fvARB_names[] = 
+static const char VertexAttrib2fvARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib2fv\0"
     "glVertexAttrib2fvARB\0"
@@ -2533,7 +2624,7 @@ static const char VertexAttrib2fvARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_vertex_buffer_object)
-static const char GetBufferParameterivARB_names[] = 
+static const char GetBufferParameterivARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetBufferParameteriv\0"
     "glGetBufferParameterivARB\0"
@@ -2541,28 +2632,28 @@ static const char GetBufferParameterivARB_names[] =
 #endif
 
 #if defined(need_GL_SGIX_list_priority)
-static const char ListParameteriSGIX_names[] = 
+static const char ListParameteriSGIX_names[] =
     "iii\0" /* Parameter signature */
     "glListParameteriSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char ReplacementCodeuiColor4fNormal3fVertex3fSUN_names[] = 
+static const char ReplacementCodeuiColor4fNormal3fVertex3fSUN_names[] =
     "iffffffffff\0" /* Parameter signature */
     "glReplacementCodeuiColor4fNormal3fVertex3fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_instruments)
-static const char InstrumentsBufferSGIX_names[] = 
+static const char InstrumentsBufferSGIX_names[] =
     "ip\0" /* Parameter signature */
     "glInstrumentsBufferSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4NivARB_names[] = 
+static const char VertexAttrib4NivARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4Niv\0"
     "glVertexAttrib4NivARB\0"
@@ -2570,35 +2661,35 @@ static const char VertexAttrib4NivARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0)
-static const char GetAttachedShaders_names[] = 
+static const char GetAttachedShaders_names[] =
     "iipp\0" /* Parameter signature */
     "glGetAttachedShaders\0"
     "";
 #endif
 
 #if defined(need_GL_APPLE_vertex_array_object)
-static const char GenVertexArraysAPPLE_names[] = 
+static const char GenVertexArraysAPPLE_names[] =
     "ip\0" /* Parameter signature */
     "glGenVertexArraysAPPLE\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_gpu_program_parameters)
-static const char ProgramEnvParameters4fvEXT_names[] = 
+static const char ProgramEnvParameters4fvEXT_names[] =
     "iiip\0" /* Parameter signature */
     "glProgramEnvParameters4fvEXT\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char TexCoord2fColor4fNormal3fVertex3fvSUN_names[] = 
+static const char TexCoord2fColor4fNormal3fVertex3fvSUN_names[] =
     "pppp\0" /* Parameter signature */
     "glTexCoord2fColor4fNormal3fVertex3fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_window_pos) || defined(need_GL_MESA_window_pos)
-static const char WindowPos2iMESA_names[] = 
+static const char WindowPos2iMESA_names[] =
     "ii\0" /* Parameter signature */
     "glWindowPos2i\0"
     "glWindowPos2iARB\0"
@@ -2607,7 +2698,7 @@ static const char WindowPos2iMESA_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColor3fvEXT_names[] = 
+static const char SecondaryColor3fvEXT_names[] =
     "p\0" /* Parameter signature */
     "glSecondaryColor3fv\0"
     "glSecondaryColor3fvEXT\0"
@@ -2615,7 +2706,7 @@ static const char SecondaryColor3fvEXT_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3) || defined(need_GL_ARB_texture_compression)
-static const char CompressedTexSubImage1DARB_names[] = 
+static const char CompressedTexSubImage1DARB_names[] =
     "iiiiiip\0" /* Parameter signature */
     "glCompressedTexSubImage1D\0"
     "glCompressedTexSubImage1DARB\0"
@@ -2623,21 +2714,28 @@ static const char CompressedTexSubImage1DARB_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char GetVertexAttribivNV_names[] = 
+static const char GetVertexAttribivNV_names[] =
     "iip\0" /* Parameter signature */
     "glGetVertexAttribivNV\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_program)
-static const char GetProgramStringARB_names[] = 
+static const char GetProgramStringARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetProgramStringARB\0"
     "";
 #endif
 
+#if defined(need_GL_ATI_envmap_bumpmap)
+static const char TexBumpParameterfvATI_names[] =
+    "ip\0" /* Parameter signature */
+    "glTexBumpParameterfvATI\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char CompileShaderARB_names[] = 
+static const char CompileShaderARB_names[] =
     "i\0" /* Parameter signature */
     "glCompileShader\0"
     "glCompileShaderARB\0"
@@ -2645,14 +2743,14 @@ static const char CompileShaderARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0)
-static const char DeleteShader_names[] = 
+static const char DeleteShader_names[] =
     "i\0" /* Parameter signature */
     "glDeleteShader\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char Uniform3fARB_names[] = 
+static const char Uniform3fARB_names[] =
     "ifff\0" /* Parameter signature */
     "glUniform3f\0"
     "glUniform3fARB\0"
@@ -2660,28 +2758,28 @@ static const char Uniform3fARB_names[] =
 #endif
 
 #if defined(need_GL_SGIX_list_priority)
-static const char ListParameterfvSGIX_names[] = 
+static const char ListParameterfvSGIX_names[] =
     "iip\0" /* Parameter signature */
     "glListParameterfvSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Tangent3dvEXT_names[] = 
+static const char Tangent3dvEXT_names[] =
     "p\0" /* Parameter signature */
     "glTangent3dvEXT\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char GetVertexAttribfvNV_names[] = 
+static const char GetVertexAttribfvNV_names[] =
     "iip\0" /* Parameter signature */
     "glGetVertexAttribfvNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_window_pos) || defined(need_GL_MESA_window_pos)
-static const char WindowPos3sMESA_names[] = 
+static const char WindowPos3sMESA_names[] =
     "iii\0" /* Parameter signature */
     "glWindowPos3s\0"
     "glWindowPos3sARB\0"
@@ -2690,35 +2788,35 @@ static const char WindowPos3sMESA_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib2svNV_names[] = 
+static const char VertexAttrib2svNV_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib2svNV\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttribs1fvNV_names[] = 
+static const char VertexAttribs1fvNV_names[] =
     "iip\0" /* Parameter signature */
     "glVertexAttribs1fvNV\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char TexCoord2fVertex3fvSUN_names[] = 
+static const char TexCoord2fVertex3fvSUN_names[] =
     "pp\0" /* Parameter signature */
     "glTexCoord2fVertex3fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_MESA_window_pos)
-static const char WindowPos4sMESA_names[] = 
+static const char WindowPos4sMESA_names[] =
     "iiii\0" /* Parameter signature */
     "glWindowPos4sMESA\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4NuivARB_names[] = 
+static const char VertexAttrib4NuivARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4Nuiv\0"
     "glVertexAttrib4NuivARB\0"
@@ -2726,7 +2824,7 @@ static const char VertexAttrib4NuivARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char ClientActiveTextureARB_names[] = 
+static const char ClientActiveTextureARB_names[] =
     "i\0" /* Parameter signature */
     "glClientActiveTexture\0"
     "glClientActiveTextureARB\0"
@@ -2734,51 +2832,58 @@ static const char ClientActiveTextureARB_names[] =
 #endif
 
 #if defined(need_GL_SGIX_pixel_texture)
-static const char PixelTexGenSGIX_names[] = 
+static const char PixelTexGenSGIX_names[] =
     "i\0" /* Parameter signature */
     "glPixelTexGenSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_triangle_list)
-static const char ReplacementCodeusvSUN_names[] = 
+static const char ReplacementCodeusvSUN_names[] =
     "p\0" /* Parameter signature */
     "glReplacementCodeusvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char Uniform4fARB_names[] = 
+static const char Uniform4fARB_names[] =
     "iffff\0" /* Parameter signature */
     "glUniform4f\0"
     "glUniform4fARB\0"
     "";
 #endif
 
-#if defined(need_GL_IBM_multimode_draw_arrays)
-static const char MultiModeDrawArraysIBM_names[] = 
-    "pppii\0" /* Parameter signature */
-    "glMultiModeDrawArraysIBM\0"
+#if defined(need_GL_ARB_map_buffer_range)
+static const char FlushMappedBufferRange_names[] =
+    "iii\0" /* Parameter signature */
+    "glFlushMappedBufferRange\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_program) || defined(need_GL_NV_vertex_program)
-static const char IsProgramNV_names[] = 
+static const char IsProgramNV_names[] =
     "i\0" /* Parameter signature */
     "glIsProgramARB\0"
     "glIsProgramNV\0"
     "";
 #endif
 
+#if defined(need_GL_APPLE_flush_buffer_range)
+static const char FlushMappedBufferRangeAPPLE_names[] =
+    "iii\0" /* Parameter signature */
+    "glFlushMappedBufferRangeAPPLE\0"
+    "";
+#endif
+
 #if defined(need_GL_SUN_triangle_list)
-static const char ReplacementCodePointerSUN_names[] = 
+static const char ReplacementCodePointerSUN_names[] =
     "iip\0" /* Parameter signature */
     "glReplacementCodePointerSUN\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_program) || defined(need_GL_NV_vertex_program)
-static const char ProgramEnvParameter4dARB_names[] = 
+static const char ProgramEnvParameter4dARB_names[] =
     "iidddd\0" /* Parameter signature */
     "glProgramEnvParameter4dARB\0"
     "glProgramParameter4dNV\0"
@@ -2786,7 +2891,7 @@ static const char ProgramEnvParameter4dARB_names[] =
 #endif
 
 #if defined(need_GL_SGI_color_table)
-static const char ColorTableParameterfv_names[] = 
+static const char ColorTableParameterfv_names[] =
     "iip\0" /* Parameter signature */
     "glColorTableParameterfv\0"
     "glColorTableParameterfvSGI\0"
@@ -2794,21 +2899,21 @@ static const char ColorTableParameterfv_names[] =
 #endif
 
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char FragmentLightModelfSGIX_names[] = 
+static const char FragmentLightModelfSGIX_names[] =
     "if\0" /* Parameter signature */
     "glFragmentLightModelfSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Binormal3bvEXT_names[] = 
+static const char Binormal3bvEXT_names[] =
     "p\0" /* Parameter signature */
     "glBinormal3bvEXT\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_texture_object)
-static const char IsTexture_names[] = 
+static const char IsTexture_names[] =
     "i\0" /* Parameter signature */
     "glIsTexture\0"
     "glIsTextureEXT\0"
@@ -2816,14 +2921,14 @@ static const char IsTexture_names[] =
 #endif
 
 #if defined(need_GL_EXT_vertex_weighting)
-static const char VertexWeightfvEXT_names[] = 
+static const char VertexWeightfvEXT_names[] =
     "p\0" /* Parameter signature */
     "glVertexWeightfvEXT\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib1dARB_names[] = 
+static const char VertexAttrib1dARB_names[] =
     "id\0" /* Parameter signature */
     "glVertexAttrib1d\0"
     "glVertexAttrib1dARB\0"
@@ -2831,14 +2936,14 @@ static const char VertexAttrib1dARB_names[] =
 #endif
 
 #if defined(need_GL_HP_image_transform)
-static const char ImageTransformParameterivHP_names[] = 
+static const char ImageTransformParameterivHP_names[] =
     "iip\0" /* Parameter signature */
     "glImageTransformParameterivHP\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_occlusion_query)
-static const char DeleteQueriesARB_names[] = 
+static const char DeleteQueriesARB_names[] =
     "ip\0" /* Parameter signature */
     "glDeleteQueries\0"
     "glDeleteQueriesARB\0"
@@ -2846,28 +2951,28 @@ static const char DeleteQueriesARB_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char Color4ubVertex2fSUN_names[] = 
+static const char Color4ubVertex2fSUN_names[] =
     "iiiiff\0" /* Parameter signature */
     "glColor4ubVertex2fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char FragmentColorMaterialSGIX_names[] = 
+static const char FragmentColorMaterialSGIX_names[] =
     "ii\0" /* Parameter signature */
     "glFragmentColorMaterialSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_matrix_palette)
-static const char CurrentPaletteMatrixARB_names[] = 
+static const char CurrentPaletteMatrixARB_names[] =
     "i\0" /* Parameter signature */
     "glCurrentPaletteMatrixARB\0"
     "";
 #endif
 
 #if defined(need_GL_SGIS_multisample) || defined(need_GL_EXT_multisample)
-static const char SamplePatternSGIS_names[] = 
+static const char SamplePatternSGIS_names[] =
     "i\0" /* Parameter signature */
     "glSamplePatternSGIS\0"
     "glSamplePatternEXT\0"
@@ -2875,7 +2980,7 @@ static const char SamplePatternSGIS_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_occlusion_query)
-static const char IsQueryARB_names[] = 
+static const char IsQueryARB_names[] =
     "i\0" /* Parameter signature */
     "glIsQuery\0"
     "glIsQueryARB\0"
@@ -2883,14 +2988,14 @@ static const char IsQueryARB_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char ReplacementCodeuiColor4ubVertex3fSUN_names[] = 
+static const char ReplacementCodeuiColor4ubVertex3fSUN_names[] =
     "iiiiifff\0" /* Parameter signature */
     "glReplacementCodeuiColor4ubVertex3fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4usvARB_names[] = 
+static const char VertexAttrib4usvARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4usv\0"
     "glVertexAttrib4usvARB\0"
@@ -2898,7 +3003,7 @@ static const char VertexAttrib4usvARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char LinkProgramARB_names[] = 
+static const char LinkProgramARB_names[] =
     "i\0" /* Parameter signature */
     "glLinkProgram\0"
     "glLinkProgramARB\0"
@@ -2906,14 +3011,14 @@ static const char LinkProgramARB_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib2fNV_names[] = 
+static const char VertexAttrib2fNV_names[] =
     "iff\0" /* Parameter signature */
     "glVertexAttrib2fNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char ShaderSourceARB_names[] = 
+static const char ShaderSourceARB_names[] =
     "iipp\0" /* Parameter signature */
     "glShaderSource\0"
     "glShaderSourceARB\0"
@@ -2921,14 +3026,14 @@ static const char ShaderSourceARB_names[] =
 #endif
 
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char FragmentMaterialiSGIX_names[] = 
+static const char FragmentMaterialiSGIX_names[] =
     "iii\0" /* Parameter signature */
     "glFragmentMaterialiSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib3svARB_names[] = 
+static const char VertexAttrib3svARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib3sv\0"
     "glVertexAttrib3svARB\0"
@@ -2936,7 +3041,7 @@ static const char VertexAttrib3svARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3) || defined(need_GL_ARB_texture_compression)
-static const char CompressedTexSubImage3DARB_names[] = 
+static const char CompressedTexSubImage3DARB_names[] =
     "iiiiiiiiiip\0" /* Parameter signature */
     "glCompressedTexSubImage3D\0"
     "glCompressedTexSubImage3DARB\0"
@@ -2944,7 +3049,7 @@ static const char CompressedTexSubImage3DARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_window_pos) || defined(need_GL_MESA_window_pos)
-static const char WindowPos2ivMESA_names[] = 
+static const char WindowPos2ivMESA_names[] =
     "p\0" /* Parameter signature */
     "glWindowPos2iv\0"
     "glWindowPos2ivARB\0"
@@ -2952,15 +3057,16 @@ static const char WindowPos2ivMESA_names[] =
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char IsFramebufferEXT_names[] = 
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char IsFramebufferEXT_names[] =
     "i\0" /* Parameter signature */
+    "glIsFramebuffer\0"
     "glIsFramebufferEXT\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char Uniform4ivARB_names[] = 
+static const char Uniform4ivARB_names[] =
     "iip\0" /* Parameter signature */
     "glUniform4iv\0"
     "glUniform4ivARB\0"
@@ -2968,15 +3074,22 @@ static const char Uniform4ivARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char GetVertexAttribdvARB_names[] = 
+static const char GetVertexAttribdvARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetVertexAttribdv\0"
     "glGetVertexAttribdvARB\0"
     "";
 #endif
 
+#if defined(need_GL_ATI_envmap_bumpmap)
+static const char TexBumpParameterivATI_names[] =
+    "ip\0" /* Parameter signature */
+    "glTexBumpParameterivATI\0"
+    "";
+#endif
+
 #if defined(need_GL_EXT_convolution)
-static const char GetSeparableFilter_names[] = 
+static const char GetSeparableFilter_names[] =
     "iiippp\0" /* Parameter signature */
     "glGetSeparableFilter\0"
     "glGetSeparableFilterEXT\0"
@@ -2984,49 +3097,49 @@ static const char GetSeparableFilter_names[] =
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Binormal3dEXT_names[] = 
+static const char Binormal3dEXT_names[] =
     "ddd\0" /* Parameter signature */
     "glBinormal3dEXT\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_sprite)
-static const char SpriteParameteriSGIX_names[] = 
+static const char SpriteParameteriSGIX_names[] =
     "ii\0" /* Parameter signature */
     "glSpriteParameteriSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char RequestResidentProgramsNV_names[] = 
+static const char RequestResidentProgramsNV_names[] =
     "ip\0" /* Parameter signature */
     "glRequestResidentProgramsNV\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_tag_sample_buffer)
-static const char TagSampleBufferSGIX_names[] = 
+static const char TagSampleBufferSGIX_names[] =
     "\0" /* Parameter signature */
     "glTagSampleBufferSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_triangle_list)
-static const char ReplacementCodeusSUN_names[] = 
+static const char ReplacementCodeusSUN_names[] =
     "i\0" /* Parameter signature */
     "glReplacementCodeusSUN\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_list_priority)
-static const char ListParameterivSGIX_names[] = 
+static const char ListParameterivSGIX_names[] =
     "iip\0" /* Parameter signature */
     "glListParameterivSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_multi_draw_arrays)
-static const char MultiDrawElementsEXT_names[] = 
+static const char MultiDrawElementsEXT_names[] =
     "ipipi\0" /* Parameter signature */
     "glMultiDrawElements\0"
     "glMultiDrawElementsEXT\0"
@@ -3034,7 +3147,7 @@ static const char MultiDrawElementsEXT_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char Uniform1ivARB_names[] = 
+static const char Uniform1ivARB_names[] =
     "iip\0" /* Parameter signature */
     "glUniform1iv\0"
     "glUniform1ivARB\0"
@@ -3042,7 +3155,7 @@ static const char Uniform1ivARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_window_pos) || defined(need_GL_MESA_window_pos)
-static const char WindowPos2sMESA_names[] = 
+static const char WindowPos2sMESA_names[] =
     "ii\0" /* Parameter signature */
     "glWindowPos2s\0"
     "glWindowPos2sARB\0"
@@ -3051,14 +3164,14 @@ static const char WindowPos2sMESA_names[] =
 #endif
 
 #if defined(need_GL_ARB_vertex_blend)
-static const char WeightusvARB_names[] = 
+static const char WeightusvARB_names[] =
     "ip\0" /* Parameter signature */
     "glWeightusvARB\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_fog_coord)
-static const char FogCoordPointerEXT_names[] = 
+static const char FogCoordPointerEXT_names[] =
     "iip\0" /* Parameter signature */
     "glFogCoordPointer\0"
     "glFogCoordPointerEXT\0"
@@ -3066,14 +3179,14 @@ static const char FogCoordPointerEXT_names[] =
 #endif
 
 #if defined(need_GL_EXT_index_material)
-static const char IndexMaterialEXT_names[] = 
+static const char IndexMaterialEXT_names[] =
     "ii\0" /* Parameter signature */
     "glIndexMaterialEXT\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColor3ubvEXT_names[] = 
+static const char SecondaryColor3ubvEXT_names[] =
     "p\0" /* Parameter signature */
     "glSecondaryColor3ubv\0"
     "glSecondaryColor3ubvEXT\0"
@@ -3081,7 +3194,7 @@ static const char SecondaryColor3ubvEXT_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4dvARB_names[] = 
+static const char VertexAttrib4dvARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4dv\0"
     "glVertexAttrib4dvARB\0"
@@ -3089,7 +3202,7 @@ static const char VertexAttrib4dvARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_shader)
-static const char BindAttribLocationARB_names[] = 
+static const char BindAttribLocationARB_names[] =
     "iip\0" /* Parameter signature */
     "glBindAttribLocation\0"
     "glBindAttribLocationARB\0"
@@ -3097,7 +3210,7 @@ static const char BindAttribLocationARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord2dARB_names[] = 
+static const char MultiTexCoord2dARB_names[] =
     "idd\0" /* Parameter signature */
     "glMultiTexCoord2d\0"
     "glMultiTexCoord2dARB\0"
@@ -3105,49 +3218,50 @@ static const char MultiTexCoord2dARB_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char ExecuteProgramNV_names[] = 
+static const char ExecuteProgramNV_names[] =
     "iip\0" /* Parameter signature */
     "glExecuteProgramNV\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char LightEnviSGIX_names[] = 
+static const char LightEnviSGIX_names[] =
     "ii\0" /* Parameter signature */
     "glLightEnviSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_triangle_list)
-static const char ReplacementCodeuiSUN_names[] = 
+static const char ReplacementCodeuiSUN_names[] =
     "i\0" /* Parameter signature */
     "glReplacementCodeuiSUN\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char FramebufferTexture2DEXT_names[] = 
-    "iiiii\0" /* Parameter signature */
-    "glFramebufferTexture2DEXT\0"
-    "";
-#endif
-
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttribPointerNV_names[] = 
+static const char VertexAttribPointerNV_names[] =
     "iiiip\0" /* Parameter signature */
     "glVertexAttribPointerNV\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char GetFramebufferAttachmentParameterivEXT_names[] = 
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char GetFramebufferAttachmentParameterivEXT_names[] =
     "iiip\0" /* Parameter signature */
+    "glGetFramebufferAttachmentParameteriv\0"
     "glGetFramebufferAttachmentParameterivEXT\0"
     "";
 #endif
 
+#if defined(need_GL_EXT_pixel_transform)
+static const char PixelTransformParameterfEXT_names[] =
+    "iif\0" /* Parameter signature */
+    "glPixelTransformParameterfEXT\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord4dvARB_names[] = 
+static const char MultiTexCoord4dvARB_names[] =
     "ip\0" /* Parameter signature */
     "glMultiTexCoord4dv\0"
     "glMultiTexCoord4dvARB\0"
@@ -3155,21 +3269,21 @@ static const char MultiTexCoord4dvARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_pixel_transform)
-static const char PixelTransformParameteriEXT_names[] = 
+static const char PixelTransformParameteriEXT_names[] =
     "iii\0" /* Parameter signature */
     "glPixelTransformParameteriEXT\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char TexCoord2fColor4ubVertex3fSUN_names[] = 
+static const char TexCoord2fColor4ubVertex3fSUN_names[] =
     "ffiiiifff\0" /* Parameter signature */
     "glTexCoord2fColor4ubVertex3fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char Uniform1iARB_names[] = 
+static const char Uniform1iARB_names[] =
     "ii\0" /* Parameter signature */
     "glUniform1i\0"
     "glUniform1iARB\0"
@@ -3177,7 +3291,7 @@ static const char Uniform1iARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttribPointerARB_names[] = 
+static const char VertexAttribPointerARB_names[] =
     "iiiiip\0" /* Parameter signature */
     "glVertexAttribPointer\0"
     "glVertexAttribPointerARB\0"
@@ -3185,14 +3299,14 @@ static const char VertexAttribPointerARB_names[] =
 #endif
 
 #if defined(need_GL_SGIS_sharpen_texture)
-static const char SharpenTexFuncSGIS_names[] = 
+static const char SharpenTexFuncSGIS_names[] =
     "iip\0" /* Parameter signature */
     "glSharpenTexFuncSGIS\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord4fvARB_names[] = 
+static const char MultiTexCoord4fvARB_names[] =
     "ip\0" /* Parameter signature */
     "glMultiTexCoord4fv\0"
     "glMultiTexCoord4fvARB\0"
@@ -3200,56 +3314,56 @@ static const char MultiTexCoord4fvARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_1)
-static const char UniformMatrix2x3fv_names[] = 
+static const char UniformMatrix2x3fv_names[] =
     "iiip\0" /* Parameter signature */
     "glUniformMatrix2x3fv\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char TrackMatrixNV_names[] = 
+static const char TrackMatrixNV_names[] =
     "iiii\0" /* Parameter signature */
     "glTrackMatrixNV\0"
     "";
 #endif
 
 #if defined(need_GL_NV_register_combiners)
-static const char CombinerParameteriNV_names[] = 
+static const char CombinerParameteriNV_names[] =
     "ii\0" /* Parameter signature */
     "glCombinerParameteriNV\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_async)
-static const char DeleteAsyncMarkersSGIX_names[] = 
+static const char DeleteAsyncMarkersSGIX_names[] =
     "ii\0" /* Parameter signature */
     "glDeleteAsyncMarkersSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_async)
-static const char IsAsyncMarkerSGIX_names[] = 
+static const char IsAsyncMarkerSGIX_names[] =
     "i\0" /* Parameter signature */
     "glIsAsyncMarkerSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_framezoom)
-static const char FrameZoomSGIX_names[] = 
+static const char FrameZoomSGIX_names[] =
     "i\0" /* Parameter signature */
     "glFrameZoomSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char Normal3fVertex3fvSUN_names[] = 
+static const char Normal3fVertex3fvSUN_names[] =
     "pp\0" /* Parameter signature */
     "glNormal3fVertex3fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4NsvARB_names[] = 
+static const char VertexAttrib4NsvARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4Nsv\0"
     "glVertexAttrib4NsvARB\0"
@@ -3257,29 +3371,37 @@ static const char VertexAttrib4NsvARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib3fvARB_names[] = 
+static const char VertexAttrib3fvARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib3fv\0"
     "glVertexAttrib3fvARB\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char DeleteFramebuffersEXT_names[] = 
+#if defined(need_GL_ARB_sync)
+static const char GetSynciv_names[] =
+    "iiipp\0" /* Parameter signature */
+    "glGetSynciv\0"
+    "";
+#endif
+
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char DeleteFramebuffersEXT_names[] =
     "ip\0" /* Parameter signature */
+    "glDeleteFramebuffers\0"
     "glDeleteFramebuffersEXT\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_global_alpha)
-static const char GlobalAlphaFactorsSUN_names[] = 
+static const char GlobalAlphaFactorsSUN_names[] =
     "i\0" /* Parameter signature */
     "glGlobalAlphaFactorsSUN\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_texture3D)
-static const char TexSubImage3D_names[] = 
+static const char TexSubImage3D_names[] =
     "iiiiiiiiiip\0" /* Parameter signature */
     "glTexSubImage3D\0"
     "glTexSubImage3DEXT\0"
@@ -3287,14 +3409,14 @@ static const char TexSubImage3D_names[] =
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Tangent3fEXT_names[] = 
+static const char Tangent3fEXT_names[] =
     "fff\0" /* Parameter signature */
     "glTangent3fEXT\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColor3uivEXT_names[] = 
+static const char SecondaryColor3uivEXT_names[] =
     "p\0" /* Parameter signature */
     "glSecondaryColor3uiv\0"
     "glSecondaryColor3uivEXT\0"
@@ -3302,35 +3424,35 @@ static const char SecondaryColor3uivEXT_names[] =
 #endif
 
 #if defined(need_GL_ARB_matrix_palette)
-static const char MatrixIndexubvARB_names[] = 
+static const char MatrixIndexubvARB_names[] =
     "ip\0" /* Parameter signature */
     "glMatrixIndexubvARB\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char Color4fNormal3fVertex3fSUN_names[] = 
+static const char Color4fNormal3fVertex3fSUN_names[] =
     "ffffffffff\0" /* Parameter signature */
     "glColor4fNormal3fVertex3fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_SGIS_pixel_texture)
-static const char PixelTexGenParameterfSGIS_names[] = 
+static const char PixelTexGenParameterfSGIS_names[] =
     "if\0" /* Parameter signature */
     "glPixelTexGenParameterfSGIS\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0)
-static const char CreateShader_names[] = 
+static const char CreateShader_names[] =
     "i\0" /* Parameter signature */
     "glCreateShader\0"
     "";
 #endif
 
 #if defined(need_GL_SGI_color_table) || defined(need_GL_EXT_paletted_texture)
-static const char GetColorTableParameterfv_names[] = 
+static const char GetColorTableParameterfv_names[] =
     "iip\0" /* Parameter signature */
     "glGetColorTableParameterfv\0"
     "glGetColorTableParameterfvSGI\0"
@@ -3339,14 +3461,14 @@ static const char GetColorTableParameterfv_names[] =
 #endif
 
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char FragmentLightModelfvSGIX_names[] = 
+static const char FragmentLightModelfvSGIX_names[] =
     "ip\0" /* Parameter signature */
     "glFragmentLightModelfvSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord3fARB_names[] = 
+static const char MultiTexCoord3fARB_names[] =
     "ifff\0" /* Parameter signature */
     "glMultiTexCoord3f\0"
     "glMultiTexCoord3fARB\0"
@@ -3354,49 +3476,51 @@ static const char MultiTexCoord3fARB_names[] =
 #endif
 
 #if defined(need_GL_SGIS_pixel_texture)
-static const char GetPixelTexGenParameterfvSGIS_names[] = 
+static const char GetPixelTexGenParameterfvSGIS_names[] =
     "ip\0" /* Parameter signature */
     "glGetPixelTexGenParameterfvSGIS\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char GenFramebuffersEXT_names[] = 
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char GenFramebuffersEXT_names[] =
     "ip\0" /* Parameter signature */
+    "glGenFramebuffers\0"
     "glGenFramebuffersEXT\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char GetProgramParameterdvNV_names[] = 
+static const char GetProgramParameterdvNV_names[] =
     "iiip\0" /* Parameter signature */
     "glGetProgramParameterdvNV\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_pixel_transform)
-static const char PixelTransformParameterfEXT_names[] = 
-    "iif\0" /* Parameter signature */
-    "glPixelTransformParameterfEXT\0"
+#if defined(need_GL_ARB_vertex_array_object) || defined(need_GL_APPLE_vertex_array_object)
+static const char IsVertexArrayAPPLE_names[] =
+    "i\0" /* Parameter signature */
+    "glIsVertexArray\0"
+    "glIsVertexArrayAPPLE\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char FragmentLightfvSGIX_names[] = 
+static const char FragmentLightfvSGIX_names[] =
     "iip\0" /* Parameter signature */
     "glFragmentLightfvSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0)
-static const char DetachShader_names[] = 
+static const char DetachShader_names[] =
     "ii\0" /* Parameter signature */
     "glDetachShader\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4NubARB_names[] = 
+static const char VertexAttrib4NubARB_names[] =
     "iiiii\0" /* Parameter signature */
     "glVertexAttrib4Nub\0"
     "glVertexAttrib4NubARB\0"
@@ -3404,28 +3528,28 @@ static const char VertexAttrib4NubARB_names[] =
 #endif
 
 #if defined(need_GL_ARB_vertex_program)
-static const char GetProgramEnvParameterfvARB_names[] = 
+static const char GetProgramEnvParameterfvARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetProgramEnvParameterfvARB\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char GetTrackMatrixivNV_names[] = 
+static const char GetTrackMatrixivNV_names[] =
     "iiip\0" /* Parameter signature */
     "glGetTrackMatrixivNV\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib3svNV_names[] = 
+static const char VertexAttrib3svNV_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib3svNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char Uniform4fvARB_names[] = 
+static const char Uniform4fvARB_names[] =
     "iip\0" /* Parameter signature */
     "glUniform4fv\0"
     "glUniform4fvARB\0"
@@ -3433,7 +3557,7 @@ static const char Uniform4fvARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3) || defined(need_GL_ARB_transpose_matrix)
-static const char MultTransposeMatrixfARB_names[] = 
+static const char MultTransposeMatrixfARB_names[] =
     "p\0" /* Parameter signature */
     "glMultTransposeMatrixf\0"
     "glMultTransposeMatrixfARB\0"
@@ -3441,14 +3565,14 @@ static const char MultTransposeMatrixfARB_names[] =
 #endif
 
 #if defined(need_GL_ATI_fragment_shader)
-static const char ColorFragmentOp1ATI_names[] = 
+static const char ColorFragmentOp1ATI_names[] =
     "iiiiiii\0" /* Parameter signature */
     "glColorFragmentOp1ATI\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char GetUniformfvARB_names[] = 
+static const char GetUniformfvARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetUniformfv\0"
     "glGetUniformfvARB\0"
@@ -3456,28 +3580,28 @@ static const char GetUniformfvARB_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char ReplacementCodeuiTexCoord2fColor4fNormal3fVertex3fSUN_names[] = 
+static const char ReplacementCodeuiTexCoord2fColor4fNormal3fVertex3fSUN_names[] =
     "iffffffffffff\0" /* Parameter signature */
     "glReplacementCodeuiTexCoord2fColor4fNormal3fVertex3fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_shader_objects)
-static const char DetachObjectARB_names[] = 
+static const char DetachObjectARB_names[] =
     "ii\0" /* Parameter signature */
     "glDetachObjectARB\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_blend)
-static const char VertexBlendARB_names[] = 
+static const char VertexBlendARB_names[] =
     "i\0" /* Parameter signature */
     "glVertexBlendARB\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_window_pos) || defined(need_GL_MESA_window_pos)
-static const char WindowPos3iMESA_names[] = 
+static const char WindowPos3iMESA_names[] =
     "iii\0" /* Parameter signature */
     "glWindowPos3i\0"
     "glWindowPos3iARB\0"
@@ -3486,7 +3610,7 @@ static const char WindowPos3iMESA_names[] =
 #endif
 
 #if defined(need_GL_EXT_convolution)
-static const char SeparableFilter2D_names[] = 
+static const char SeparableFilter2D_names[] =
     "iiiiiipp\0" /* Parameter signature */
     "glSeparableFilter2D\0"
     "glSeparableFilter2DEXT\0"
@@ -3494,14 +3618,14 @@ static const char SeparableFilter2D_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char ReplacementCodeuiColor4ubVertex3fvSUN_names[] = 
+static const char ReplacementCodeuiColor4ubVertex3fvSUN_names[] =
     "ppp\0" /* Parameter signature */
     "glReplacementCodeuiColor4ubVertex3fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3) || defined(need_GL_ARB_texture_compression)
-static const char CompressedTexImage2DARB_names[] = 
+static const char CompressedTexImage2DARB_names[] =
     "iiiiiiip\0" /* Parameter signature */
     "glCompressedTexImage2D\0"
     "glCompressedTexImage2DARB\0"
@@ -3509,7 +3633,7 @@ static const char CompressedTexImage2DARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_vertex_array)
-static const char ArrayElement_names[] = 
+static const char ArrayElement_names[] =
     "i\0" /* Parameter signature */
     "glArrayElement\0"
     "glArrayElementEXT\0"
@@ -3517,35 +3641,35 @@ static const char ArrayElement_names[] =
 #endif
 
 #if defined(need_GL_EXT_depth_bounds_test)
-static const char DepthBoundsEXT_names[] = 
+static const char DepthBoundsEXT_names[] =
     "dd\0" /* Parameter signature */
     "glDepthBoundsEXT\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char ProgramParameters4fvNV_names[] = 
+static const char ProgramParameters4fvNV_names[] =
     "iiip\0" /* Parameter signature */
     "glProgramParameters4fvNV\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_polynomial_ffd)
-static const char DeformationMap3fSGIX_names[] = 
+static const char DeformationMap3fSGIX_names[] =
     "iffiiffiiffiip\0" /* Parameter signature */
     "glDeformationMap3fSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char GetProgramivNV_names[] = 
+static const char GetProgramivNV_names[] =
     "iip\0" /* Parameter signature */
     "glGetProgramivNV\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_histogram)
-static const char GetMinmaxParameteriv_names[] = 
+static const char GetMinmaxParameteriv_names[] =
     "iip\0" /* Parameter signature */
     "glGetMinmaxParameteriv\0"
     "glGetMinmaxParameterivEXT\0"
@@ -3553,7 +3677,7 @@ static const char GetMinmaxParameteriv_names[] =
 #endif
 
 #if defined(need_GL_EXT_copy_texture)
-static const char CopyTexImage1D_names[] = 
+static const char CopyTexImage1D_names[] =
     "iiiiiii\0" /* Parameter signature */
     "glCopyTexImage1D\0"
     "glCopyTexImage1DEXT\0"
@@ -3561,42 +3685,42 @@ static const char CopyTexImage1D_names[] =
 #endif
 
 #if defined(need_GL_ATI_fragment_shader)
-static const char AlphaFragmentOp3ATI_names[] = 
+static const char AlphaFragmentOp3ATI_names[] =
     "iiiiiiiiiiii\0" /* Parameter signature */
     "glAlphaFragmentOp3ATI\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char GetVertexAttribdvNV_names[] = 
+static const char GetVertexAttribdvNV_names[] =
     "iip\0" /* Parameter signature */
     "glGetVertexAttribdvNV\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib3fvNV_names[] = 
+static const char VertexAttrib3fvNV_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib3fvNV\0"
     "";
 #endif
 
 #if defined(need_GL_NV_register_combiners)
-static const char GetFinalCombinerInputParameterivNV_names[] = 
+static const char GetFinalCombinerInputParameterivNV_names[] =
     "iip\0" /* Parameter signature */
     "glGetFinalCombinerInputParameterivNV\0"
     "";
 #endif
 
 #if defined(need_GL_NV_evaluators)
-static const char GetMapParameterivNV_names[] = 
+static const char GetMapParameterivNV_names[] =
     "iip\0" /* Parameter signature */
     "glGetMapParameterivNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char Uniform4iARB_names[] = 
+static const char Uniform4iARB_names[] =
     "iiiii\0" /* Parameter signature */
     "glUniform4i\0"
     "glUniform4iARB\0"
@@ -3604,7 +3728,7 @@ static const char Uniform4iARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_convolution)
-static const char ConvolutionParameteri_names[] = 
+static const char ConvolutionParameteri_names[] =
     "iii\0" /* Parameter signature */
     "glConvolutionParameteri\0"
     "glConvolutionParameteriEXT\0"
@@ -3612,14 +3736,14 @@ static const char ConvolutionParameteri_names[] =
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Binormal3sEXT_names[] = 
+static const char Binormal3sEXT_names[] =
     "iii\0" /* Parameter signature */
     "glBinormal3sEXT\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_convolution)
-static const char ConvolutionParameterf_names[] = 
+static const char ConvolutionParameterf_names[] =
     "iif\0" /* Parameter signature */
     "glConvolutionParameterf\0"
     "glConvolutionParameterfEXT\0"
@@ -3627,7 +3751,7 @@ static const char ConvolutionParameterf_names[] =
 #endif
 
 #if defined(need_GL_SGI_color_table) || defined(need_GL_EXT_paletted_texture)
-static const char GetColorTableParameteriv_names[] = 
+static const char GetColorTableParameteriv_names[] =
     "iip\0" /* Parameter signature */
     "glGetColorTableParameteriv\0"
     "glGetColorTableParameterivSGI\0"
@@ -3636,7 +3760,7 @@ static const char GetColorTableParameteriv_names[] =
 #endif
 
 #if defined(need_GL_ARB_vertex_program) || defined(need_GL_NV_vertex_program)
-static const char ProgramEnvParameter4dvARB_names[] = 
+static const char ProgramEnvParameter4dvARB_names[] =
     "iip\0" /* Parameter signature */
     "glProgramEnvParameter4dvARB\0"
     "glProgramParameter4dvNV\0"
@@ -3644,14 +3768,14 @@ static const char ProgramEnvParameter4dvARB_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttribs2fvNV_names[] = 
+static const char VertexAttribs2fvNV_names[] =
     "iip\0" /* Parameter signature */
     "glVertexAttribs2fvNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char UseProgramObjectARB_names[] = 
+static const char UseProgramObjectARB_names[] =
     "i\0" /* Parameter signature */
     "glUseProgram\0"
     "glUseProgramObjectARB\0"
@@ -3659,42 +3783,42 @@ static const char UseProgramObjectARB_names[] =
 #endif
 
 #if defined(need_GL_NV_evaluators)
-static const char GetMapParameterfvNV_names[] = 
+static const char GetMapParameterfvNV_names[] =
     "iip\0" /* Parameter signature */
     "glGetMapParameterfvNV\0"
     "";
 #endif
 
 #if defined(need_GL_ATI_fragment_shader)
-static const char PassTexCoordATI_names[] = 
+static const char PassTexCoordATI_names[] =
     "iii\0" /* Parameter signature */
     "glPassTexCoordATI\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0)
-static const char DeleteProgram_names[] = 
+static const char DeleteProgram_names[] =
     "i\0" /* Parameter signature */
     "glDeleteProgram\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Tangent3ivEXT_names[] = 
+static const char Tangent3ivEXT_names[] =
     "p\0" /* Parameter signature */
     "glTangent3ivEXT\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Tangent3dEXT_names[] = 
+static const char Tangent3dEXT_names[] =
     "ddd\0" /* Parameter signature */
     "glTangent3dEXT\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColor3dvEXT_names[] = 
+static const char SecondaryColor3dvEXT_names[] =
     "p\0" /* Parameter signature */
     "glSecondaryColor3dv\0"
     "glSecondaryColor3dvEXT\0"
@@ -3702,22 +3826,23 @@ static const char SecondaryColor3dvEXT_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_multi_draw_arrays)
-static const char MultiDrawArraysEXT_names[] = 
+static const char MultiDrawArraysEXT_names[] =
     "ippi\0" /* Parameter signature */
     "glMultiDrawArrays\0"
     "glMultiDrawArraysEXT\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char BindRenderbufferEXT_names[] = 
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char BindRenderbufferEXT_names[] =
     "ii\0" /* Parameter signature */
+    "glBindRenderbuffer\0"
     "glBindRenderbufferEXT\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord4dARB_names[] = 
+static const char MultiTexCoord4dARB_names[] =
     "idddd\0" /* Parameter signature */
     "glMultiTexCoord4d\0"
     "glMultiTexCoord4dARB\0"
@@ -3725,7 +3850,7 @@ static const char MultiTexCoord4dARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColor3usEXT_names[] = 
+static const char SecondaryColor3usEXT_names[] =
     "iii\0" /* Parameter signature */
     "glSecondaryColor3us\0"
     "glSecondaryColor3usEXT\0"
@@ -3733,14 +3858,14 @@ static const char SecondaryColor3usEXT_names[] =
 #endif
 
 #if defined(need_GL_ARB_vertex_program)
-static const char ProgramLocalParameter4fvARB_names[] = 
+static const char ProgramLocalParameter4fvARB_names[] =
     "iip\0" /* Parameter signature */
     "glProgramLocalParameter4fvARB\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_program) || defined(need_GL_NV_vertex_program)
-static const char DeleteProgramsNV_names[] = 
+static const char DeleteProgramsNV_names[] =
     "ip\0" /* Parameter signature */
     "glDeleteProgramsARB\0"
     "glDeleteProgramsNV\0"
@@ -3748,7 +3873,7 @@ static const char DeleteProgramsNV_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord1sARB_names[] = 
+static const char MultiTexCoord1sARB_names[] =
     "ii\0" /* Parameter signature */
     "glMultiTexCoord1s\0"
     "glMultiTexCoord1sARB\0"
@@ -3756,14 +3881,14 @@ static const char MultiTexCoord1sARB_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char ReplacementCodeuiColor3fVertex3fSUN_names[] = 
+static const char ReplacementCodeuiColor3fVertex3fSUN_names[] =
     "iffffff\0" /* Parameter signature */
     "glReplacementCodeuiColor3fVertex3fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program) || defined(need_GL_NV_vertex_program)
-static const char GetVertexAttribPointervNV_names[] = 
+static const char GetVertexAttribPointervNV_names[] =
     "iip\0" /* Parameter signature */
     "glGetVertexAttribPointerv\0"
     "glGetVertexAttribPointervARB\0"
@@ -3772,7 +3897,7 @@ static const char GetVertexAttribPointervNV_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord1dvARB_names[] = 
+static const char MultiTexCoord1dvARB_names[] =
     "ip\0" /* Parameter signature */
     "glMultiTexCoord1dv\0"
     "glMultiTexCoord1dvARB\0"
@@ -3780,7 +3905,7 @@ static const char MultiTexCoord1dvARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char Uniform2iARB_names[] = 
+static const char Uniform2iARB_names[] =
     "iii\0" /* Parameter signature */
     "glUniform2i\0"
     "glUniform2iARB\0"
@@ -3788,57 +3913,64 @@ static const char Uniform2iARB_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char GetProgramStringNV_names[] = 
+static const char GetProgramStringNV_names[] =
     "iip\0" /* Parameter signature */
     "glGetProgramStringNV\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_vertex_array)
-static const char ColorPointerEXT_names[] = 
+static const char ColorPointerEXT_names[] =
     "iiiip\0" /* Parameter signature */
     "glColorPointerEXT\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_vertex_buffer_object)
-static const char MapBufferARB_names[] = 
+static const char MapBufferARB_names[] =
     "ii\0" /* Parameter signature */
     "glMapBuffer\0"
     "glMapBufferARB\0"
     "";
 #endif
 
+#if defined(need_GL_ARB_draw_elements_base_vertex)
+static const char MultiDrawElementsBaseVertex_names[] =
+    "ipipip\0" /* Parameter signature */
+    "glMultiDrawElementsBaseVertex\0"
+    "";
+#endif
+
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Binormal3svEXT_names[] = 
+static const char Binormal3svEXT_names[] =
     "p\0" /* Parameter signature */
     "glBinormal3svEXT\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_light_texture)
-static const char ApplyTextureEXT_names[] = 
+static const char ApplyTextureEXT_names[] =
     "i\0" /* Parameter signature */
     "glApplyTextureEXT\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_light_texture)
-static const char TextureMaterialEXT_names[] = 
+static const char TextureMaterialEXT_names[] =
     "ii\0" /* Parameter signature */
     "glTextureMaterialEXT\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_light_texture)
-static const char TextureLightEXT_names[] = 
+static const char TextureLightEXT_names[] =
     "i\0" /* Parameter signature */
     "glTextureLightEXT\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_histogram)
-static const char ResetMinmax_names[] = 
+static const char ResetMinmax_names[] =
     "i\0" /* Parameter signature */
     "glResetMinmax\0"
     "glResetMinmaxEXT\0"
@@ -3846,21 +3978,21 @@ static const char ResetMinmax_names[] =
 #endif
 
 #if defined(need_GL_SGIX_sprite)
-static const char SpriteParameterfSGIX_names[] = 
+static const char SpriteParameterfSGIX_names[] =
     "if\0" /* Parameter signature */
     "glSpriteParameterfSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib4sNV_names[] = 
+static const char VertexAttrib4sNV_names[] =
     "iiiii\0" /* Parameter signature */
     "glVertexAttrib4sNV\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_convolution)
-static const char GetConvolutionParameterfv_names[] = 
+static const char GetConvolutionParameterfv_names[] =
     "iip\0" /* Parameter signature */
     "glGetConvolutionParameterfv\0"
     "glGetConvolutionParameterfvEXT\0"
@@ -3868,36 +4000,50 @@ static const char GetConvolutionParameterfv_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttribs4dvNV_names[] = 
+static const char VertexAttribs4dvNV_names[] =
     "iip\0" /* Parameter signature */
     "glVertexAttribs4dvNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4dARB_names[] = 
+static const char VertexAttrib4dARB_names[] =
     "idddd\0" /* Parameter signature */
     "glVertexAttrib4d\0"
     "glVertexAttrib4dARB\0"
     "";
 #endif
 
+#if defined(need_GL_ATI_envmap_bumpmap)
+static const char GetTexBumpParameterfvATI_names[] =
+    "ip\0" /* Parameter signature */
+    "glGetTexBumpParameterfvATI\0"
+    "";
+#endif
+
 #if defined(need_GL_NV_fragment_program)
-static const char ProgramNamedParameter4dNV_names[] = 
+static const char ProgramNamedParameter4dNV_names[] =
     "iipdddd\0" /* Parameter signature */
     "glProgramNamedParameter4dNV\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_vertex_weighting)
-static const char VertexWeightfEXT_names[] = 
+static const char VertexWeightfEXT_names[] =
     "f\0" /* Parameter signature */
     "glVertexWeightfEXT\0"
     "";
 #endif
 
+#if defined(need_GL_EXT_coordinate_frame)
+static const char Binormal3fEXT_names[] =
+    "fff\0" /* Parameter signature */
+    "glBinormal3fEXT\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_fog_coord)
-static const char FogCoordfvEXT_names[] = 
+static const char FogCoordfvEXT_names[] =
     "p\0" /* Parameter signature */
     "glFogCoordfv\0"
     "glFogCoordfvEXT\0"
@@ -3905,7 +4051,7 @@ static const char FogCoordfvEXT_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord1ivARB_names[] = 
+static const char MultiTexCoord1ivARB_names[] =
     "ip\0" /* Parameter signature */
     "glMultiTexCoord1iv\0"
     "glMultiTexCoord1ivARB\0"
@@ -3913,7 +4059,7 @@ static const char MultiTexCoord1ivARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColor3ubEXT_names[] = 
+static const char SecondaryColor3ubEXT_names[] =
     "iii\0" /* Parameter signature */
     "glSecondaryColor3ub\0"
     "glSecondaryColor3ubEXT\0"
@@ -3921,7 +4067,7 @@ static const char SecondaryColor3ubEXT_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord2ivARB_names[] = 
+static const char MultiTexCoord2ivARB_names[] =
     "ip\0" /* Parameter signature */
     "glMultiTexCoord2iv\0"
     "glMultiTexCoord2ivARB\0"
@@ -3929,14 +4075,14 @@ static const char MultiTexCoord2ivARB_names[] =
 #endif
 
 #if defined(need_GL_SGIS_fog_function)
-static const char FogFuncSGIS_names[] = 
+static const char FogFuncSGIS_names[] =
     "ip\0" /* Parameter signature */
     "glFogFuncSGIS\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_copy_texture)
-static const char CopyTexSubImage2D_names[] = 
+static const char CopyTexSubImage2D_names[] =
     "iiiiiiii\0" /* Parameter signature */
     "glCopyTexSubImage2D\0"
     "glCopyTexSubImage2DEXT\0"
@@ -3944,35 +4090,35 @@ static const char CopyTexSubImage2D_names[] =
 #endif
 
 #if defined(need_GL_ARB_shader_objects)
-static const char GetObjectParameterivARB_names[] = 
+static const char GetObjectParameterivARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetObjectParameterivARB\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char TexCoord4fVertex4fSUN_names[] = 
+static const char TexCoord4fVertex4fSUN_names[] =
     "ffffffff\0" /* Parameter signature */
     "glTexCoord4fVertex4fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_APPLE_vertex_array_object)
-static const char BindVertexArrayAPPLE_names[] = 
+static const char BindVertexArrayAPPLE_names[] =
     "i\0" /* Parameter signature */
     "glBindVertexArrayAPPLE\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_program)
-static const char GetProgramLocalParameterdvARB_names[] = 
+static const char GetProgramLocalParameterdvARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetProgramLocalParameterdvARB\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_histogram)
-static const char GetHistogramParameteriv_names[] = 
+static const char GetHistogramParameteriv_names[] =
     "iip\0" /* Parameter signature */
     "glGetHistogramParameteriv\0"
     "glGetHistogramParameterivEXT\0"
@@ -3980,7 +4126,7 @@ static const char GetHistogramParameteriv_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord1iARB_names[] = 
+static const char MultiTexCoord1iARB_names[] =
     "ii\0" /* Parameter signature */
     "glMultiTexCoord1i\0"
     "glMultiTexCoord1iARB\0"
@@ -3988,7 +4134,7 @@ static const char MultiTexCoord1iARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_convolution)
-static const char GetConvolutionFilter_names[] = 
+static const char GetConvolutionFilter_names[] =
     "iiip\0" /* Parameter signature */
     "glGetConvolutionFilter\0"
     "glGetConvolutionFilterEXT\0"
@@ -3996,14 +4142,14 @@ static const char GetConvolutionFilter_names[] =
 #endif
 
 #if defined(need_GL_ARB_vertex_program)
-static const char GetProgramivARB_names[] = 
+static const char GetProgramivARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetProgramivARB\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_blend_func_separate) || defined(need_GL_INGR_blend_func_separate)
-static const char BlendFuncSeparateEXT_names[] = 
+static const char BlendFuncSeparateEXT_names[] =
     "iiii\0" /* Parameter signature */
     "glBlendFuncSeparate\0"
     "glBlendFuncSeparateEXT\0"
@@ -4011,50 +4157,50 @@ static const char BlendFuncSeparateEXT_names[] =
     "";
 #endif
 
-#if defined(need_GL_APPLE_vertex_array_object)
-static const char IsVertexArrayAPPLE_names[] = 
-    "i\0" /* Parameter signature */
-    "glIsVertexArrayAPPLE\0"
+#if defined(need_GL_ARB_map_buffer_range)
+static const char MapBufferRange_names[] =
+    "iiii\0" /* Parameter signature */
+    "glMapBufferRange\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char ProgramParameters4dvNV_names[] = 
+static const char ProgramParameters4dvNV_names[] =
     "iiip\0" /* Parameter signature */
     "glProgramParameters4dvNV\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char TexCoord2fColor3fVertex3fvSUN_names[] = 
+static const char TexCoord2fColor3fVertex3fvSUN_names[] =
     "ppp\0" /* Parameter signature */
     "glTexCoord2fColor3fVertex3fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Binormal3dvEXT_names[] = 
+static const char Binormal3dvEXT_names[] =
     "p\0" /* Parameter signature */
     "glBinormal3dvEXT\0"
     "";
 #endif
 
 #if defined(need_GL_NV_fence)
-static const char FinishFenceNV_names[] = 
+static const char FinishFenceNV_names[] =
     "i\0" /* Parameter signature */
     "glFinishFenceNV\0"
     "";
 #endif
 
 #if defined(need_GL_SGIS_fog_function)
-static const char GetFogFuncSGIS_names[] = 
+static const char GetFogFuncSGIS_names[] =
     "p\0" /* Parameter signature */
     "glGetFogFuncSGIS\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char GetUniformLocationARB_names[] = 
+static const char GetUniformLocationARB_names[] =
     "ip\0" /* Parameter signature */
     "glGetUniformLocation\0"
     "glGetUniformLocationARB\0"
@@ -4062,7 +4208,7 @@ static const char GetUniformLocationARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColor3fEXT_names[] = 
+static const char SecondaryColor3fEXT_names[] =
     "fff\0" /* Parameter signature */
     "glSecondaryColor3f\0"
     "glSecondaryColor3fEXT\0"
@@ -4070,14 +4216,14 @@ static const char SecondaryColor3fEXT_names[] =
 #endif
 
 #if defined(need_GL_NV_register_combiners)
-static const char CombinerInputNV_names[] = 
+static const char CombinerInputNV_names[] =
     "iiiiii\0" /* Parameter signature */
     "glCombinerInputNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib3sARB_names[] = 
+static const char VertexAttrib3sARB_names[] =
     "iiii\0" /* Parameter signature */
     "glVertexAttrib3s\0"
     "glVertexAttrib3sARB\0"
@@ -4085,49 +4231,49 @@ static const char VertexAttrib3sARB_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char ReplacementCodeuiNormal3fVertex3fvSUN_names[] = 
+static const char ReplacementCodeuiNormal3fVertex3fvSUN_names[] =
     "ppp\0" /* Parameter signature */
     "glReplacementCodeuiNormal3fVertex3fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_program)
-static const char ProgramStringARB_names[] = 
+static const char ProgramStringARB_names[] =
     "iiip\0" /* Parameter signature */
     "glProgramStringARB\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char TexCoord4fVertex4fvSUN_names[] = 
+static const char TexCoord4fVertex4fvSUN_names[] =
     "pp\0" /* Parameter signature */
     "glTexCoord4fVertex4fvSUN\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib3sNV_names[] = 
+static const char VertexAttrib3sNV_names[] =
     "iiii\0" /* Parameter signature */
     "glVertexAttrib3sNV\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib1fNV_names[] = 
+static const char VertexAttrib1fNV_names[] =
     "if\0" /* Parameter signature */
     "glVertexAttrib1fNV\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char FragmentLightfSGIX_names[] = 
+static const char FragmentLightfSGIX_names[] =
     "iif\0" /* Parameter signature */
     "glFragmentLightfSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3) || defined(need_GL_ARB_texture_compression)
-static const char GetCompressedTexImageARB_names[] = 
+static const char GetCompressedTexImageARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetCompressedTexImage\0"
     "glGetCompressedTexImageARB\0"
@@ -4135,14 +4281,14 @@ static const char GetCompressedTexImageARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_vertex_weighting)
-static const char VertexWeightPointerEXT_names[] = 
+static const char VertexWeightPointerEXT_names[] =
     "iiip\0" /* Parameter signature */
     "glVertexWeightPointerEXT\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_histogram)
-static const char GetHistogram_names[] = 
+static const char GetHistogram_names[] =
     "iiiip\0" /* Parameter signature */
     "glGetHistogram\0"
     "glGetHistogramEXT\0"
@@ -4150,21 +4296,21 @@ static const char GetHistogram_names[] =
 #endif
 
 #if defined(need_GL_EXT_stencil_two_side)
-static const char ActiveStencilFaceEXT_names[] = 
+static const char ActiveStencilFaceEXT_names[] =
     "i\0" /* Parameter signature */
     "glActiveStencilFaceEXT\0"
     "";
 #endif
 
 #if defined(need_GL_ATI_separate_stencil)
-static const char StencilFuncSeparateATI_names[] = 
+static const char StencilFuncSeparateATI_names[] =
     "iiii\0" /* Parameter signature */
     "glStencilFuncSeparateATI\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char GetShaderSourceARB_names[] = 
+static const char GetShaderSourceARB_names[] =
     "iipp\0" /* Parameter signature */
     "glGetShaderSource\0"
     "glGetShaderSourceARB\0"
@@ -4172,28 +4318,28 @@ static const char GetShaderSourceARB_names[] =
 #endif
 
 #if defined(need_GL_SGIX_igloo_interface)
-static const char IglooInterfaceSGIX_names[] = 
+static const char IglooInterfaceSGIX_names[] =
     "ip\0" /* Parameter signature */
     "glIglooInterfaceSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib4dNV_names[] = 
+static const char VertexAttrib4dNV_names[] =
     "idddd\0" /* Parameter signature */
     "glVertexAttrib4dNV\0"
     "";
 #endif
 
 #if defined(need_GL_IBM_multimode_draw_arrays)
-static const char MultiModeDrawElementsIBM_names[] = 
+static const char MultiModeDrawElementsIBM_names[] =
     "ppipii\0" /* Parameter signature */
     "glMultiModeDrawElementsIBM\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord4svARB_names[] = 
+static const char MultiTexCoord4svARB_names[] =
     "ip\0" /* Parameter signature */
     "glMultiTexCoord4sv\0"
     "glMultiTexCoord4svARB\0"
@@ -4201,7 +4347,7 @@ static const char MultiTexCoord4svARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_occlusion_query)
-static const char GenQueriesARB_names[] = 
+static const char GenQueriesARB_names[] =
     "ip\0" /* Parameter signature */
     "glGenQueries\0"
     "glGenQueriesARB\0"
@@ -4209,35 +4355,42 @@ static const char GenQueriesARB_names[] =
 #endif
 
 #if defined(need_GL_SUN_vertex)
-static const char ReplacementCodeuiVertex3fSUN_names[] = 
+static const char ReplacementCodeuiVertex3fSUN_names[] =
     "ifff\0" /* Parameter signature */
     "glReplacementCodeuiVertex3fSUN\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Tangent3iEXT_names[] = 
+static const char Tangent3iEXT_names[] =
     "iii\0" /* Parameter signature */
     "glTangent3iEXT\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_mesh_array)
-static const char DrawMeshArraysSUN_names[] = 
+static const char DrawMeshArraysSUN_names[] =
     "iiii\0" /* Parameter signature */
     "glDrawMeshArraysSUN\0"
     "";
 #endif
 
+#if defined(need_GL_ARB_sync)
+static const char IsSync_names[] =
+    "i\0" /* Parameter signature */
+    "glIsSync\0"
+    "";
+#endif
+
 #if defined(need_GL_NV_evaluators)
-static const char GetMapControlPointsNV_names[] = 
+static const char GetMapControlPointsNV_names[] =
     "iiiiiip\0" /* Parameter signature */
     "glGetMapControlPointsNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_draw_buffers) || defined(need_GL_ATI_draw_buffers)
-static const char DrawBuffersARB_names[] = 
+static const char DrawBuffersARB_names[] =
     "ip\0" /* Parameter signature */
     "glDrawBuffers\0"
     "glDrawBuffersARB\0"
@@ -4246,21 +4399,28 @@ static const char DrawBuffersARB_names[] =
 #endif
 
 #if defined(need_GL_ARB_vertex_program)
-static const char ProgramLocalParameter4fARB_names[] = 
+static const char ProgramLocalParameter4fARB_names[] =
     "iiffff\0" /* Parameter signature */
     "glProgramLocalParameter4fARB\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_sprite)
-static const char SpriteParameterivSGIX_names[] = 
+static const char SpriteParameterivSGIX_names[] =
     "ip\0" /* Parameter signature */
     "glSpriteParameterivSGIX\0"
     "";
 #endif
 
+#if defined(need_GL_EXT_provoking_vertex)
+static const char ProvokingVertexEXT_names[] =
+    "i\0" /* Parameter signature */
+    "glProvokingVertexEXT\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord1fARB_names[] = 
+static const char MultiTexCoord1fARB_names[] =
     "if\0" /* Parameter signature */
     "glMultiTexCoord1f\0"
     "glMultiTexCoord1fARB\0"
@@ -4268,21 +4428,21 @@ static const char MultiTexCoord1fARB_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttribs4ubvNV_names[] = 
+static const char VertexAttribs4ubvNV_names[] =
     "iip\0" /* Parameter signature */
     "glVertexAttribs4ubvNV\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_blend)
-static const char WeightsvARB_names[] = 
+static const char WeightsvARB_names[] =
     "ip\0" /* Parameter signature */
     "glWeightsvARB\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_shader_objects)
-static const char Uniform1fvARB_names[] = 
+static const char Uniform1fvARB_names[] =
     "iip\0" /* Parameter signature */
     "glUniform1fv\0"
     "glUniform1fvARB\0"
@@ -4290,7 +4450,7 @@ static const char Uniform1fvARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_copy_texture)
-static const char CopyTexSubImage1D_names[] = 
+static const char CopyTexSubImage1D_names[] =
     "iiiiii\0" /* Parameter signature */
     "glCopyTexSubImage1D\0"
     "glCopyTexSubImage1DEXT\0"
@@ -4298,7 +4458,7 @@ static const char CopyTexSubImage1D_names[] =
 #endif
 
 #if defined(need_GL_EXT_texture_object)
-static const char BindTexture_names[] = 
+static const char BindTexture_names[] =
     "ii\0" /* Parameter signature */
     "glBindTexture\0"
     "glBindTextureEXT\0"
@@ -4306,14 +4466,14 @@ static const char BindTexture_names[] =
 #endif
 
 #if defined(need_GL_ATI_fragment_shader)
-static const char BeginFragmentShaderATI_names[] = 
+static const char BeginFragmentShaderATI_names[] =
     "\0" /* Parameter signature */
     "glBeginFragmentShaderATI\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord4fARB_names[] = 
+static const char MultiTexCoord4fARB_names[] =
     "iffff\0" /* Parameter signature */
     "glMultiTexCoord4f\0"
     "glMultiTexCoord4fARB\0"
@@ -4321,21 +4481,21 @@ static const char MultiTexCoord4fARB_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttribs3svNV_names[] = 
+static const char VertexAttribs3svNV_names[] =
     "iip\0" /* Parameter signature */
     "glVertexAttribs3svNV\0"
     "";
 #endif
 
 #if defined(need_GL_SUN_triangle_list)
-static const char ReplacementCodeuivSUN_names[] = 
+static const char ReplacementCodeuivSUN_names[] =
     "p\0" /* Parameter signature */
     "glReplacementCodeuivSUN\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char EnableVertexAttribArrayARB_names[] = 
+static const char EnableVertexAttribArrayARB_names[] =
     "i\0" /* Parameter signature */
     "glEnableVertexAttribArray\0"
     "glEnableVertexAttribArrayARB\0"
@@ -4343,14 +4503,14 @@ static const char EnableVertexAttribArrayARB_names[] =
 #endif
 
 #if defined(need_GL_INTEL_parallel_arrays)
-static const char NormalPointervINTEL_names[] = 
+static const char NormalPointervINTEL_names[] =
     "ip\0" /* Parameter signature */
     "glNormalPointervINTEL\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_convolution)
-static const char CopyConvolutionFilter2D_names[] = 
+static const char CopyConvolutionFilter2D_names[] =
     "iiiiii\0" /* Parameter signature */
     "glCopyConvolutionFilter2D\0"
     "glCopyConvolutionFilter2DEXT\0"
@@ -4358,7 +4518,7 @@ static const char CopyConvolutionFilter2D_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_window_pos) || defined(need_GL_MESA_window_pos)
-static const char WindowPos3ivMESA_names[] = 
+static const char WindowPos3ivMESA_names[] =
     "p\0" /* Parameter signature */
     "glWindowPos3iv\0"
     "glWindowPos3ivARB\0"
@@ -4366,8 +4526,15 @@ static const char WindowPos3ivMESA_names[] =
     "";
 #endif
 
+#if defined(need_GL_ARB_copy_buffer)
+static const char CopyBufferSubData_names[] =
+    "iiiii\0" /* Parameter signature */
+    "glCopyBufferSubData\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_vertex_buffer_object)
-static const char IsBufferARB_names[] = 
+static const char IsBufferARB_names[] =
     "i\0" /* Parameter signature */
     "glIsBuffer\0"
     "glIsBufferARB\0"
@@ -4375,14 +4542,14 @@ static const char IsBufferARB_names[] =
 #endif
 
 #if defined(need_GL_MESA_window_pos)
-static const char WindowPos4iMESA_names[] = 
+static const char WindowPos4iMESA_names[] =
     "iiii\0" /* Parameter signature */
     "glWindowPos4iMESA\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4uivARB_names[] = 
+static const char VertexAttrib4uivARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4uiv\0"
     "glVertexAttrib4uivARB\0"
@@ -4390,35 +4557,35 @@ static const char VertexAttrib4uivARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Tangent3bvEXT_names[] = 
+static const char Tangent3bvEXT_names[] =
     "p\0" /* Parameter signature */
     "glTangent3bvEXT\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_1)
-static const char UniformMatrix3x4fv_names[] = 
+static const char UniformMatrix3x4fv_names[] =
     "iiip\0" /* Parameter signature */
     "glUniformMatrix3x4fv\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_coordinate_frame)
-static const char Binormal3fvEXT_names[] = 
-    "p\0" /* Parameter signature */
-    "glBinormal3fvEXT\0"
+#if defined(need_GL_ARB_draw_elements_base_vertex)
+static const char DrawRangeElementsBaseVertex_names[] =
+    "iiiiipi\0" /* Parameter signature */
+    "glDrawRangeElementsBaseVertex\0"
     "";
 #endif
 
 #if defined(need_GL_INTEL_parallel_arrays)
-static const char TexCoordPointervINTEL_names[] = 
+static const char TexCoordPointervINTEL_names[] =
     "iip\0" /* Parameter signature */
     "glTexCoordPointervINTEL\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_vertex_buffer_object)
-static const char DeleteBuffersARB_names[] = 
+static const char DeleteBuffersARB_names[] =
     "ip\0" /* Parameter signature */
     "glDeleteBuffers\0"
     "glDeleteBuffersARB\0"
@@ -4426,21 +4593,21 @@ static const char DeleteBuffersARB_names[] =
 #endif
 
 #if defined(need_GL_MESA_window_pos)
-static const char WindowPos4fvMESA_names[] = 
+static const char WindowPos4fvMESA_names[] =
     "p\0" /* Parameter signature */
     "glWindowPos4fvMESA\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib1sNV_names[] = 
+static const char VertexAttrib1sNV_names[] =
     "ii\0" /* Parameter signature */
     "glVertexAttrib1sNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_secondary_color)
-static const char SecondaryColor3svEXT_names[] = 
+static const char SecondaryColor3svEXT_names[] =
     "p\0" /* Parameter signature */
     "glSecondaryColor3sv\0"
     "glSecondaryColor3svEXT\0"
@@ -4448,7 +4615,7 @@ static const char SecondaryColor3svEXT_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_3) || defined(need_GL_ARB_transpose_matrix)
-static const char LoadTransposeMatrixfARB_names[] = 
+static const char LoadTransposeMatrixfARB_names[] =
     "p\0" /* Parameter signature */
     "glLoadTransposeMatrixf\0"
     "glLoadTransposeMatrixfARB\0"
@@ -4456,7 +4623,7 @@ static const char LoadTransposeMatrixfARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_vertex_array)
-static const char GetPointerv_names[] = 
+static const char GetPointerv_names[] =
     "ip\0" /* Parameter signature */
     "glGetPointerv\0"
     "glGetPointervEXT\0"
@@ -4464,21 +4631,21 @@ static const char GetPointerv_names[] =
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Tangent3bEXT_names[] = 
+static const char Tangent3bEXT_names[] =
     "iii\0" /* Parameter signature */
     "glTangent3bEXT\0"
     "";
 #endif
 
 #if defined(need_GL_NV_register_combiners)
-static const char CombinerParameterfNV_names[] = 
+static const char CombinerParameterfNV_names[] =
     "if\0" /* Parameter signature */
     "glCombinerParameterfNV\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_program) || defined(need_GL_NV_vertex_program)
-static const char BindProgramNV_names[] = 
+static const char BindProgramNV_names[] =
     "ii\0" /* Parameter signature */
     "glBindProgramARB\0"
     "glBindProgramNV\0"
@@ -4486,7 +4653,7 @@ static const char BindProgramNV_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4svARB_names[] = 
+static const char VertexAttrib4svARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4sv\0"
     "glVertexAttrib4svARB\0"
@@ -4494,28 +4661,35 @@ static const char VertexAttrib4svARB_names[] =
 #endif
 
 #if defined(need_GL_MESA_shader_debug)
-static const char CreateDebugObjectMESA_names[] = 
+static const char CreateDebugObjectMESA_names[] =
     "\0" /* Parameter signature */
     "glCreateDebugObjectMESA\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0)
-static const char GetShaderiv_names[] = 
+static const char GetShaderiv_names[] =
     "iip\0" /* Parameter signature */
     "glGetShaderiv\0"
     "";
 #endif
 
+#if defined(need_GL_ARB_sync)
+static const char ClientWaitSync_names[] =
+    "iii\0" /* Parameter signature */
+    "glClientWaitSync\0"
+    "";
+#endif
+
 #if defined(need_GL_ATI_fragment_shader)
-static const char BindFragmentShaderATI_names[] = 
+static const char BindFragmentShaderATI_names[] =
     "i\0" /* Parameter signature */
     "glBindFragmentShaderATI\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_vertex_buffer_object)
-static const char UnmapBufferARB_names[] = 
+static const char UnmapBufferARB_names[] =
     "i\0" /* Parameter signature */
     "glUnmapBuffer\0"
     "glUnmapBufferARB\0"
@@ -4523,15 +4697,22 @@ static const char UnmapBufferARB_names[] =
 #endif
 
 #if defined(need_GL_EXT_histogram)
-static const char Minmax_names[] = 
+static const char Minmax_names[] =
     "iii\0" /* Parameter signature */
     "glMinmax\0"
     "glMinmaxEXT\0"
     "";
 #endif
 
+#if defined(need_GL_SGIX_polynomial_ffd)
+static const char DeformationMap3dSGIX_names[] =
+    "iddiiddiiddiip\0" /* Parameter signature */
+    "glDeformationMap3dSGIX\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_fog_coord)
-static const char FogCoorddvEXT_names[] = 
+static const char FogCoorddvEXT_names[] =
     "p\0" /* Parameter signature */
     "glFogCoorddv\0"
     "glFogCoorddvEXT\0"
@@ -4539,35 +4720,35 @@ static const char FogCoorddvEXT_names[] =
 #endif
 
 #if defined(need_GL_SUNX_constant_data)
-static const char FinishTextureSUNX_names[] = 
+static const char FinishTextureSUNX_names[] =
     "\0" /* Parameter signature */
     "glFinishTextureSUNX\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char GetFragmentLightfvSGIX_names[] = 
+static const char GetFragmentLightfvSGIX_names[] =
     "iip\0" /* Parameter signature */
     "glGetFragmentLightfvSGIX\0"
     "";
 #endif
 
-#if defined(need_GL_NV_register_combiners)
-static const char GetFinalCombinerInputParameterfvNV_names[] = 
-    "iip\0" /* Parameter signature */
-    "glGetFinalCombinerInputParameterfvNV\0"
+#if defined(need_GL_EXT_coordinate_frame)
+static const char Binormal3fvEXT_names[] =
+    "p\0" /* Parameter signature */
+    "glBinormal3fvEXT\0"
     "";
 #endif
 
 #if defined(need_GL_ATI_fragment_shader)
-static const char ColorFragmentOp3ATI_names[] = 
+static const char ColorFragmentOp3ATI_names[] =
     "iiiiiiiiiiiii\0" /* Parameter signature */
     "glColorFragmentOp3ATI\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib2svARB_names[] = 
+static const char VertexAttrib2svARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib2sv\0"
     "glVertexAttrib2svARB\0"
@@ -4575,14 +4756,14 @@ static const char VertexAttrib2svARB_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char AreProgramsResidentNV_names[] = 
+static const char AreProgramsResidentNV_names[] =
     "ipp\0" /* Parameter signature */
     "glAreProgramsResidentNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_window_pos) || defined(need_GL_MESA_window_pos)
-static const char WindowPos3svMESA_names[] = 
+static const char WindowPos3svMESA_names[] =
     "p\0" /* Parameter signature */
     "glWindowPos3sv\0"
     "glWindowPos3svARB\0"
@@ -4591,7 +4772,7 @@ static const char WindowPos3svMESA_names[] =
 #endif
 
 #if defined(need_GL_EXT_color_subtable)
-static const char CopyColorSubTable_names[] = 
+static const char CopyColorSubTable_names[] =
     "iiiii\0" /* Parameter signature */
     "glCopyColorSubTable\0"
     "glCopyColorSubTableEXT\0"
@@ -4599,21 +4780,22 @@ static const char CopyColorSubTable_names[] =
 #endif
 
 #if defined(need_GL_ARB_vertex_blend)
-static const char WeightdvARB_names[] = 
+static const char WeightdvARB_names[] =
     "ip\0" /* Parameter signature */
     "glWeightdvARB\0"
     "";
 #endif
 
-#if defined(need_GL_SGIX_instruments)
-static const char PollInstrumentsSGIX_names[] = 
-    "p\0" /* Parameter signature */
-    "glPollInstrumentsSGIX\0"
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char DeleteRenderbuffersEXT_names[] =
+    "ip\0" /* Parameter signature */
+    "glDeleteRenderbuffers\0"
+    "glDeleteRenderbuffersEXT\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib4NubvARB_names[] = 
+static const char VertexAttrib4NubvARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4Nubv\0"
     "glVertexAttrib4NubvARB\0"
@@ -4621,92 +4803,93 @@ static const char VertexAttrib4NubvARB_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib3dvNV_names[] = 
+static const char VertexAttrib3dvNV_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib3dvNV\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_shader_objects)
-static const char GetObjectParameterfvARB_names[] = 
+static const char GetObjectParameterfvARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetObjectParameterfvARB\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_program)
-static const char GetProgramEnvParameterdvARB_names[] = 
+static const char GetProgramEnvParameterdvARB_names[] =
     "iip\0" /* Parameter signature */
     "glGetProgramEnvParameterdvARB\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_compiled_vertex_array)
-static const char LockArraysEXT_names[] = 
+static const char LockArraysEXT_names[] =
     "ii\0" /* Parameter signature */
     "glLockArraysEXT\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_pixel_transform)
-static const char PixelTransformParameterivEXT_names[] = 
+static const char PixelTransformParameterivEXT_names[] =
     "iip\0" /* Parameter signature */
     "glPixelTransformParameterivEXT\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char BinormalPointerEXT_names[] = 
+static const char BinormalPointerEXT_names[] =
     "iip\0" /* Parameter signature */
     "glBinormalPointerEXT\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib1dNV_names[] = 
+static const char VertexAttrib1dNV_names[] =
     "id\0" /* Parameter signature */
     "glVertexAttrib1dNV\0"
     "";
 #endif
 
 #if defined(need_GL_NV_register_combiners)
-static const char GetCombinerInputParameterivNV_names[] = 
+static const char GetCombinerInputParameterivNV_names[] =
     "iiiip\0" /* Parameter signature */
     "glGetCombinerInputParameterivNV\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_3)
-static const char MultiTexCoord2fvARB_names[] = 
+static const char MultiTexCoord2fvARB_names[] =
     "ip\0" /* Parameter signature */
     "glMultiTexCoord2fv\0"
     "glMultiTexCoord2fvARB\0"
     "";
 #endif
 
-#if defined(need_GL_EXT_framebuffer_object)
-static const char GetRenderbufferParameterivEXT_names[] = 
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char GetRenderbufferParameterivEXT_names[] =
     "iip\0" /* Parameter signature */
+    "glGetRenderbufferParameteriv\0"
     "glGetRenderbufferParameterivEXT\0"
     "";
 #endif
 
 #if defined(need_GL_NV_register_combiners)
-static const char CombinerParameterivNV_names[] = 
+static const char CombinerParameterivNV_names[] =
     "ip\0" /* Parameter signature */
     "glCombinerParameterivNV\0"
     "";
 #endif
 
 #if defined(need_GL_ATI_fragment_shader)
-static const char GenFragmentShadersATI_names[] = 
+static const char GenFragmentShadersATI_names[] =
     "i\0" /* Parameter signature */
     "glGenFragmentShadersATI\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_vertex_array)
-static const char DrawArrays_names[] = 
+static const char DrawArrays_names[] =
     "iii\0" /* Parameter signature */
     "glDrawArrays\0"
     "glDrawArraysEXT\0"
@@ -4714,14 +4897,14 @@ static const char DrawArrays_names[] =
 #endif
 
 #if defined(need_GL_ARB_vertex_blend)
-static const char WeightuivARB_names[] = 
+static const char WeightuivARB_names[] =
     "ip\0" /* Parameter signature */
     "glWeightuivARB\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib2sARB_names[] = 
+static const char VertexAttrib2sARB_names[] =
     "iii\0" /* Parameter signature */
     "glVertexAttrib2s\0"
     "glVertexAttrib2sARB\0"
@@ -4729,21 +4912,28 @@ static const char VertexAttrib2sARB_names[] =
 #endif
 
 #if defined(need_GL_SGIX_async)
-static const char GenAsyncMarkersSGIX_names[] = 
+static const char GenAsyncMarkersSGIX_names[] =
     "i\0" /* Parameter signature */
     "glGenAsyncMarkersSGIX\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Tangent3svEXT_names[] = 
+static const char Tangent3svEXT_names[] =
     "p\0" /* Parameter signature */
     "glTangent3svEXT\0"
     "";
 #endif
 
+#if defined(need_GL_SGIX_list_priority)
+static const char GetListParameterivSGIX_names[] =
+    "iip\0" /* Parameter signature */
+    "glGetListParameterivSGIX\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_vertex_buffer_object)
-static const char BindBufferARB_names[] = 
+static const char BindBufferARB_names[] =
     "ii\0" /* Parameter signature */
     "glBindBuffer\0"
     "glBindBufferARB\0"
@@ -4751,35 +4941,35 @@ static const char BindBufferARB_names[] =
 #endif
 
 #if defined(need_GL_ARB_shader_objects)
-static const char GetInfoLogARB_names[] = 
+static const char GetInfoLogARB_names[] =
     "iipp\0" /* Parameter signature */
     "glGetInfoLogARB\0"
     "";
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttribs4svNV_names[] = 
+static const char VertexAttribs4svNV_names[] =
     "iip\0" /* Parameter signature */
     "glVertexAttribs4svNV\0"
     "";
 #endif
 
 #if defined(need_GL_IBM_vertex_array_lists)
-static const char EdgeFlagPointerListIBM_names[] = 
+static const char EdgeFlagPointerListIBM_names[] =
     "ipi\0" /* Parameter signature */
     "glEdgeFlagPointerListIBM\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_2_1)
-static const char UniformMatrix3x2fv_names[] = 
+static const char UniformMatrix3x2fv_names[] =
     "iiip\0" /* Parameter signature */
     "glUniformMatrix3x2fv\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_histogram)
-static const char GetMinmaxParameterfv_names[] = 
+static const char GetMinmaxParameterfv_names[] =
     "iip\0" /* Parameter signature */
     "glGetMinmaxParameterfv\0"
     "glGetMinmaxParameterfvEXT\0"
@@ -4787,7 +4977,7 @@ static const char GetMinmaxParameterfv_names[] =
 #endif
 
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
-static const char VertexAttrib1fvARB_names[] = 
+static const char VertexAttrib1fvARB_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib1fv\0"
     "glVertexAttrib1fvARB\0"
@@ -4795,7 +4985,7 @@ static const char VertexAttrib1fvARB_names[] =
 #endif
 
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_vertex_buffer_object)
-static const char GenBuffersARB_names[] = 
+static const char GenBuffersARB_names[] =
     "ip\0" /* Parameter signature */
     "glGenBuffers\0"
     "glGenBuffersARB\0"
@@ -4803,35 +4993,43 @@ static const char GenBuffersARB_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttribs1svNV_names[] = 
+static const char VertexAttribs1svNV_names[] =
     "iip\0" /* Parameter signature */
     "glVertexAttribs1svNV\0"
     "";
 #endif
 
+#if defined(need_GL_ATI_envmap_bumpmap)
+static const char GetTexBumpParameterivATI_names[] =
+    "ip\0" /* Parameter signature */
+    "glGetTexBumpParameterivATI\0"
+    "";
+#endif
+
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Binormal3bEXT_names[] = 
+static const char Binormal3bEXT_names[] =
     "iii\0" /* Parameter signature */
     "glBinormal3bEXT\0"
     "";
 #endif
 
 #if defined(need_GL_SGIX_fragment_lighting)
-static const char FragmentMaterialivSGIX_names[] = 
+static const char FragmentMaterialivSGIX_names[] =
     "iip\0" /* Parameter signature */
     "glFragmentMaterialivSGIX\0"
     "";
 #endif
 
-#if defined(need_GL_NV_vertex_array_range)
-static const char VertexArrayRangeNV_names[] = 
-    "ip\0" /* Parameter signature */
-    "glVertexArrayRangeNV\0"
+#if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
+static const char IsRenderbufferEXT_names[] =
+    "i\0" /* Parameter signature */
+    "glIsRenderbuffer\0"
+    "glIsRenderbufferEXT\0"
     "";
 #endif
 
 #if defined(need_GL_ARB_vertex_program) || defined(need_GL_NV_vertex_program)
-static const char GenProgramsNV_names[] = 
+static const char GenProgramsNV_names[] =
     "ip\0" /* Parameter signature */
     "glGenProgramsARB\0"
     "glGenProgramsNV\0"
@@ -4839,28 +5037,28 @@ static const char GenProgramsNV_names[] =
 #endif
 
 #if defined(need_GL_NV_vertex_program)
-static const char VertexAttrib4dvNV_names[] = 
+static const char VertexAttrib4dvNV_names[] =
     "ip\0" /* Parameter signature */
     "glVertexAttrib4dvNV\0"
     "";
 #endif
 
 #if defined(need_GL_ATI_fragment_shader)
-static const char EndFragmentShaderATI_names[] = 
+static const char EndFragmentShaderATI_names[] =
     "\0" /* Parameter signature */
     "glEndFragmentShaderATI\0"
     "";
 #endif
 
 #if defined(need_GL_EXT_coordinate_frame)
-static const char Binormal3iEXT_names[] = 
+static const char Binormal3iEXT_names[] =
     "iii\0" /* Parameter signature */
     "glBinormal3iEXT\0"
     "";
 #endif
 
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_ARB_window_pos) || defined(need_GL_MESA_window_pos)
-static const char WindowPos2fMESA_names[] = 
+static const char WindowPos2fMESA_names[] =
     "ff\0" /* Parameter signature */
     "glWindowPos2f\0"
     "glWindowPos2fARB\0"
@@ -4875,12 +5073,35 @@ static const struct dri_extension_function GL_3DFX_tbuffer_functions[] = {
 };
 #endif
 
+#if defined(need_GL_APPLE_flush_buffer_range)
+static const struct dri_extension_function GL_APPLE_flush_buffer_range_functions[] = {
+    { BufferParameteriAPPLE_names, BufferParameteriAPPLE_remap_index, -1 },
+    { FlushMappedBufferRangeAPPLE_names, FlushMappedBufferRangeAPPLE_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
+#if defined(need_GL_APPLE_texture_range)
+static const struct dri_extension_function GL_APPLE_texture_range_functions[] = {
+    { TextureRangeAPPLE_names, TextureRangeAPPLE_remap_index, -1 },
+    { GetTexParameterPointervAPPLE_names, GetTexParameterPointervAPPLE_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
 #if defined(need_GL_APPLE_vertex_array_object)
 static const struct dri_extension_function GL_APPLE_vertex_array_object_functions[] = {
     { DeleteVertexArraysAPPLE_names, DeleteVertexArraysAPPLE_remap_index, -1 },
     { GenVertexArraysAPPLE_names, GenVertexArraysAPPLE_remap_index, -1 },
-    { BindVertexArrayAPPLE_names, BindVertexArrayAPPLE_remap_index, -1 },
     { IsVertexArrayAPPLE_names, IsVertexArrayAPPLE_remap_index, -1 },
+    { BindVertexArrayAPPLE_names, BindVertexArrayAPPLE_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
+#if defined(need_GL_ARB_copy_buffer)
+static const struct dri_extension_function GL_ARB_copy_buffer_functions[] = {
+    { CopyBufferSubData_names, CopyBufferSubData_remap_index, -1 },
     { NULL, 0, 0 }
 };
 #endif
@@ -4892,6 +5113,49 @@ static const struct dri_extension_function GL_ARB_draw_buffers_functions[] = {
 };
 #endif
 
+#if defined(need_GL_ARB_draw_elements_base_vertex)
+static const struct dri_extension_function GL_ARB_draw_elements_base_vertex_functions[] = {
+    { DrawElementsBaseVertex_names, DrawElementsBaseVertex_remap_index, -1 },
+    { MultiDrawElementsBaseVertex_names, MultiDrawElementsBaseVertex_remap_index, -1 },
+    { DrawRangeElementsBaseVertex_names, DrawRangeElementsBaseVertex_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
+#if defined(need_GL_ARB_framebuffer_object)
+static const struct dri_extension_function GL_ARB_framebuffer_object_functions[] = {
+    { BlitFramebufferEXT_names, BlitFramebufferEXT_remap_index, -1 },
+    { FramebufferTextureLayerEXT_names, FramebufferTextureLayerEXT_remap_index, -1 },
+    { GenerateMipmapEXT_names, GenerateMipmapEXT_remap_index, -1 },
+    { RenderbufferStorageEXT_names, RenderbufferStorageEXT_remap_index, -1 },
+    { CheckFramebufferStatusEXT_names, CheckFramebufferStatusEXT_remap_index, -1 },
+    { FramebufferTexture3DEXT_names, FramebufferTexture3DEXT_remap_index, -1 },
+    { FramebufferTexture2DEXT_names, FramebufferTexture2DEXT_remap_index, -1 },
+    { RenderbufferStorageMultisample_names, RenderbufferStorageMultisample_remap_index, -1 },
+    { FramebufferRenderbufferEXT_names, FramebufferRenderbufferEXT_remap_index, -1 },
+    { FramebufferTexture1DEXT_names, FramebufferTexture1DEXT_remap_index, -1 },
+    { BindFramebufferEXT_names, BindFramebufferEXT_remap_index, -1 },
+    { GenRenderbuffersEXT_names, GenRenderbuffersEXT_remap_index, -1 },
+    { IsFramebufferEXT_names, IsFramebufferEXT_remap_index, -1 },
+    { GetFramebufferAttachmentParameterivEXT_names, GetFramebufferAttachmentParameterivEXT_remap_index, -1 },
+    { DeleteFramebuffersEXT_names, DeleteFramebuffersEXT_remap_index, -1 },
+    { GenFramebuffersEXT_names, GenFramebuffersEXT_remap_index, -1 },
+    { BindRenderbufferEXT_names, BindRenderbufferEXT_remap_index, -1 },
+    { DeleteRenderbuffersEXT_names, DeleteRenderbuffersEXT_remap_index, -1 },
+    { GetRenderbufferParameterivEXT_names, GetRenderbufferParameterivEXT_remap_index, -1 },
+    { IsRenderbufferEXT_names, IsRenderbufferEXT_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
+#if defined(need_GL_ARB_map_buffer_range)
+static const struct dri_extension_function GL_ARB_map_buffer_range_functions[] = {
+    { FlushMappedBufferRange_names, FlushMappedBufferRange_remap_index, -1 },
+    { MapBufferRange_names, MapBufferRange_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
 #if defined(need_GL_ARB_matrix_palette)
 static const struct dri_extension_function GL_ARB_matrix_palette_functions[] = {
     { MatrixIndexusvARB_names, MatrixIndexusvARB_remap_index, -1 },
@@ -4977,6 +5241,19 @@ static const struct dri_extension_function GL_ARB_shader_objects_functions[] = {
 };
 #endif
 
+#if defined(need_GL_ARB_sync)
+static const struct dri_extension_function GL_ARB_sync_functions[] = {
+    { DeleteSync_names, DeleteSync_remap_index, -1 },
+    { FenceSync_names, FenceSync_remap_index, -1 },
+    { WaitSync_names, WaitSync_remap_index, -1 },
+    { GetInteger64v_names, GetInteger64v_remap_index, -1 },
+    { GetSynciv_names, GetSynciv_remap_index, -1 },
+    { IsSync_names, IsSync_remap_index, -1 },
+    { ClientWaitSync_names, ClientWaitSync_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
 #if defined(need_GL_ARB_texture_compression)
 static const struct dri_extension_function GL_ARB_texture_compression_functions[] = {
     { CompressedTexSubImage2DARB_names, CompressedTexSubImage2DARB_remap_index, -1 },
@@ -5000,6 +5277,16 @@ static const struct dri_extension_function GL_ARB_transpose_matrix_functions[] =
 };
 #endif
 
+#if defined(need_GL_ARB_vertex_array_object)
+static const struct dri_extension_function GL_ARB_vertex_array_object_functions[] = {
+    { DeleteVertexArraysAPPLE_names, DeleteVertexArraysAPPLE_remap_index, -1 },
+    { GenVertexArrays_names, GenVertexArrays_remap_index, -1 },
+    { BindVertexArray_names, BindVertexArray_remap_index, -1 },
+    { IsVertexArrayAPPLE_names, IsVertexArrayAPPLE_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
 #if defined(need_GL_ARB_vertex_blend)
 static const struct dri_extension_function GL_ARB_vertex_blend_functions[] = {
     { WeightubvARB_names, WeightubvARB_remap_index, -1 },
@@ -5146,6 +5433,16 @@ static const struct dri_extension_function GL_ATI_draw_buffers_functions[] = {
 };
 #endif
 
+#if defined(need_GL_ATI_envmap_bumpmap)
+static const struct dri_extension_function GL_ATI_envmap_bumpmap_functions[] = {
+    { TexBumpParameterfvATI_names, TexBumpParameterfvATI_remap_index, -1 },
+    { TexBumpParameterivATI_names, TexBumpParameterivATI_remap_index, -1 },
+    { GetTexBumpParameterfvATI_names, GetTexBumpParameterfvATI_remap_index, -1 },
+    { GetTexBumpParameterivATI_names, GetTexBumpParameterivATI_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
 #if defined(need_GL_ATI_fragment_shader)
 static const struct dri_extension_function GL_ATI_fragment_shader_functions[] = {
     { ColorFragmentOp2ATI_names, ColorFragmentOp2ATI_remap_index, -1 },
@@ -5243,7 +5540,6 @@ static const struct dri_extension_function GL_EXT_coordinate_frame_functions[] =
     { Binormal3ivEXT_names, Binormal3ivEXT_remap_index, -1 },
     { Tangent3sEXT_names, Tangent3sEXT_remap_index, -1 },
     { Tangent3fvEXT_names, Tangent3fvEXT_remap_index, -1 },
-    { Binormal3fEXT_names, Binormal3fEXT_remap_index, -1 },
     { Tangent3dvEXT_names, Tangent3dvEXT_remap_index, -1 },
     { Binormal3bvEXT_names, Binormal3bvEXT_remap_index, -1 },
     { Binormal3dEXT_names, Binormal3dEXT_remap_index, -1 },
@@ -5252,11 +5548,12 @@ static const struct dri_extension_function GL_EXT_coordinate_frame_functions[] =
     { Tangent3ivEXT_names, Tangent3ivEXT_remap_index, -1 },
     { Tangent3dEXT_names, Tangent3dEXT_remap_index, -1 },
     { Binormal3svEXT_names, Binormal3svEXT_remap_index, -1 },
+    { Binormal3fEXT_names, Binormal3fEXT_remap_index, -1 },
     { Binormal3dvEXT_names, Binormal3dvEXT_remap_index, -1 },
     { Tangent3iEXT_names, Tangent3iEXT_remap_index, -1 },
     { Tangent3bvEXT_names, Tangent3bvEXT_remap_index, -1 },
-    { Binormal3fvEXT_names, Binormal3fvEXT_remap_index, -1 },
     { Tangent3bEXT_names, Tangent3bEXT_remap_index, -1 },
+    { Binormal3fvEXT_names, Binormal3fvEXT_remap_index, -1 },
     { BinormalPointerEXT_names, BinormalPointerEXT_remap_index, -1 },
     { Tangent3svEXT_names, Tangent3svEXT_remap_index, -1 },
     { Binormal3bEXT_names, Binormal3bEXT_remap_index, -1 },
@@ -5319,22 +5616,22 @@ static const struct dri_extension_function GL_EXT_framebuffer_blit_functions[] =
 #if defined(need_GL_EXT_framebuffer_object)
 static const struct dri_extension_function GL_EXT_framebuffer_object_functions[] = {
     { GenerateMipmapEXT_names, GenerateMipmapEXT_remap_index, -1 },
-    { IsRenderbufferEXT_names, IsRenderbufferEXT_remap_index, -1 },
     { RenderbufferStorageEXT_names, RenderbufferStorageEXT_remap_index, -1 },
     { CheckFramebufferStatusEXT_names, CheckFramebufferStatusEXT_remap_index, -1 },
-    { DeleteRenderbuffersEXT_names, DeleteRenderbuffersEXT_remap_index, -1 },
     { FramebufferTexture3DEXT_names, FramebufferTexture3DEXT_remap_index, -1 },
+    { FramebufferTexture2DEXT_names, FramebufferTexture2DEXT_remap_index, -1 },
     { FramebufferRenderbufferEXT_names, FramebufferRenderbufferEXT_remap_index, -1 },
     { FramebufferTexture1DEXT_names, FramebufferTexture1DEXT_remap_index, -1 },
     { BindFramebufferEXT_names, BindFramebufferEXT_remap_index, -1 },
     { GenRenderbuffersEXT_names, GenRenderbuffersEXT_remap_index, -1 },
     { IsFramebufferEXT_names, IsFramebufferEXT_remap_index, -1 },
-    { FramebufferTexture2DEXT_names, FramebufferTexture2DEXT_remap_index, -1 },
     { GetFramebufferAttachmentParameterivEXT_names, GetFramebufferAttachmentParameterivEXT_remap_index, -1 },
     { DeleteFramebuffersEXT_names, DeleteFramebuffersEXT_remap_index, -1 },
     { GenFramebuffersEXT_names, GenFramebuffersEXT_remap_index, -1 },
     { BindRenderbufferEXT_names, BindRenderbufferEXT_remap_index, -1 },
+    { DeleteRenderbuffersEXT_names, DeleteRenderbuffersEXT_remap_index, -1 },
     { GetRenderbufferParameterivEXT_names, GetRenderbufferParameterivEXT_remap_index, -1 },
+    { IsRenderbufferEXT_names, IsRenderbufferEXT_remap_index, -1 },
     { NULL, 0, 0 }
 };
 #endif
@@ -5415,8 +5712,8 @@ static const struct dri_extension_function GL_EXT_paletted_texture_functions[] =
 #if defined(need_GL_EXT_pixel_transform)
 static const struct dri_extension_function GL_EXT_pixel_transform_functions[] = {
     { PixelTransformParameterfvEXT_names, PixelTransformParameterfvEXT_remap_index, -1 },
-    { PixelTransformParameteriEXT_names, PixelTransformParameteriEXT_remap_index, -1 },
     { PixelTransformParameterfEXT_names, PixelTransformParameterfEXT_remap_index, -1 },
+    { PixelTransformParameteriEXT_names, PixelTransformParameteriEXT_remap_index, -1 },
     { PixelTransformParameterivEXT_names, PixelTransformParameterivEXT_remap_index, -1 },
     { NULL, 0, 0 }
 };
@@ -5437,6 +5734,13 @@ static const struct dri_extension_function GL_EXT_polygon_offset_functions[] = {
 };
 #endif
 
+#if defined(need_GL_EXT_provoking_vertex)
+static const struct dri_extension_function GL_EXT_provoking_vertex_functions[] = {
+    { ProvokingVertexEXT_names, ProvokingVertexEXT_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
 #if defined(need_GL_EXT_secondary_color)
 static const struct dri_extension_function GL_EXT_secondary_color_functions[] = {
     { SecondaryColor3iEXT_names, SecondaryColor3iEXT_remap_index, -1 },
@@ -5691,6 +5995,7 @@ static const struct dri_extension_function GL_NV_point_sprite_functions[] = {
 static const struct dri_extension_function GL_NV_register_combiners_functions[] = {
     { CombinerOutputNV_names, CombinerOutputNV_remap_index, -1 },
     { CombinerParameterfvNV_names, CombinerParameterfvNV_remap_index, -1 },
+    { GetFinalCombinerInputParameterfvNV_names, GetFinalCombinerInputParameterfvNV_remap_index, -1 },
     { GetCombinerOutputParameterfvNV_names, GetCombinerOutputParameterfvNV_remap_index, -1 },
     { FinalCombinerInputNV_names, FinalCombinerInputNV_remap_index, -1 },
     { GetCombinerInputParameterfvNV_names, GetCombinerInputParameterfvNV_remap_index, -1 },
@@ -5699,7 +6004,6 @@ static const struct dri_extension_function GL_NV_register_combiners_functions[]
     { GetFinalCombinerInputParameterivNV_names, GetFinalCombinerInputParameterivNV_remap_index, -1 },
     { CombinerInputNV_names, CombinerInputNV_remap_index, -1 },
     { CombinerParameterfNV_names, CombinerParameterfNV_remap_index, -1 },
-    { GetFinalCombinerInputParameterfvNV_names, GetFinalCombinerInputParameterfvNV_remap_index, -1 },
     { GetCombinerInputParameterivNV_names, GetCombinerInputParameterivNV_remap_index, -1 },
     { CombinerParameterivNV_names, CombinerParameterivNV_remap_index, -1 },
     { NULL, 0, 0 }
@@ -5716,8 +6020,8 @@ static const struct dri_extension_function GL_NV_register_combiners2_functions[]
 
 #if defined(need_GL_NV_vertex_array_range)
 static const struct dri_extension_function GL_NV_vertex_array_range_functions[] = {
-    { FlushVertexArrayRangeNV_names, FlushVertexArrayRangeNV_remap_index, -1 },
     { VertexArrayRangeNV_names, VertexArrayRangeNV_remap_index, -1 },
+    { FlushVertexArrayRangeNV_names, FlushVertexArrayRangeNV_remap_index, -1 },
     { NULL, 0, 0 }
 };
 #endif
@@ -5726,6 +6030,7 @@ static const struct dri_extension_function GL_NV_vertex_array_range_functions[]
 static const struct dri_extension_function GL_NV_vertex_program_functions[] = {
     { VertexAttrib4ubvNV_names, VertexAttrib4ubvNV_remap_index, -1 },
     { VertexAttrib4svNV_names, VertexAttrib4svNV_remap_index, -1 },
+    { VertexAttribs3fvNV_names, VertexAttribs3fvNV_remap_index, -1 },
     { VertexAttribs1dvNV_names, VertexAttribs1dvNV_remap_index, -1 },
     { VertexAttrib1fvNV_names, VertexAttrib1fvNV_remap_index, -1 },
     { VertexAttrib4fNV_names, VertexAttrib4fNV_remap_index, -1 },
@@ -5734,7 +6039,6 @@ static const struct dri_extension_function GL_NV_vertex_program_functions[] = {
     { VertexAttribs3dvNV_names, VertexAttribs3dvNV_remap_index, -1 },
     { VertexAttribs4fvNV_names, VertexAttribs4fvNV_remap_index, -1 },
     { VertexAttrib2sNV_names, VertexAttrib2sNV_remap_index, -1 },
-    { VertexAttribs3fvNV_names, VertexAttribs3fvNV_remap_index, -1 },
     { ProgramEnvParameter4fvARB_names, ProgramEnvParameter4fvARB_remap_index, -1 },
     { LoadProgramNV_names, LoadProgramNV_remap_index, -1 },
     { VertexAttrib4fvNV_names, VertexAttrib4fvNV_remap_index, -1 },
@@ -5934,11 +6238,11 @@ static const struct dri_extension_function GL_SGIX_igloo_interface_functions[] =
 #if defined(need_GL_SGIX_instruments)
 static const struct dri_extension_function GL_SGIX_instruments_functions[] = {
     { ReadInstrumentsSGIX_names, ReadInstrumentsSGIX_remap_index, -1 },
+    { PollInstrumentsSGIX_names, PollInstrumentsSGIX_remap_index, -1 },
     { GetInstrumentsSGIX_names, GetInstrumentsSGIX_remap_index, -1 },
     { StartInstrumentsSGIX_names, StartInstrumentsSGIX_remap_index, -1 },
     { StopInstrumentsSGIX_names, StopInstrumentsSGIX_remap_index, -1 },
     { InstrumentsBufferSGIX_names, InstrumentsBufferSGIX_remap_index, -1 },
-    { PollInstrumentsSGIX_names, PollInstrumentsSGIX_remap_index, -1 },
     { NULL, 0, 0 }
 };
 #endif
@@ -5946,11 +6250,11 @@ static const struct dri_extension_function GL_SGIX_instruments_functions[] = {
 #if defined(need_GL_SGIX_list_priority)
 static const struct dri_extension_function GL_SGIX_list_priority_functions[] = {
     { ListParameterfSGIX_names, ListParameterfSGIX_remap_index, -1 },
-    { GetListParameterivSGIX_names, GetListParameterivSGIX_remap_index, -1 },
     { GetListParameterfvSGIX_names, GetListParameterfvSGIX_remap_index, -1 },
     { ListParameteriSGIX_names, ListParameteriSGIX_remap_index, -1 },
     { ListParameterfvSGIX_names, ListParameterfvSGIX_remap_index, -1 },
     { ListParameterivSGIX_names, ListParameterivSGIX_remap_index, -1 },
+    { GetListParameterivSGIX_names, GetListParameterivSGIX_remap_index, -1 },
     { NULL, 0, 0 }
 };
 #endif
@@ -5965,9 +6269,9 @@ static const struct dri_extension_function GL_SGIX_pixel_texture_functions[] = {
 #if defined(need_GL_SGIX_polynomial_ffd)
 static const struct dri_extension_function GL_SGIX_polynomial_ffd_functions[] = {
     { LoadIdentityDeformationMapSGIX_names, LoadIdentityDeformationMapSGIX_remap_index, -1 },
-    { DeformationMap3dSGIX_names, DeformationMap3dSGIX_remap_index, -1 },
     { DeformSGIX_names, DeformSGIX_remap_index, -1 },
     { DeformationMap3fSGIX_names, DeformationMap3fSGIX_remap_index, -1 },
+    { DeformationMap3dSGIX_names, DeformationMap3dSGIX_remap_index, -1 },
     { NULL, 0, 0 }
 };
 #endif
@@ -6335,3 +6639,463 @@ static const struct dri_extension_function GL_VERSION_2_1_functions[] = {
 };
 #endif
 
+#else /* IN_DRI_DRIVER */
+
+#if defined(need_GL_3DFX_tbuffer)
+#define GL_3DFX_tbuffer_functions NULL
+#endif
+
+#if defined(need_GL_APPLE_flush_buffer_range)
+#define GL_APPLE_flush_buffer_range_functions NULL
+#endif
+
+#if defined(need_GL_APPLE_texture_range)
+#define GL_APPLE_texture_range_functions NULL
+#endif
+
+#if defined(need_GL_APPLE_vertex_array_object)
+#define GL_APPLE_vertex_array_object_functions NULL
+#endif
+
+#if defined(need_GL_ARB_copy_buffer)
+#define GL_ARB_copy_buffer_functions NULL
+#endif
+
+#if defined(need_GL_ARB_draw_buffers)
+#define GL_ARB_draw_buffers_functions NULL
+#endif
+
+#if defined(need_GL_ARB_draw_elements_base_vertex)
+#define GL_ARB_draw_elements_base_vertex_functions NULL
+#endif
+
+#if defined(need_GL_ARB_framebuffer_object)
+#define GL_ARB_framebuffer_object_functions NULL
+#endif
+
+#if defined(need_GL_ARB_map_buffer_range)
+#define GL_ARB_map_buffer_range_functions NULL
+#endif
+
+#if defined(need_GL_ARB_matrix_palette)
+#define GL_ARB_matrix_palette_functions NULL
+#endif
+
+#if defined(need_GL_ARB_multisample)
+#define GL_ARB_multisample_functions NULL
+#endif
+
+#if defined(need_GL_ARB_occlusion_query)
+#define GL_ARB_occlusion_query_functions NULL
+#endif
+
+#if defined(need_GL_ARB_point_parameters)
+#define GL_ARB_point_parameters_functions NULL
+#endif
+
+#if defined(need_GL_ARB_shader_objects)
+#define GL_ARB_shader_objects_functions NULL
+#endif
+
+#if defined(need_GL_ARB_sync)
+#define GL_ARB_sync_functions NULL
+#endif
+
+#if defined(need_GL_ARB_texture_compression)
+#define GL_ARB_texture_compression_functions NULL
+#endif
+
+#if defined(need_GL_ARB_transpose_matrix)
+#define GL_ARB_transpose_matrix_functions NULL
+#endif
+
+#if defined(need_GL_ARB_vertex_array_object)
+#define GL_ARB_vertex_array_object_functions NULL
+#endif
+
+#if defined(need_GL_ARB_vertex_blend)
+#define GL_ARB_vertex_blend_functions NULL
+#endif
+
+#if defined(need_GL_ARB_vertex_buffer_object)
+#define GL_ARB_vertex_buffer_object_functions NULL
+#endif
+
+#if defined(need_GL_ARB_vertex_program)
+#define GL_ARB_vertex_program_functions NULL
+#endif
+
+#if defined(need_GL_ARB_vertex_shader)
+#define GL_ARB_vertex_shader_functions NULL
+#endif
+
+#if defined(need_GL_ARB_window_pos)
+#define GL_ARB_window_pos_functions NULL
+#endif
+
+#if defined(need_GL_ATI_blend_equation_separate)
+#define GL_ATI_blend_equation_separate_functions NULL
+#endif
+
+#if defined(need_GL_ATI_draw_buffers)
+#define GL_ATI_draw_buffers_functions NULL
+#endif
+
+#if defined(need_GL_ATI_envmap_bumpmap)
+#define GL_ATI_envmap_bumpmap_functions NULL
+#endif
+
+#if defined(need_GL_ATI_fragment_shader)
+#define GL_ATI_fragment_shader_functions NULL
+#endif
+
+#if defined(need_GL_ATI_separate_stencil)
+#define GL_ATI_separate_stencil_functions NULL
+#endif
+
+#if defined(need_GL_EXT_blend_color)
+#define GL_EXT_blend_color_functions NULL
+#endif
+
+#if defined(need_GL_EXT_blend_equation_separate)
+#define GL_EXT_blend_equation_separate_functions NULL
+#endif
+
+#if defined(need_GL_EXT_blend_func_separate)
+#define GL_EXT_blend_func_separate_functions NULL
+#endif
+
+#if defined(need_GL_EXT_blend_minmax)
+#define GL_EXT_blend_minmax_functions NULL
+#endif
+
+#if defined(need_GL_EXT_color_subtable)
+#define GL_EXT_color_subtable_functions NULL
+#endif
+
+#if defined(need_GL_EXT_compiled_vertex_array)
+#define GL_EXT_compiled_vertex_array_functions NULL
+#endif
+
+#if defined(need_GL_EXT_convolution)
+#define GL_EXT_convolution_functions NULL
+#endif
+
+#if defined(need_GL_EXT_coordinate_frame)
+#define GL_EXT_coordinate_frame_functions NULL
+#endif
+
+#if defined(need_GL_EXT_copy_texture)
+#define GL_EXT_copy_texture_functions NULL
+#endif
+
+#if defined(need_GL_EXT_cull_vertex)
+#define GL_EXT_cull_vertex_functions NULL
+#endif
+
+#if defined(need_GL_EXT_depth_bounds_test)
+#define GL_EXT_depth_bounds_test_functions NULL
+#endif
+
+#if defined(need_GL_EXT_draw_range_elements)
+#define GL_EXT_draw_range_elements_functions NULL
+#endif
+
+#if defined(need_GL_EXT_fog_coord)
+#define GL_EXT_fog_coord_functions NULL
+#endif
+
+#if defined(need_GL_EXT_framebuffer_blit)
+#define GL_EXT_framebuffer_blit_functions NULL
+#endif
+
+#if defined(need_GL_EXT_framebuffer_object)
+#define GL_EXT_framebuffer_object_functions NULL
+#endif
+
+#if defined(need_GL_EXT_gpu_program_parameters)
+#define GL_EXT_gpu_program_parameters_functions NULL
+#endif
+
+#if defined(need_GL_EXT_histogram)
+#define GL_EXT_histogram_functions NULL
+#endif
+
+#if defined(need_GL_EXT_index_func)
+#define GL_EXT_index_func_functions NULL
+#endif
+
+#if defined(need_GL_EXT_index_material)
+#define GL_EXT_index_material_functions NULL
+#endif
+
+#if defined(need_GL_EXT_light_texture)
+#define GL_EXT_light_texture_functions NULL
+#endif
+
+#if defined(need_GL_EXT_multi_draw_arrays)
+#define GL_EXT_multi_draw_arrays_functions NULL
+#endif
+
+#if defined(need_GL_EXT_multisample)
+#define GL_EXT_multisample_functions NULL
+#endif
+
+#if defined(need_GL_EXT_paletted_texture)
+#define GL_EXT_paletted_texture_functions NULL
+#endif
+
+#if defined(need_GL_EXT_pixel_transform)
+#define GL_EXT_pixel_transform_functions NULL
+#endif
+
+#if defined(need_GL_EXT_point_parameters)
+#define GL_EXT_point_parameters_functions NULL
+#endif
+
+#if defined(need_GL_EXT_polygon_offset)
+#define GL_EXT_polygon_offset_functions NULL
+#endif
+
+#if defined(need_GL_EXT_provoking_vertex)
+#define GL_EXT_provoking_vertex_functions NULL
+#endif
+
+#if defined(need_GL_EXT_secondary_color)
+#define GL_EXT_secondary_color_functions NULL
+#endif
+
+#if defined(need_GL_EXT_stencil_two_side)
+#define GL_EXT_stencil_two_side_functions NULL
+#endif
+
+#if defined(need_GL_EXT_subtexture)
+#define GL_EXT_subtexture_functions NULL
+#endif
+
+#if defined(need_GL_EXT_texture3D)
+#define GL_EXT_texture3D_functions NULL
+#endif
+
+#if defined(need_GL_EXT_texture_array)
+#define GL_EXT_texture_array_functions NULL
+#endif
+
+#if defined(need_GL_EXT_texture_object)
+#define GL_EXT_texture_object_functions NULL
+#endif
+
+#if defined(need_GL_EXT_texture_perturb_normal)
+#define GL_EXT_texture_perturb_normal_functions NULL
+#endif
+
+#if defined(need_GL_EXT_timer_query)
+#define GL_EXT_timer_query_functions NULL
+#endif
+
+#if defined(need_GL_EXT_vertex_array)
+#define GL_EXT_vertex_array_functions NULL
+#endif
+
+#if defined(need_GL_EXT_vertex_weighting)
+#define GL_EXT_vertex_weighting_functions NULL
+#endif
+
+#if defined(need_GL_HP_image_transform)
+#define GL_HP_image_transform_functions NULL
+#endif
+
+#if defined(need_GL_IBM_multimode_draw_arrays)
+#define GL_IBM_multimode_draw_arrays_functions NULL
+#endif
+
+#if defined(need_GL_IBM_vertex_array_lists)
+#define GL_IBM_vertex_array_lists_functions NULL
+#endif
+
+#if defined(need_GL_INGR_blend_func_separate)
+#define GL_INGR_blend_func_separate_functions NULL
+#endif
+
+#if defined(need_GL_INTEL_parallel_arrays)
+#define GL_INTEL_parallel_arrays_functions NULL
+#endif
+
+#if defined(need_GL_MESA_resize_buffers)
+#define GL_MESA_resize_buffers_functions NULL
+#endif
+
+#if defined(need_GL_MESA_shader_debug)
+#define GL_MESA_shader_debug_functions NULL
+#endif
+
+#if defined(need_GL_MESA_window_pos)
+#define GL_MESA_window_pos_functions NULL
+#endif
+
+#if defined(need_GL_NV_evaluators)
+#define GL_NV_evaluators_functions NULL
+#endif
+
+#if defined(need_GL_NV_fence)
+#define GL_NV_fence_functions NULL
+#endif
+
+#if defined(need_GL_NV_fragment_program)
+#define GL_NV_fragment_program_functions NULL
+#endif
+
+#if defined(need_GL_NV_point_sprite)
+#define GL_NV_point_sprite_functions NULL
+#endif
+
+#if defined(need_GL_NV_register_combiners)
+#define GL_NV_register_combiners_functions NULL
+#endif
+
+#if defined(need_GL_NV_register_combiners2)
+#define GL_NV_register_combiners2_functions NULL
+#endif
+
+#if defined(need_GL_NV_vertex_array_range)
+#define GL_NV_vertex_array_range_functions NULL
+#endif
+
+#if defined(need_GL_NV_vertex_program)
+#define GL_NV_vertex_program_functions NULL
+#endif
+
+#if defined(need_GL_PGI_misc_hints)
+#define GL_PGI_misc_hints_functions NULL
+#endif
+
+#if defined(need_GL_SGIS_detail_texture)
+#define GL_SGIS_detail_texture_functions NULL
+#endif
+
+#if defined(need_GL_SGIS_fog_function)
+#define GL_SGIS_fog_function_functions NULL
+#endif
+
+#if defined(need_GL_SGIS_multisample)
+#define GL_SGIS_multisample_functions NULL
+#endif
+
+#if defined(need_GL_SGIS_pixel_texture)
+#define GL_SGIS_pixel_texture_functions NULL
+#endif
+
+#if defined(need_GL_SGIS_point_parameters)
+#define GL_SGIS_point_parameters_functions NULL
+#endif
+
+#if defined(need_GL_SGIS_sharpen_texture)
+#define GL_SGIS_sharpen_texture_functions NULL
+#endif
+
+#if defined(need_GL_SGIS_texture4D)
+#define GL_SGIS_texture4D_functions NULL
+#endif
+
+#if defined(need_GL_SGIS_texture_color_mask)
+#define GL_SGIS_texture_color_mask_functions NULL
+#endif
+
+#if defined(need_GL_SGIS_texture_filter4)
+#define GL_SGIS_texture_filter4_functions NULL
+#endif
+
+#if defined(need_GL_SGIX_async)
+#define GL_SGIX_async_functions NULL
+#endif
+
+#if defined(need_GL_SGIX_flush_raster)
+#define GL_SGIX_flush_raster_functions NULL
+#endif
+
+#if defined(need_GL_SGIX_fragment_lighting)
+#define GL_SGIX_fragment_lighting_functions NULL
+#endif
+
+#if defined(need_GL_SGIX_framezoom)
+#define GL_SGIX_framezoom_functions NULL
+#endif
+
+#if defined(need_GL_SGIX_igloo_interface)
+#define GL_SGIX_igloo_interface_functions NULL
+#endif
+
+#if defined(need_GL_SGIX_instruments)
+#define GL_SGIX_instruments_functions NULL
+#endif
+
+#if defined(need_GL_SGIX_list_priority)
+#define GL_SGIX_list_priority_functions NULL
+#endif
+
+#if defined(need_GL_SGIX_pixel_texture)
+#define GL_SGIX_pixel_texture_functions NULL
+#endif
+
+#if defined(need_GL_SGIX_polynomial_ffd)
+#define GL_SGIX_polynomial_ffd_functions NULL
+#endif
+
+#if defined(need_GL_SGIX_reference_plane)
+#define GL_SGIX_reference_plane_functions NULL
+#endif
+
+#if defined(need_GL_SGIX_sprite)
+#define GL_SGIX_sprite_functions NULL
+#endif
+
+#if defined(need_GL_SGIX_tag_sample_buffer)
+#define GL_SGIX_tag_sample_buffer_functions NULL
+#endif
+
+#if defined(need_GL_SGI_color_table)
+#define GL_SGI_color_table_functions NULL
+#endif
+
+#if defined(need_GL_SUNX_constant_data)
+#define GL_SUNX_constant_data_functions NULL
+#endif
+
+#if defined(need_GL_SUN_global_alpha)
+#define GL_SUN_global_alpha_functions NULL
+#endif
+
+#if defined(need_GL_SUN_mesh_array)
+#define GL_SUN_mesh_array_functions NULL
+#endif
+
+#if defined(need_GL_SUN_triangle_list)
+#define GL_SUN_triangle_list_functions NULL
+#endif
+
+#if defined(need_GL_SUN_vertex)
+#define GL_SUN_vertex_functions NULL
+#endif
+
+#if defined(need_GL_VERSION_1_3)
+#define GL_VERSION_1_3_functions NULL
+#endif
+
+#if defined(need_GL_VERSION_1_4)
+#define GL_VERSION_1_4_functions NULL
+#endif
+
+#if defined(need_GL_VERSION_1_5)
+#define GL_VERSION_1_5_functions NULL
+#endif
+
+#if defined(need_GL_VERSION_2_0)
+#define GL_VERSION_2_0_functions NULL
+#endif
+
+#if defined(need_GL_VERSION_2_1)
+#define GL_VERSION_2_1_functions NULL
+#endif
+
+#endif /* IN_DRI_DRIVER */
+
diff --git a/src/mesa/drivers/dri/common/spantmp2.h b/src/mesa/drivers/dri/common/spantmp2.h
index f2868cb58a..89c815722f 100644
--- a/src/mesa/drivers/dri/common/spantmp2.h
+++ b/src/mesa/drivers/dri/common/spantmp2.h
@@ -82,6 +82,71 @@
       rgba[3] = 0xff;							\
    } while (0)
 
+#elif (SPANTMP_PIXEL_FMT == GL_BGRA)  && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_4_4_4_4_REV)
+
+/**
+ ** GL_BGRA, GL_UNSIGNED_SHORT_4_4_4_4_REV
+ **/
+
+#ifndef GET_VALUE
+#ifndef GET_PTR
+#define GET_PTR(_x, _y) (buf + (_x) * 2 + (_y) * pitch)
+#endif
+
+#define GET_VALUE(_x, _y) *(volatile GLushort *)(GET_PTR(_x, _y))
+#define PUT_VALUE(_x, _y, _v) *(volatile GLushort *)(GET_PTR(_x, _y)) = (_v)
+#endif /* GET_VALUE */
+
+#define INIT_MONO_PIXEL(p, color) \
+   p = PACK_COLOR_4444(color[3], color[0], color[1], color[2])
+
+#define WRITE_RGBA( _x, _y, r, g, b, a )				\
+   PUT_VALUE(_x, _y, PACK_COLOR_4444(a, r, g, b)) /* alpha first (A4R4G4B4, as READ_RGBA decodes); macro ends on this line */
+
+#define WRITE_PIXEL( _x, _y, p ) PUT_VALUE(_x, _y, p)
+
+#define READ_RGBA( rgba, _x, _y )					\
+   do {									\
+      GLushort p = GET_VALUE(_x, _y);					\
+      rgba[0] = ((p >> 8) & 0xf) * 0x11;				\
+      rgba[1] = ((p >> 4) & 0xf) * 0x11;				\
+      rgba[2] = ((p >> 0) & 0xf) * 0x11;				\
+      rgba[3] = ((p >> 12) & 0xf) * 0x11;				\
+   } while (0)
+
+
+#elif (SPANTMP_PIXEL_FMT == GL_BGRA)  && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_SHORT_1_5_5_5_REV)
+
+/**
+ ** GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV
+ **/
+
+#ifndef GET_VALUE
+#ifndef GET_PTR
+#define GET_PTR(_x, _y) (buf + (_x) * 2 + (_y) * pitch)
+#endif
+
+#define GET_VALUE(_x, _y) *(volatile GLushort *)(GET_PTR(_x, _y))
+#define PUT_VALUE(_x, _y, _v) *(volatile GLushort *)(GET_PTR(_x, _y)) = (_v)
+#endif /* GET_VALUE */
+
+#define INIT_MONO_PIXEL(p, color) \
+   p = PACK_COLOR_1555(color[3], color[0], color[1], color[2])
+
+#define WRITE_RGBA( _x, _y, r, g, b, a )				\
+   PUT_VALUE(_x, _y, PACK_COLOR_1555(a, r, g, b)) /* alpha first (A1R5G5B5, as READ_RGBA decodes); macro ends on this line */
+
+#define WRITE_PIXEL( _x, _y, p ) PUT_VALUE(_x, _y, p)
+
+#define READ_RGBA( rgba, _x, _y )					\
+   do {									\
+      GLushort p = GET_VALUE(_x, _y);					\
+      rgba[0] = ((p >> 7) & 0xf8) * 255 / 0xf8;				\
+      rgba[1] = ((p >> 2) & 0xf8) * 255 / 0xf8;				\
+      rgba[2] = ((p << 3) & 0xf8) * 255 / 0xf8;				\
+      rgba[3] = ((p >> 15) & 0x1) * 0xff;				\
+   } while (0)
+
 #elif (SPANTMP_PIXEL_FMT == GL_BGRA) && (SPANTMP_PIXEL_TYPE == GL_UNSIGNED_INT_8_8_8_8_REV)
 
 /**
diff --git a/src/mesa/drivers/dri/common/texmem.c b/src/mesa/drivers/dri/common/texmem.c
index ff174a251d..b64618a03c 100644
--- a/src/mesa/drivers/dri/common/texmem.c
+++ b/src/mesa/drivers/dri/common/texmem.c
@@ -1063,31 +1063,31 @@ void driInitTextureObjects( GLcontext *ctx, driTextureObject * swapped,
       ctx->Texture.CurrentUnit = i;
 
       if ( (targets & DRI_TEXMGR_DO_TEXTURE_1D) != 0 ) {
-	 texObj = ctx->Texture.Unit[i].Current1D;
+	 texObj = ctx->Texture.Unit[i].CurrentTex[TEXTURE_1D_INDEX];
 	 ctx->Driver.BindTexture( ctx, GL_TEXTURE_1D, texObj );
 	 move_to_tail( swapped, (driTextureObject *) texObj->DriverData );
       }
 
       if ( (targets & DRI_TEXMGR_DO_TEXTURE_2D) != 0 ) {
-	 texObj = ctx->Texture.Unit[i].Current2D;
+	 texObj = ctx->Texture.Unit[i].CurrentTex[TEXTURE_2D_INDEX];
 	 ctx->Driver.BindTexture( ctx, GL_TEXTURE_2D, texObj );
 	 move_to_tail( swapped, (driTextureObject *) texObj->DriverData );
       }
 
       if ( (targets & DRI_TEXMGR_DO_TEXTURE_3D) != 0 ) {
-	 texObj = ctx->Texture.Unit[i].Current3D;
+	 texObj = ctx->Texture.Unit[i].CurrentTex[TEXTURE_3D_INDEX];
 	 ctx->Driver.BindTexture( ctx, GL_TEXTURE_3D, texObj );
 	 move_to_tail( swapped, (driTextureObject *) texObj->DriverData );
       }
 
       if ( (targets & DRI_TEXMGR_DO_TEXTURE_CUBE) != 0 ) {
-	 texObj = ctx->Texture.Unit[i].CurrentCubeMap;
+	 texObj = ctx->Texture.Unit[i].CurrentTex[TEXTURE_CUBE_INDEX];
 	 ctx->Driver.BindTexture( ctx, GL_TEXTURE_CUBE_MAP_ARB, texObj );
 	 move_to_tail( swapped, (driTextureObject *) texObj->DriverData );
       }
 
       if ( (targets & DRI_TEXMGR_DO_TEXTURE_RECT) != 0 ) {
-	 texObj = ctx->Texture.Unit[i].CurrentRect;
+	 texObj = ctx->Texture.Unit[i].CurrentTex[TEXTURE_RECT_INDEX];
 	 ctx->Driver.BindTexture( ctx, GL_TEXTURE_RECTANGLE_NV, texObj );
 	 move_to_tail( swapped, (driTextureObject *) texObj->DriverData );
       }
diff --git a/src/mesa/drivers/dri/common/utils.c b/src/mesa/drivers/dri/common/utils.c
index 30c860b96c..6f4a4f7f22 100644
--- a/src/mesa/drivers/dri/common/utils.c
+++ b/src/mesa/drivers/dri/common/utils.c
@@ -32,19 +32,22 @@
 #include <string.h>
 #include <stdlib.h>
 #include "main/mtypes.h"
+#include "main/cpuinfo.h"
 #include "main/extensions.h"
 #include "glapi/dispatch.h"
 #include "utils.h"
 
+
+#ifdef IN_DRI_DRIVER
+
 int driDispatchRemapTable[ driDispatchRemapTable_size ];
 
-#if defined(USE_X86_ASM)
-#include "x86/common_x86_asm.h"
-#endif
+#else /* IN_DRI_DRIVER */
 
-#if defined(USE_PPC_ASM)
-#include "ppc/common_ppc_features.h"
-#endif
+#define driDispatchRemapTable_size 1
+static int driDispatchRemapTable[ driDispatchRemapTable_size ];
+
+#endif /* IN_DRI_DRIVER */
 
 unsigned
 driParseDebugString( const char * debug, 
@@ -93,12 +96,8 @@ unsigned
 driGetRendererString( char * buffer, const char * hardware_name,
 		      const char * driver_date, GLuint agp_mode )
 {
-#define MAX_INFO   4
-   const char * cpu[MAX_INFO];
-   unsigned   next = 0;
-   unsigned   i;
-   unsigned   offset;
-
+   unsigned offset;
+   char *cpu;
 
    offset = sprintf( buffer, "Mesa DRI %s %s", hardware_name, driver_date );
 
@@ -118,59 +117,10 @@ driGetRendererString( char * buffer, const char * hardware_name,
 
    /* Append any CPU-specific information.
     */
-#ifdef USE_X86_ASM
-   if ( _mesa_x86_cpu_features ) {
-      cpu[next] = " x86";
-      next++;
-   }
-# ifdef USE_MMX_ASM
-   if ( cpu_has_mmx ) {
-      cpu[next] = (cpu_has_mmxext) ? "/MMX+" : "/MMX";
-      next++;
-   }
-# endif
-# ifdef USE_3DNOW_ASM
-   if ( cpu_has_3dnow ) {
-      cpu[next] = (cpu_has_3dnowext) ? "/3DNow!+" : "/3DNow!";
-      next++;
-   }
-# endif
-# ifdef USE_SSE_ASM
-   if ( cpu_has_xmm ) {
-      cpu[next] = (cpu_has_xmm2) ? "/SSE2" : "/SSE";
-      next++;
-   }
-# endif
-
-#elif defined(USE_SPARC_ASM)
-
-   cpu[0] = " SPARC";
-   next = 1;
-
-#elif defined(USE_PPC_ASM)
-   if ( _mesa_ppc_cpu_features ) {
-      cpu[next] = (cpu_has_64) ? " PowerPC 64" : " PowerPC";
-      next++;
-   }
-
-# ifdef USE_VMX_ASM
-   if ( cpu_has_vmx ) {
-      cpu[next] = "/Altivec";
-      next++;
-   }
-# endif
-
-   if ( ! cpu_has_fpu ) {
-      cpu[next] = "/No FPU";
-      next++;
-   }
-#endif
-
-   for ( i = 0 ; i < next ; i++ ) {
-      const size_t len = strlen( cpu[i] );
-
-      strncpy( & buffer[ offset ], cpu[i], len );
-      offset += len;
+   cpu = _mesa_get_cpu_string();
+   if (cpu) {
+      offset += sprintf(buffer + offset, " %s", cpu);
+      _mesa_free(cpu);
    }
 
    return offset;
@@ -179,13 +129,18 @@ driGetRendererString( char * buffer, const char * hardware_name,
 
 
 
+#define need_GL_ARB_draw_buffers
 #define need_GL_ARB_multisample
+#define need_GL_ARB_texture_compression
 #define need_GL_ARB_transpose_matrix
+#define need_GL_ARB_vertex_buffer_object
 #define need_GL_ARB_window_pos
 #define need_GL_EXT_compiled_vertex_array
+#define need_GL_EXT_multi_draw_arrays
 #define need_GL_EXT_polygon_offset
 #define need_GL_EXT_texture_object
 #define need_GL_EXT_vertex_array
+#define need_GL_IBM_multimode_draw_arrays
 #define need_GL_MESA_window_pos
 
 /* These are needed in *all* drivers because Mesa internally implements
@@ -198,14 +153,19 @@ driGetRendererString( char * buffer, const char * hardware_name,
 #include "extension_helper.h"
 
 static const struct dri_extension all_mesa_extensions[] = {
+   { "GL_ARB_draw_buffers",          GL_ARB_draw_buffers_functions },
    { "GL_ARB_multisample",           GL_ARB_multisample_functions },
+   { "GL_ARB_texture_compression",   GL_ARB_texture_compression_functions },
    { "GL_ARB_transpose_matrix",      GL_ARB_transpose_matrix_functions },
+   { "GL_ARB_vertex_buffer_object",  GL_ARB_vertex_buffer_object_functions},
    { "GL_ARB_window_pos",            GL_ARB_window_pos_functions },
    { "GL_EXT_blend_func_separate",   GL_EXT_blend_func_separate_functions },
    { "GL_EXT_compiled_vertex_array", GL_EXT_compiled_vertex_array_functions },
+   { "GL_EXT_multi_draw_arrays",     GL_EXT_multi_draw_arrays_functions },
    { "GL_EXT_polygon_offset",        GL_EXT_polygon_offset_functions },
    { "GL_EXT_texture_object",        GL_EXT_texture_object_functions },
    { "GL_EXT_vertex_array",          GL_EXT_vertex_array_functions },
+   { "GL_IBM_multimode_draw_arrays", GL_IBM_multimode_draw_arrays_functions },
    { "GL_MESA_window_pos",           GL_MESA_window_pos_functions },
    { "GL_NV_vertex_program",         GL_NV_vertex_program_functions },
    { NULL,                           NULL }
@@ -310,8 +270,10 @@ void driInitSingleExtension( GLcontext * ctx,
 	     */
 	    offset = _glapi_add_dispatch( functions, parameter_signature );
 	    if (offset == -1) {
+#if 0 /* this causes noise with egl */
 		fprintf(stderr, "DISPATCH ERROR! _glapi_add_dispatch failed "
 			"to add %s!\n", functions[0]);
+#endif
 	    }
 	    else if (ext->functions[i].remap_index != -1) {
 		driDispatchRemapTable[ ext->functions[i].remap_index ] = 
@@ -504,6 +466,9 @@ GLboolean driClipRectToFramebuffer( const GLframebuffer *buffer,
  *                      \c GLX_SWAP_UNDEFINED_OML.  See the
  *                      GLX_OML_swap_method extension spec for more details.
  * \param num_db_modes  Number of entries in \c db_modes.
+ * \param msaa_samples  Array of MSAA sample counts. 0 represents a visual
+ *                      without a multisample buffer.
+ * \param num_msaa_modes Number of entries in \c msaa_samples.
  * \param visType       GLX visual type.  Usually either \c GLX_TRUE_COLOR or
  *                      \c GLX_DIRECT_COLOR.
  * 
@@ -523,7 +488,8 @@ __DRIconfig **
 driCreateConfigs(GLenum fb_format, GLenum fb_type,
 		 const uint8_t * depth_bits, const uint8_t * stencil_bits,
 		 unsigned num_depth_stencil_bits,
-		 const GLenum * db_modes, unsigned num_db_modes)
+		 const GLenum * db_modes, unsigned num_db_modes,
+		 const uint8_t * msaa_samples, unsigned num_msaa_modes)
 {
    static const uint8_t bits_table[4][4] = {
      /* R  G  B  A */
@@ -583,9 +549,7 @@ driCreateConfigs(GLenum fb_format, GLenum fb_type,
    int index;
    __DRIconfig **configs, **c;
    __GLcontextModes *modes;
-   unsigned i;
-   unsigned j;
-   unsigned k;
+   unsigned i, j, k, h;
    unsigned num_modes;
    unsigned num_accum_bits = 2;
 
@@ -658,7 +622,7 @@ driCreateConfigs(GLenum fb_format, GLenum fb_type,
 	 break;
    }
 
-   num_modes = num_depth_stencil_bits * num_db_modes * num_accum_bits;
+   num_modes = num_depth_stencil_bits * num_db_modes * num_accum_bits * num_msaa_modes;
    configs = _mesa_calloc((num_modes + 1) * sizeof *configs);
    if (configs == NULL)
        return NULL;
@@ -666,66 +630,72 @@ driCreateConfigs(GLenum fb_format, GLenum fb_type,
     c = configs;
     for ( k = 0 ; k < num_depth_stencil_bits ; k++ ) {
 	for ( i = 0 ; i < num_db_modes ; i++ ) {
-	    for ( j = 0 ; j < num_accum_bits ; j++ ) {
-		*c = _mesa_malloc (sizeof **c);
-		modes = &(*c)->modes;
-		c++;
-
-		memset(modes, 0, sizeof *modes);
-		modes->redBits   = bits[0];
-		modes->greenBits = bits[1];
-		modes->blueBits  = bits[2];
-		modes->alphaBits = bits[3];
-		modes->redMask   = masks[0];
-		modes->greenMask = masks[1];
-		modes->blueMask  = masks[2];
-		modes->alphaMask = masks[3];
-		modes->rgbBits   = modes->redBits + modes->greenBits
-		    + modes->blueBits + modes->alphaBits;
-
-		modes->accumRedBits   = 16 * j;
-		modes->accumGreenBits = 16 * j;
-		modes->accumBlueBits  = 16 * j;
-		modes->accumAlphaBits = (masks[3] != 0) ? 16 * j : 0;
-		modes->visualRating = (j == 0) ? GLX_NONE : GLX_SLOW_CONFIG;
-
-		modes->stencilBits = stencil_bits[k];
-		modes->depthBits = depth_bits[k];
-
-		modes->transparentPixel = GLX_NONE;
-		modes->transparentRed = GLX_DONT_CARE;
-		modes->transparentGreen = GLX_DONT_CARE;
-		modes->transparentBlue = GLX_DONT_CARE;
-		modes->transparentAlpha = GLX_DONT_CARE;
-		modes->transparentIndex = GLX_DONT_CARE;
-		modes->visualType = GLX_DONT_CARE;
-		modes->renderType = GLX_RGBA_BIT;
-		modes->drawableType = GLX_WINDOW_BIT;
-		modes->rgbMode = GL_TRUE;
-
-		if ( db_modes[i] == GLX_NONE ) {
-		    modes->doubleBufferMode = GL_FALSE;
-		}
-		else {
-		    modes->doubleBufferMode = GL_TRUE;
-		    modes->swapMethod = db_modes[i];
-		}
-
-		modes->haveAccumBuffer = ((modes->accumRedBits +
+	    for ( h = 0 ; h < num_msaa_modes; h++ ) {
+	    	for ( j = 0 ; j < num_accum_bits ; j++ ) {
+		    *c = _mesa_malloc (sizeof **c);
+		    modes = &(*c)->modes;
+		    c++;
+
+		    memset(modes, 0, sizeof *modes);
+		    modes->redBits   = bits[0];
+		    modes->greenBits = bits[1];
+		    modes->blueBits  = bits[2];
+		    modes->alphaBits = bits[3];
+		    modes->redMask   = masks[0];
+		    modes->greenMask = masks[1];
+		    modes->blueMask  = masks[2];
+		    modes->alphaMask = masks[3];
+		    modes->rgbBits   = modes->redBits + modes->greenBits
+		    	+ modes->blueBits + modes->alphaBits;
+
+		    modes->accumRedBits   = 16 * j;
+		    modes->accumGreenBits = 16 * j;
+		    modes->accumBlueBits  = 16 * j;
+		    modes->accumAlphaBits = (masks[3] != 0) ? 16 * j : 0;
+		    modes->visualRating = (j == 0) ? GLX_NONE : GLX_SLOW_CONFIG;
+
+		    modes->stencilBits = stencil_bits[k];
+		    modes->depthBits = depth_bits[k];
+
+		    modes->transparentPixel = GLX_NONE;
+		    modes->transparentRed = GLX_DONT_CARE;
+		    modes->transparentGreen = GLX_DONT_CARE;
+		    modes->transparentBlue = GLX_DONT_CARE;
+		    modes->transparentAlpha = GLX_DONT_CARE;
+		    modes->transparentIndex = GLX_DONT_CARE;
+		    modes->visualType = GLX_DONT_CARE;
+		    modes->renderType = GLX_RGBA_BIT;
+		    modes->drawableType = GLX_WINDOW_BIT;
+		    modes->rgbMode = GL_TRUE;
+
+		    if ( db_modes[i] == GLX_NONE ) {
+		    	modes->doubleBufferMode = GL_FALSE;
+		    }
+		    else {
+		    	modes->doubleBufferMode = GL_TRUE;
+		    	modes->swapMethod = db_modes[i];
+		    }
+
+		    modes->samples = msaa_samples[h];
+		    modes->sampleBuffers = modes->samples ? 1 : 0;
+
+
+		    modes->haveAccumBuffer = ((modes->accumRedBits +
 					   modes->accumGreenBits +
 					   modes->accumBlueBits +
 					   modes->accumAlphaBits) > 0);
-		modes->haveDepthBuffer = (modes->depthBits > 0);
-		modes->haveStencilBuffer = (modes->stencilBits > 0);
-
-		modes->bindToTextureRgb = GL_TRUE;
-		modes->bindToTextureRgba = GL_TRUE;
-		modes->bindToMipmapTexture = GL_FALSE;
-		modes->bindToTextureTargets = modes->rgbMode ?
-		    __DRI_ATTRIB_TEXTURE_1D_BIT |
-		    __DRI_ATTRIB_TEXTURE_2D_BIT |
-		    __DRI_ATTRIB_TEXTURE_RECTANGLE_BIT :
-		    0;
+		    modes->haveDepthBuffer = (modes->depthBits > 0);
+		    modes->haveStencilBuffer = (modes->stencilBits > 0);
+
+		    modes->bindToTextureRgb = GL_TRUE;
+		    modes->bindToTextureRgba = GL_TRUE;
+		    modes->bindToMipmapTexture = GL_FALSE;
+		    modes->bindToTextureTargets = modes->rgbMode ?
+		    	__DRI_ATTRIB_TEXTURE_1D_BIT |
+		    	__DRI_ATTRIB_TEXTURE_2D_BIT |
+		    	__DRI_ATTRIB_TEXTURE_RECTANGLE_BIT :
+		    	0;
+		}
 	    }
 	}
     }
@@ -734,9 +704,10 @@ driCreateConfigs(GLenum fb_format, GLenum fb_type,
     return configs;
 }
 
-const __DRIconfig **driConcatConfigs(__DRIconfig **a, __DRIconfig **b)
+__DRIconfig **driConcatConfigs(__DRIconfig **a,
+			       __DRIconfig **b)
 {
-    const __DRIconfig **all;
+    __DRIconfig **all;
     int i, j, index;
 
     i = 0;
diff --git a/src/mesa/drivers/dri/common/utils.h b/src/mesa/drivers/dri/common/utils.h
index 0c974dbff3..9e9e5bc224 100644
--- a/src/mesa/drivers/dri/common/utils.h
+++ b/src/mesa/drivers/dri/common/utils.h
@@ -131,9 +131,11 @@ extern __DRIconfig **
 driCreateConfigs(GLenum fb_format, GLenum fb_type,
 		 const uint8_t * depth_bits, const uint8_t * stencil_bits,
 		 unsigned num_depth_stencil_bits,
-		 const GLenum * db_modes, unsigned num_db_modes);
+		 const GLenum * db_modes, unsigned num_db_modes,
+    		 const uint8_t * msaa_samples, unsigned num_msaa_modes);
 
-const __DRIconfig **driConcatConfigs(__DRIconfig **a, __DRIconfig **b);
+__DRIconfig **driConcatConfigs(__DRIconfig **a,
+			       __DRIconfig **b);
 
 int
 driGetConfigAttrib(const __DRIconfig *config,
diff --git a/src/mesa/drivers/dri/common/xmlpool.h b/src/mesa/drivers/dri/common/xmlpool.h
index 7fbc6e800d..587517ea10 100644
--- a/src/mesa/drivers/dri/common/xmlpool.h
+++ b/src/mesa/drivers/dri/common/xmlpool.h
@@ -60,6 +60,10 @@
 #define DRI_CONF_OPT_BEGIN(name,type,def) \
 "<option name=\""#name"\" type=\""#type"\" default=\""#def"\">\n"
 
+/** \brief Begin an option definition with quoted default value */
+#define DRI_CONF_OPT_BEGIN_Q(name,type,def) \
+"<option name=\""#name"\" type=\""#type"\" default="#def">\n"
+
 /** \brief Begin an option definition with restrictions on valid values */
 #define DRI_CONF_OPT_BEGIN_V(name,type,def,valid) \
 "<option name=\""#name"\" type=\""#type"\" default=\""#def"\" valid=\""valid"\">\n"
diff --git a/src/mesa/drivers/dri/common/xmlpool/options.h b/src/mesa/drivers/dri/common/xmlpool/options.h
index d5f4fc3491..d76595578c 100644
--- a/src/mesa/drivers/dri/common/xmlpool/options.h
+++ b/src/mesa/drivers/dri/common/xmlpool/options.h
@@ -546,3 +546,23 @@ DRI_CONF_OPT_BEGIN(nv_vertex_program,bool,def) \
         DRI_CONF_DESC(fr,"Activer l'extension GL_NV_vertex_program") \
         DRI_CONF_DESC(sv,"Aktivera tillägget GL_NV_vertex_program") \
 DRI_CONF_OPT_END
+
+#define DRI_CONF_ALWAYS_FLUSH_BATCH(def) \
+DRI_CONF_OPT_BEGIN(always_flush_batch,bool,def) \
+        DRI_CONF_DESC(en,"Enable flushing batchbuffer after each draw call") \
+        DRI_CONF_DESC(de,"Enable flushing batchbuffer after each draw call") \
+        DRI_CONF_DESC(es,"Enable flushing batchbuffer after each draw call") \
+        DRI_CONF_DESC(nl,"Enable flushing batchbuffer after each draw call") \
+        DRI_CONF_DESC(fr,"Enable flushing batchbuffer after each draw call") \
+        DRI_CONF_DESC(sv,"Enable flushing batchbuffer after each draw call") \
+DRI_CONF_OPT_END
+
+#define DRI_CONF_ALWAYS_FLUSH_CACHE(def) \
+DRI_CONF_OPT_BEGIN(always_flush_cache,bool,def) \
+        DRI_CONF_DESC(en,"Enable flushing GPU caches with each draw call") \
+        DRI_CONF_DESC(de,"Enable flushing GPU caches with each draw call") \
+        DRI_CONF_DESC(es,"Enable flushing GPU caches with each draw call") \
+        DRI_CONF_DESC(nl,"Enable flushing GPU caches with each draw call") \
+        DRI_CONF_DESC(fr,"Enable flushing GPU caches with each draw call") \
+        DRI_CONF_DESC(sv,"Enable flushing GPU caches with each draw call") \
+DRI_CONF_OPT_END
diff --git a/src/mesa/drivers/dri/common/xmlpool/t_options.h b/src/mesa/drivers/dri/common/xmlpool/t_options.h
index 4df1916aad..5fd6ec65bf 100644
--- a/src/mesa/drivers/dri/common/xmlpool/t_options.h
+++ b/src/mesa/drivers/dri/common/xmlpool/t_options.h
@@ -237,3 +237,13 @@ DRI_CONF_OPT_END
 DRI_CONF_OPT_BEGIN(nv_vertex_program,bool,def) \
         DRI_CONF_DESC(en,gettext("Enable extension GL_NV_vertex_program")) \
 DRI_CONF_OPT_END
+
+#define DRI_CONF_ALWAYS_FLUSH_BATCH(def) \
+DRI_CONF_OPT_BEGIN(always_flush_batch,bool,def) \
+        DRI_CONF_DESC(en,gettext("Enable flushing batchbuffer after each draw call")) \
+DRI_CONF_OPT_END
+
+#define DRI_CONF_ALWAYS_FLUSH_CACHE(def) \
+DRI_CONF_OPT_BEGIN(always_flush_cache,bool,def) \
+        DRI_CONF_DESC(en,gettext("Enable flushing GPU caches with each draw call")) \
+DRI_CONF_OPT_END
diff --git a/src/mesa/drivers/dri/fb/fb_dri.c b/src/mesa/drivers/dri/fb/fb_dri.c
index f1194d7ce8..571b8922d5 100644
--- a/src/mesa/drivers/dri/fb/fb_dri.c
+++ b/src/mesa/drivers/dri/fb/fb_dri.c
@@ -480,7 +480,7 @@ fbCreateBuffer( __DRIscreenPrivate *driScrnPriv,
 static void
 fbDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 {
-   _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
+   _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 
diff --git a/src/mesa/drivers/dri/fb/fb_egl.c b/src/mesa/drivers/dri/fb/fb_egl.c
index 35c268441c..4e41860d8c 100644
--- a/src/mesa/drivers/dri/fb/fb_egl.c
+++ b/src/mesa/drivers/dri/fb/fb_egl.c
@@ -472,8 +472,8 @@ fbCreateContext(_EGLDriver *drv, EGLDisplay dpy, EGLConfig config, EGLContext sh
    c->Base.DrawSurface = EGL_NO_SURFACE;
    c->Base.ReadSurface = EGL_NO_SURFACE;
 
-   /* generate handle and insert into hash table */
-   _eglSaveContext(&c->Base);
+   /* link to display */
+   _eglLinkContext(&c->Base, disp);
    assert(c->Base.Handle);
 
    /* Init default driver functions then plug in our FBdev-specific functions
@@ -604,13 +604,9 @@ static EGLBoolean
 fbDestroySurface(_EGLDriver *drv, EGLDisplay dpy, EGLSurface surface)
 {
    fbSurface *fs = Lookup_fbSurface(surface);
-   _eglRemoveSurface(&fs->Base);
-   if (fs->Base.IsBound) {
-      fs->Base.DeletePending = EGL_TRUE;
-   }
-   else {
+   _eglUnlinkSurface(&fs->Base);
+   if (!_eglIsSurfaceBound(&fs->Base))
       free(fs);
-   }
    return EGL_TRUE;
 }
 
@@ -619,13 +615,9 @@ static EGLBoolean
 fbDestroyContext(_EGLDriver *drv, EGLDisplay dpy, EGLContext context)
 {
    fbContext *fc = Lookup_fbContext(context);
-   _eglRemoveContext(&fc->Base);
-   if (fc->Base.IsBound) {
-      fc->Base.DeletePending = EGL_TRUE;
-   }
-   else {
+   _eglUnlinkContext(&fc->Base);
+   if (!_eglIsContextBound(&fc->Base))
       free(fc);
-   }
    return EGL_TRUE;
 }
 
@@ -688,7 +680,7 @@ fbCreateScreenSurfaceMESA(_EGLDriver *drv, EGLDisplay dpy, EGLConfig cfg,
    surface->mesa_framebuffer = _mesa_create_framebuffer(&vis);
    if (!surface->mesa_framebuffer) {
       free(surface);
-      _eglRemoveSurface(&surface->Base);
+      _eglUnlinkSurface(&surface->Base);
       return EGL_NO_SURFACE;
    }
 
diff --git a/src/mesa/drivers/dri/ffb/ffb_state.c b/src/mesa/drivers/dri/ffb/ffb_state.c
index ee0fe4e0db..5eb8f417ff 100644
--- a/src/mesa/drivers/dri/ffb/ffb_state.c
+++ b/src/mesa/drivers/dri/ffb/ffb_state.c
@@ -275,7 +275,7 @@ ffbDDStencilFuncSeparate(GLcontext *ctx, GLenum face, GLenum func,
 	/* We will properly update sw/hw state when stenciling is
 	 * enabled.
 	 */
-	if (! ctx->Stencil.Enabled)
+	if (! ctx->Stencil._Enabled)
 		return;
 
 	stencilctl = fmesa->stencilctl;
@@ -333,7 +333,7 @@ ffbDDStencilOpSeparate(GLcontext *ctx, GLenum face, GLenum fail,
 	/* We will properly update sw/hw state when stenciling is
 	 * enabled.
 	 */
-	if (! ctx->Stencil.Enabled)
+	if (! ctx->Stencil._Enabled)
 		return;
 
 	stencilctl = fmesa->stencilctl;
diff --git a/src/mesa/drivers/dri/ffb/ffb_xmesa.c b/src/mesa/drivers/dri/ffb/ffb_xmesa.c
index 679f8561d2..3b9f5c6759 100644
--- a/src/mesa/drivers/dri/ffb/ffb_xmesa.c
+++ b/src/mesa/drivers/dri/ffb/ffb_xmesa.c
@@ -260,6 +260,8 @@ ffbCreateContext(const __GLcontextModes *mesaVis,
 	ctx->Const.MaxLineWidthAA = 1.0;
 	ctx->Const.LineWidthGranularity = 1.0;
 
+	ctx->Const.MaxDrawBuffers = 1;
+
 	/* Instead of having GCC emit these constants a zillion times
 	 * everywhere in the driver, put them here.
 	 */
@@ -392,7 +394,7 @@ ffbCreateBuffer(__DRIscreenPrivate *driScrnPriv,
 static void
 ffbDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 {
-   _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
+   _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 
@@ -626,6 +628,7 @@ ffbFillInModes( __DRIscreenPrivate *psp,
 
    uint8_t depth_bits_array[3];
    uint8_t stencil_bits_array[3];
+   uint8_t msaa_samples_array[1];
 
    depth_bits_array[0] = 0;
    depth_bits_array[1] = depth_bits;
@@ -639,6 +642,8 @@ ffbFillInModes( __DRIscreenPrivate *psp,
    stencil_bits_array[1] = 0;
    stencil_bits_array[2] = (stencil_bits == 0) ? 8 : stencil_bits;
 
+   msaa_samples_array[0] = 0;
+
    depth_buffer_factor = ((depth_bits != 0) || (stencil_bits != 0)) ? 3 : 1;
    back_buffer_factor  = (have_back_buffer) ? 3 : 1;
 
@@ -654,9 +659,10 @@ ffbFillInModes( __DRIscreenPrivate *psp,
    configs = driCreateConfigs(fb_format, fb_type,
 			      depth_bits_array, stencil_bits_array,
 			      depth_buffer_factor, back_buffer_modes,
-			      back_buffer_factor);
+			      back_buffer_factor,
+                               msaa_samples_array, 1);
    if (configs == NULL) {
-    fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
+      fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
               __LINE__);
       return NULL;
    }
diff --git a/src/mesa/drivers/dri/gamma/gamma_context.c b/src/mesa/drivers/dri/gamma/gamma_context.c
index c91bedce3a..b0ac299daa 100644
--- a/src/mesa/drivers/dri/gamma/gamma_context.c
+++ b/src/mesa/drivers/dri/gamma/gamma_context.c
@@ -133,6 +133,8 @@ GLboolean gammaCreateContext( const __GLcontextModes *glVisual,
    ctx->Const.MaxPointSizeAA = 16.0; 
    ctx->Const.PointSizeGranularity = 0.25;
 
+   ctx->Const.MaxDrawBuffers = 1;
+
    gmesa->texHeap = mmInit( 0, gmesa->gammaScreen->textureSize );
 
    make_empty_list(&gmesa->TexObjList);
diff --git a/src/mesa/drivers/dri/gamma/gamma_tex.c b/src/mesa/drivers/dri/gamma/gamma_tex.c
index 2ffb790f28..97797d4788 100644
--- a/src/mesa/drivers/dri/gamma/gamma_tex.c
+++ b/src/mesa/drivers/dri/gamma/gamma_tex.c
@@ -107,9 +107,14 @@ static void gammaSetTexFilter(gammaContextPtr gmesa,
 
 static void gammaSetTexBorderColor(gammaContextPtr gmesa,
 				  gammaTextureObjectPtr t, 
-				  GLubyte color[4])
+                                  const GLfloat color[4])
 {
-    t->TextureBorderColor = PACK_COLOR_8888(color[0], color[1], color[2], color[3]);
+   GLubyte c[4];
+   CLAMPED_FLOAT_TO_UBYTE(c[0], color[0]);
+   CLAMPED_FLOAT_TO_UBYTE(c[1], color[1]);
+   CLAMPED_FLOAT_TO_UBYTE(c[2], color[2]);
+   CLAMPED_FLOAT_TO_UBYTE(c[3], color[3]);
+   t->TextureBorderColor = PACK_COLOR_8888(c[0], c[1], c[2], c[3]);
 }
 
 
@@ -143,7 +148,7 @@ static void gammaTexParameter( GLcontext *ctx, GLenum target,
       break;
   
    case GL_TEXTURE_BORDER_COLOR:
-      gammaSetTexBorderColor( gmesa, t, tObj->_BorderChan );
+      gammaSetTexBorderColor( gmesa, t, tObj->BorderColor );
       break;
 
    case GL_TEXTURE_BASE_LEVEL:
@@ -347,7 +352,7 @@ static void gammaBindTexture( GLcontext *ctx, GLenum target,
 
 	 gammaSetTexWrapping( t, tObj->WrapS, tObj->WrapT );
 	 gammaSetTexFilter( gmesa, t, tObj->MinFilter, tObj->MagFilter, bias );
-	 gammaSetTexBorderColor( gmesa, t, tObj->_BorderChan );
+	 gammaSetTexBorderColor( gmesa, t, tObj->BorderColor );
       }
 }
 
@@ -400,19 +405,19 @@ void gammaInitTextureObjects( GLcontext *ctx )
 
    ctx->Texture.CurrentUnit = 0;
 
-   texObj = ctx->Texture.Unit[0].Current1D;
+   texObj = ctx->Texture.Unit[0].CurrentTex[TEXTURE_1D_INDEX];
    gammaBindTexture( ctx, GL_TEXTURE_1D, texObj );
 
-   texObj = ctx->Texture.Unit[0].Current2D;
+   texObj = ctx->Texture.Unit[0].CurrentTex[TEXTURE_2D_INDEX];
    gammaBindTexture( ctx, GL_TEXTURE_2D, texObj );
 
 #if 0
    ctx->Texture.CurrentUnit = 1;
 
-   texObj = ctx->Texture.Unit[1].Current1D;
+   texObj = ctx->Texture.Unit[1].CurrentTex[TEXTURE_1D_INDEX];
    gammaBindTexture( ctx, GL_TEXTURE_1D, texObj );
 
-   texObj = ctx->Texture.Unit[1].Current2D;
+   texObj = ctx->Texture.Unit[1].CurrentTex[TEXTURE_2D_INDEX];
    gammaBindTexture( ctx, GL_TEXTURE_2D, texObj );
 #endif
 
diff --git a/src/mesa/drivers/dri/gamma/gamma_xmesa.c b/src/mesa/drivers/dri/gamma/gamma_xmesa.c
index 2a28902e1e..7b5b53589c 100644
--- a/src/mesa/drivers/dri/gamma/gamma_xmesa.c
+++ b/src/mesa/drivers/dri/gamma/gamma_xmesa.c
@@ -96,7 +96,7 @@ gammaCreateBuffer( __DRIscreenPrivate *driScrnPriv,
 static void
 gammaDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 {
-   _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
+   _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/glcore/Makefile b/src/mesa/drivers/dri/glcore/Makefile
new file mode 100644
index 0000000000..ac7e1de928
--- /dev/null
+++ b/src/mesa/drivers/dri/glcore/Makefile
@@ -0,0 +1,84 @@
+# src/mesa/drivers/dri/glcore/Makefile
+
+TOP = ../../../../..
+include $(TOP)/configs/current
+
+LIBNAME = glcore_dri.so
+
+DRIVER_SOURCES = glcore_driver.c \
+		 $(TOP)/src/mesa/drivers/common/driverfuncs.c \
+		 ../common/dri_util.c
+
+C_SOURCES = \
+	$(DRIVER_SOURCES) \
+	$(DRI_SOURCES) 
+
+
+# Include directories
+INCLUDE_DIRS = \
+	-I. \
+	-I../common \
+	-I../dri_client \
+	-I../dri_client/imports \
+	-Iserver \
+	-I$(TOP)/include \
+	-I$(DRM_SOURCE_PATH)/shared-core \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/src/mesa/main \
+	-I$(TOP)/src/mesa/glapi \
+	-I$(TOP)/src/mesa/math \
+	-I$(TOP)/src/mesa/transform \
+	-I$(TOP)/src/mesa/shader \
+	-I$(TOP)/src/mesa/swrast \
+	-I$(TOP)/src/mesa/swrast_setup
+
+# Core Mesa objects
+MESA_MODULES = $(TOP)/src/mesa/libmesa.a
+
+# Libraries that the driver shared lib depends on
+LIB_DEPS = -lm -lpthread -lc
+# LIB_DEPS = -lGL -lm -lpthread -lc
+
+
+ASM_SOURCES = 
+
+OBJECTS = $(C_SOURCES:.c=.o) \
+	  $(ASM_SOURCES:.S=.o) 
+
+
+##### RULES #####
+
+.c.o:
+	$(CC) -c $(INCLUDE_DIRS) $(CFLAGS) $(DEFINES) $< -o $@
+
+.S.o:
+	$(CC) -c $(INCLUDE_DIRS) $(CFLAGS) $(DEFINES)  $< -o $@
+
+
+##### TARGETS #####
+
+default: depend $(TOP)/$(LIB_DIR)/$(LIBNAME)
+
+
+$(TOP)/$(LIB_DIR)/$(LIBNAME): $(OBJECTS) $(MESA_MODULES) $(WINOBJ) Makefile
+	CC="$(CC)" CXX="$(CXX)" $(TOP)/bin/mklib -o $(LIBNAME) -noprefix -install $(TOP)/$(LIB_DIR) \
+		$(OBJECTS) $(WINLIB) $(LIB_DEPS) $(WINOBJ) $(MESA_MODULES)
+
+
+depend: $(C_SOURCES) $(ASM_SOURCES)
+	rm -f depend
+	touch depend
+	$(MKDEP) $(MKDEP_OPTIONS) $(INCLUDE_DIRS) $(C_SOURCES) $(ASM_SOURCES) \
+		> /dev/null 
+
+
+# Emacs tags
+tags:
+	etags `find . -name \*.[ch]` `find ../include`
+
+
+clean:
+	-rm -f *.o server/*.o
+
+
+include depend
diff --git a/src/mesa/drivers/dri/i810/i810context.c b/src/mesa/drivers/dri/i810/i810context.c
index c281a4990e..6785655686 100644
--- a/src/mesa/drivers/dri/i810/i810context.c
+++ b/src/mesa/drivers/dri/i810/i810context.c
@@ -63,11 +63,6 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "drirenderbuffer.h"
 #include "utils.h"
 
-#define need_GL_ARB_multisample
-#define need_GL_ARB_texture_compression
-#define need_GL_ARB_vertex_buffer_object
-#include "extension_helper.h"
-
 #ifndef I810_DEBUG
 int I810_DEBUG = (0);
 #endif
@@ -123,14 +118,11 @@ static void i810BufferSize(GLframebuffer *buffer, GLuint *width, GLuint *height)
  */
 const struct dri_extension card_extensions[] =
 {
-    { "GL_ARB_multisample",                GL_ARB_multisample_functions },
     { "GL_ARB_multitexture",               NULL },
-    { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
     { "GL_ARB_texture_env_add",            NULL },
     { "GL_ARB_texture_env_combine",        NULL },
     { "GL_ARB_texture_env_crossbar",       NULL },
     { "GL_ARB_texture_mirrored_repeat",    NULL },
-    { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
     { "GL_EXT_stencil_wrap",               NULL },
     { "GL_EXT_texture_edge_clamp",         NULL },
     { "GL_EXT_texture_env_combine",        NULL },
diff --git a/src/mesa/drivers/dri/i810/i810screen.c b/src/mesa/drivers/dri/i810/i810screen.c
index 48603f5d79..6e49f3466c 100644
--- a/src/mesa/drivers/dri/i810/i810screen.c
+++ b/src/mesa/drivers/dri/i810/i810screen.c
@@ -77,6 +77,7 @@ i810FillInModes( __DRIscreenPrivate *psp,
 
     uint8_t depth_bits_array[2];
     uint8_t stencil_bits_array[2];
+    uint8_t msaa_samples_array[1];
 
     depth_bits_array[0] = depth_bits;
     depth_bits_array[1] = depth_bits;
@@ -88,13 +89,16 @@ i810FillInModes( __DRIscreenPrivate *psp,
     stencil_bits_array[0] = 0;
     stencil_bits_array[1] = (stencil_bits == 0) ? 8 : stencil_bits;
 
+    msaa_samples_array[0] = 0;
+
     depth_buffer_factor = ((depth_bits != 0) || (stencil_bits != 0)) ? 2 : 1;
     back_buffer_factor  = (have_back_buffer) ? 2 : 1;
 
     configs = driCreateConfigs(GL_RGB, GL_UNSIGNED_SHORT_5_6_5,
 			       depth_bits_array, stencil_bits_array,
 			       depth_buffer_factor,
-			       back_buffer_modes, back_buffer_factor);
+			       back_buffer_modes, back_buffer_factor,
+                               msaa_samples_array, 1);
     if (configs == NULL) {
 	fprintf( stderr, "[%s:%u] Error creating FBConfig!\n",
 		 __func__, __LINE__ );
@@ -337,7 +341,7 @@ i810CreateBuffer( __DRIscreenPrivate *driScrnPriv,
 static void
 i810DestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 {
-    _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
+   _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 const struct __DriverAPIRec driDriverAPI = {
diff --git a/src/mesa/drivers/dri/i810/i810tex.c b/src/mesa/drivers/dri/i810/i810tex.c
index ba4e6b5b0b..cd6e1a8e6e 100644
--- a/src/mesa/drivers/dri/i810/i810tex.c
+++ b/src/mesa/drivers/dri/i810/i810tex.c
@@ -162,7 +162,7 @@ static void i810SetTexFilter(i810ContextPtr imesa,
 
 
 static void
-i810SetTexBorderColor( i810TextureObjectPtr t, GLubyte color[4] )
+i810SetTexBorderColor( i810TextureObjectPtr t, const GLfloat color[4] )
 {
    /* Need a fallback.
     */
@@ -211,7 +211,7 @@ i810AllocTexObj( GLcontext *ctx, struct gl_texture_object *texObj )
       i810SetTexWrapping( t, texObj->WrapS, texObj->WrapT );
       /*i830SetTexMaxAnisotropy( t, texObj->MaxAnisotropy );*/
       i810SetTexFilter( imesa, t, texObj->MinFilter, texObj->MagFilter, bias );
-      i810SetTexBorderColor( t, texObj->_BorderChan );
+      i810SetTexBorderColor( t, texObj->BorderColor );
    }
 
    return t;
@@ -252,7 +252,7 @@ static void i810TexParameter( GLcontext *ctx, GLenum target,
       break;
   
    case GL_TEXTURE_BORDER_COLOR:
-      i810SetTexBorderColor( t, tObj->_BorderChan );
+      i810SetTexBorderColor( t, tObj->BorderColor );
       break;
 
    case GL_TEXTURE_BASE_LEVEL:
diff --git a/src/mesa/drivers/dri/i915/Makefile b/src/mesa/drivers/dri/i915/Makefile
index 5858e0ee9f..f5242ddc7b 100644
--- a/src/mesa/drivers/dri/i915/Makefile
+++ b/src/mesa/drivers/dri/i915/Makefile
@@ -11,15 +11,17 @@ DRIVER_SOURCES = \
 	i830_metaops.c \
 	i830_state.c \
 	i830_texblend.c \
-	i830_tex.c \
 	i830_texstate.c \
 	i830_vtbl.c \
 	intel_render.c \
 	intel_regions.c \
 	intel_buffer_objects.c \
 	intel_batchbuffer.c \
+	intel_clear.c \
+	intel_eglimage.c \
+	intel_extensions.c \
+	intel_generatemipmap.c \
 	intel_mipmap_tree.c \
-	i915_tex_layout.c \
 	intel_tex_layout.c \
 	intel_tex_image.c \
 	intel_tex_subimage.c \
@@ -28,13 +30,11 @@ DRIVER_SOURCES = \
 	intel_tex_format.c \
 	intel_tex.c \
 	intel_pixel.c \
-	intel_pixel_bitmap.c \
-	intel_pixel_copy.c \
-	intel_pixel_draw.c \
 	intel_pixel_read.c \
 	intel_buffers.c \
 	intel_blit.c \
-	i915_tex.c \
+	intel_swapbuffers.c \
+	i915_tex_layout.c \
 	i915_texstate.c \
 	i915_context.c \
 	i915_debug.c \
@@ -49,9 +49,9 @@ DRIVER_SOURCES = \
 	intel_screen.c \
 	intel_span.c \
 	intel_state.c \
+	intel_syncobj.c \
 	intel_tris.c \
-	intel_fbo.c \
-	intel_depthstencil.c
+	intel_fbo.c
 
 C_SOURCES = \
 	$(COMMON_SOURCES) \
@@ -65,7 +65,7 @@ DRIVER_DEFINES = -I../intel -I../intel/server -DI915 \
 
 DRI_LIB_DEPS += -ldrm_intel
 
-include ../Makefile.template
+include ../Makefile.es
 
 intel_decode.o: ../intel/intel_decode.c
 
diff --git a/src/mesa/drivers/dri/i915/i830_context.c b/src/mesa/drivers/dri/i915/i830_context.c
index 09b1ec922f..840946f908 100644
--- a/src/mesa/drivers/dri/i915/i830_context.c
+++ b/src/mesa/drivers/dri/i915/i830_context.c
@@ -47,7 +47,6 @@ i830InitDriverFunctions(struct dd_function_table *functions)
 {
    intelInitDriverFunctions(functions);
    i830InitStateFuncs(functions);
-   i830InitTextureFuncs(functions);
 }
 
 extern const struct tnl_pipeline_stage *intel_pipeline[];
@@ -73,6 +72,8 @@ i830CreateContext(const __GLcontextModes * mesaVis,
       return GL_FALSE;
    }
 
+   _math_matrix_ctr(&intel->ViewportMatrix);
+
    /* Initialize swrast, tnl driver tables: */
    intelInitSpanFuncs(ctx);
    intelInitTriFuncs(ctx);
@@ -97,6 +98,10 @@ i830CreateContext(const __GLcontextModes * mesaVis,
    ctx->Const.MaxTextureRectSize = (1 << 11);
    ctx->Const.MaxTextureUnits = I830_TEX_UNITS;
 
+   ctx->Const.MaxTextureMaxAnisotropy = 2.0;
+
+   ctx->Const.MaxDrawBuffers = 1;
+
    _tnl_init_vertices(ctx, ctx->Const.MaxArrayLockSize + 12,
                       18 * sizeof(GLfloat));
 
diff --git a/src/mesa/drivers/dri/i915/i830_context.h b/src/mesa/drivers/dri/i915/i830_context.h
index 1bdb32049d..f73cbbf88b 100644
--- a/src/mesa/drivers/dri/i915/i830_context.h
+++ b/src/mesa/drivers/dri/i915/i830_context.h
@@ -40,6 +40,7 @@
 #define I830_UPLOAD_BUFFERS          0x2
 #define I830_UPLOAD_STIPPLE          0x4
 #define I830_UPLOAD_INVARIENT        0x8
+#define I830_UPLOAD_RASTER_RULES     0x10
 #define I830_UPLOAD_TEX(i)           (0x10<<(i))
 #define I830_UPLOAD_TEXBLEND(i)      (0x100<<(i))
 #define I830_UPLOAD_TEX_ALL          (0x0f0)
@@ -99,6 +100,11 @@
 
 #define I830_TEXBLEND_SIZE	12      /* (4 args + op) * 2 + COLOR_FACTOR */
 
+enum {
+   I830_RASTER_RULES,
+   I830_RASTER_RULES_SIZE
+};
+
 struct i830_texture_object
 {
    struct intel_texture_object intel;
@@ -112,6 +118,7 @@ struct i830_hw_state
    GLuint Ctx[I830_CTX_SETUP_SIZE];
    GLuint Buffer[I830_DEST_SETUP_SIZE];
    GLuint Stipple[I830_STP_SETUP_SIZE];
+   GLuint RasterRules[I830_RASTER_RULES_SIZE];
    GLuint Tex[I830_TEX_UNITS][I830_TEX_SETUP_SIZE];
    GLuint TexBlend[I830_TEX_UNITS][I830_TEXBLEND_SIZE];
    GLuint TexBlendWordsUsed[I830_TEX_UNITS];
@@ -197,6 +204,7 @@ extern void i830InitStateFuncs(struct dd_function_table *functions);
 extern void i830EmitState(struct i830_context *i830);
 
 extern void i830InitState(struct i830_context *i830);
+extern void i830_update_provoking_vertex(GLcontext *ctx);
 
 /* i830_metaops.c
  */
diff --git a/src/mesa/drivers/dri/i915/i830_reg.h b/src/mesa/drivers/dri/i915/i830_reg.h
index d210c2d08e..ae1317029a 100644
--- a/src/mesa/drivers/dri/i915/i830_reg.h
+++ b/src/mesa/drivers/dri/i915/i830_reg.h
@@ -48,19 +48,6 @@
 #define AA_LINE_ENABLE			((1<<1) | 1)
 #define AA_LINE_DISABLE			(1<<1)
 
-#define _3DSTATE_BUF_INFO_CMD	(CMD_3D | (0x1d<<24) | (0x8e<<16) | 1)
-/* Dword 1 */
-#define BUF_3D_ID_COLOR_BACK	(0x3<<24)
-#define BUF_3D_ID_DEPTH 	(0x7<<24)
-#define BUF_3D_USE_FENCE	(1<<23)
-#define BUF_3D_TILED_SURFACE	(1<<22)
-#define BUF_3D_TILE_WALK_X	0
-#define BUF_3D_TILE_WALK_Y	(1<<21)
-#define BUF_3D_PITCH(x)         (((x)/4)<<2)
-/* Dword 2 */
-#define BUF_3D_ADDR(x)		((x) & ~0x3)
-
-
 #define _3DSTATE_COLOR_FACTOR_CMD	(CMD_3D | (0x1d<<24) | (0x1<<16))
 
 #define _3DSTATE_COLOR_FACTOR_N_CMD(stage)	(CMD_3D | (0x1d<<24) | \
@@ -433,8 +420,11 @@
 #define ENABLE_LINE_STRIP_PROVOKE_VRTX	(1<<8)
 #define ENABLE_TRI_FAN_PROVOKE_VRTX	(1<<5)
 #define ENABLE_TRI_STRIP_PROVOKE_VRTX	(1<<2)
+#define LINE_STRIP_PROVOKE_VRTX_MASK	(3<<6)
 #define LINE_STRIP_PROVOKE_VRTX(x)	((x)<<6)
+#define TRI_FAN_PROVOKE_VRTX_MASK	(3<<3)
 #define TRI_FAN_PROVOKE_VRTX(x) 	((x)<<3)
+#define TRI_STRIP_PROVOKE_VRTX_MASK	(3<<0)
 #define TRI_STRIP_PROVOKE_VRTX(x)	(x)
 
 /* _3DSTATE_SCISSOR_ENABLE, p200 */
diff --git a/src/mesa/drivers/dri/i915/i830_state.c b/src/mesa/drivers/dri/i915/i830_state.c
index d9cad0c4bf..645ebe3057 100644
--- a/src/mesa/drivers/dri/i915/i830_state.c
+++ b/src/mesa/drivers/dri/i915/i830_state.c
@@ -39,6 +39,7 @@
 #include "intel_screen.h"
 #include "intel_batchbuffer.h"
 #include "intel_fbo.h"
+#include "intel_buffers.h"
 
 #include "i830_context.h"
 #include "i830_reg.h"
@@ -446,6 +447,24 @@ i830DepthMask(GLcontext * ctx, GLboolean flag)
       i830->state.Ctx[I830_CTXREG_ENABLES_2] |= DISABLE_DEPTH_WRITE;
 }
 
+/** Called from ctx->Driver.Viewport(); installed by i830InitStateFuncs(). */
+static void
+i830Viewport(GLcontext * ctx,
+              GLint x, GLint y, GLsizei width, GLsizei height)
+{
+   intelCalcViewport(ctx);      /* rebuilds intel->ViewportMatrix from ctx state */
+   /* x/y/width/height are not read here, only passed on unchanged */
+   intel_viewport(ctx, x, y, width, height);
+}
+
+
+/** Called from ctx->Driver.DepthRange(); nearval/farval are not read here. */
+static void
+i830DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
+{
+   intelCalcViewport(ctx);      /* rebuilds intel->ViewportMatrix from ctx state */
+}
+
 /* =============================================================
  * Polygon stipple
  *
@@ -1028,6 +1047,16 @@ i830_init_packets(struct i830_context *i830)
                                          TEXBIND_SET1(TEXCOORDSRC_VTXSET_1) |
                                          TEXBIND_SET0(TEXCOORDSRC_VTXSET_0));
 
+   i830->state.RasterRules[I830_RASTER_RULES] = (_3DSTATE_RASTER_RULES_CMD |
+						 ENABLE_POINT_RASTER_RULE |
+						 OGL_POINT_RASTER_RULE |
+						 ENABLE_LINE_STRIP_PROVOKE_VRTX |
+						 ENABLE_TRI_FAN_PROVOKE_VRTX |
+						 ENABLE_TRI_STRIP_PROVOKE_VRTX |
+						 LINE_STRIP_PROVOKE_VRTX(1) |
+						 TRI_FAN_PROVOKE_VRTX(2) |
+						 TRI_STRIP_PROVOKE_VRTX(2));
+
 
    i830->state.Stipple[I830_STPREG_ST0] = _3DSTATE_STIPPLE;
 
@@ -1039,6 +1068,27 @@ i830_init_packets(struct i830_context *i830)
    i830->state.Buffer[I830_DESTREG_SR2] = 0;
 }
 
+/** Rewrite the provoking-vertex fields of the cached RASTER_RULES dword. */
+void
+i830_update_provoking_vertex(GLcontext * ctx)
+{
+   struct i830_context *i830 = i830_context(ctx);
+   GLuint *rules = &i830->state.RasterRules[I830_RASTER_RULES];
+   GLuint select;
+
+   /* _NEW_LIGHT */
+   if (ctx->Light.ProvokingVertex == GL_LAST_VERTEX_CONVENTION)
+      select = (LINE_STRIP_PROVOKE_VRTX(1) |
+                TRI_FAN_PROVOKE_VRTX(2) | TRI_STRIP_PROVOKE_VRTX(2));
+   else
+      select = (LINE_STRIP_PROVOKE_VRTX(0) |
+                TRI_FAN_PROVOKE_VRTX(1) | TRI_STRIP_PROVOKE_VRTX(0));
+
+   I830_STATECHANGE(i830, I830_UPLOAD_RASTER_RULES);
+   *rules &= ~(LINE_STRIP_PROVOKE_VRTX_MASK |
+               TRI_FAN_PROVOKE_VRTX_MASK | TRI_STRIP_PROVOKE_VRTX_MASK);
+   *rules |= select;
+}
 
 void
 i830InitStateFuncs(struct dd_function_table *functions)
@@ -1064,6 +1114,8 @@ i830InitStateFuncs(struct dd_function_table *functions)
    functions->StencilFuncSeparate = i830StencilFuncSeparate;
    functions->StencilMaskSeparate = i830StencilMaskSeparate;
    functions->StencilOpSeparate = i830StencilOpSeparate;
+   functions->DepthRange = i830DepthRange;
+   functions->Viewport = i830Viewport;
 }
 
 void
@@ -1080,6 +1132,7 @@ i830InitState(struct i830_context *i830)
    i830->current = &i830->state;
    i830->state.emitted = 0;
    i830->state.active = (I830_UPLOAD_INVARIENT |
+                         I830_UPLOAD_RASTER_RULES |
                          I830_UPLOAD_TEXBLEND(0) |
                          I830_UPLOAD_STIPPLE |
                          I830_UPLOAD_CTX | I830_UPLOAD_BUFFERS);
diff --git a/src/mesa/drivers/dri/i915/i830_tex.c b/src/mesa/drivers/dri/i915/i830_tex.c
deleted file mode 100644
index 34ac42a78e..0000000000
--- a/src/mesa/drivers/dri/i915/i830_tex.c
+++ /dev/null
@@ -1,100 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "main/glheader.h"
-#include "main/mtypes.h"
-#include "main/imports.h"
-#include "main/simple_list.h"
-#include "main/enums.h"
-#include "main/image.h"
-#include "main/mm.h"
-#include "main/texstore.h"
-#include "main/texformat.h"
-#include "swrast/swrast.h"
-
-#include "texmem.h"
-
-#include "i830_context.h"
-#include "i830_reg.h"
-
-
-
-static void
-i830TexEnv(GLcontext * ctx, GLenum target,
-           GLenum pname, const GLfloat * param)
-{
-
-   switch (pname) {
-   case GL_TEXTURE_ENV_COLOR:
-   case GL_TEXTURE_ENV_MODE:
-   case GL_COMBINE_RGB:
-   case GL_COMBINE_ALPHA:
-   case GL_SOURCE0_RGB:
-   case GL_SOURCE1_RGB:
-   case GL_SOURCE2_RGB:
-   case GL_SOURCE0_ALPHA:
-   case GL_SOURCE1_ALPHA:
-   case GL_SOURCE2_ALPHA:
-   case GL_OPERAND0_RGB:
-   case GL_OPERAND1_RGB:
-   case GL_OPERAND2_RGB:
-   case GL_OPERAND0_ALPHA:
-   case GL_OPERAND1_ALPHA:
-   case GL_OPERAND2_ALPHA:
-   case GL_RGB_SCALE:
-   case GL_ALPHA_SCALE:
-      break;
-
-   case GL_TEXTURE_LOD_BIAS:{
-         struct i830_context *i830 = i830_context(ctx);
-         GLuint unit = ctx->Texture.CurrentUnit;
-         int b = (int) ((*param) * 16.0);
-         if (b > 63)
-            b = 63;
-         if (b < -64)
-            b = -64;
-         I830_STATECHANGE(i830, I830_UPLOAD_TEX(unit));
-         i830->lodbias_tm0s3[unit] =
-            ((b << TM0S3_LOD_BIAS_SHIFT) & TM0S3_LOD_BIAS_MASK);
-         break;
-      }
-
-   default:
-      break;
-   }
-}
-
-
-
-
-void
-i830InitTextureFuncs(struct dd_function_table *functions)
-{
-/*
-   functions->TexEnv = i830TexEnv;
-*/
-}
diff --git a/src/mesa/drivers/dri/i915/i830_texstate.c b/src/mesa/drivers/dri/i915/i830_texstate.c
index c718bb0055..6f998fa6f7 100644
--- a/src/mesa/drivers/dri/i915/i830_texstate.c
+++ b/src/mesa/drivers/dri/i915/i830_texstate.c
@@ -38,7 +38,7 @@
 
 
 static GLuint
-translate_texture_format(GLuint mesa_format)
+translate_texture_format(GLuint mesa_format, GLuint internal_format)
 {
    switch (mesa_format) {
    case MESA_FORMAT_L8:
@@ -56,7 +56,10 @@ translate_texture_format(GLuint mesa_format)
    case MESA_FORMAT_ARGB4444:
       return MAPSURF_16BIT | MT_16BIT_ARGB4444;
    case MESA_FORMAT_ARGB8888:
-      return MAPSURF_32BIT | MT_32BIT_ARGB8888;
+      if (internal_format == GL_RGB)
+	 return MAPSURF_32BIT | MT_32BIT_XRGB8888;
+      else
+	 return MAPSURF_32BIT | MT_32BIT_ARGB8888;
    case MESA_FORMAT_YCBCR_REV:
       return (MAPSURF_422 | MT_422_YCRCB_NORMAL);
    case MESA_FORMAT_YCBCR:
@@ -119,6 +122,7 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    struct gl_texture_image *firstImage;
    GLuint *state = i830->state.Tex[unit], format, pitch;
    GLint lodbias;
+   GLubyte border[4];
 
    memset(state, 0, sizeof(state));
 
@@ -162,21 +166,24 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
 								0, intelObj->
 								firstLevel);
 
-      format = translate_texture_format(firstImage->TexFormat->MesaFormat);
+      format = translate_texture_format(firstImage->TexFormat->MesaFormat,
+					firstImage->InternalFormat);
       pitch = intelObj->mt->pitch * intelObj->mt->cpp;
    }
 
    state[I830_TEXREG_TM0LI] = (_3DSTATE_LOAD_STATE_IMMEDIATE_2 |
                                (LOAD_TEXTURE_MAP0 << unit) | 4);
 
-/*    state[I830_TEXREG_TM0S0] = (TM0S0_USE_FENCE | */
-/* 			       t->intel.TextureOffset); */
-
-
    state[I830_TEXREG_TM0S1] =
       (((firstImage->Height - 1) << TM0S1_HEIGHT_SHIFT) |
        ((firstImage->Width - 1) << TM0S1_WIDTH_SHIFT) | format);
 
+   if (intelObj->mt->region->tiling != I915_TILING_NONE) {
+      state[I830_TEXREG_TM0S1] |= TM0S1_TILED_SURFACE;
+      if (intelObj->mt->region->tiling == I915_TILING_Y)
+	 state[I830_TEXREG_TM0S1] |= TM0S1_TILE_WALK;
+   }
+
    state[I830_TEXREG_TM0S2] =
       ((((pitch / 4) - 1) << TM0S2_PITCH_SHIFT) | TM0S2_CUBE_FACE_ENA_MASK);
 
@@ -290,11 +297,16 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
                                                      (ws)));
    }
 
+   /* convert border color from float to ubyte */
+   CLAMPED_FLOAT_TO_UBYTE(border[0], tObj->BorderColor[0]);
+   CLAMPED_FLOAT_TO_UBYTE(border[1], tObj->BorderColor[1]);
+   CLAMPED_FLOAT_TO_UBYTE(border[2], tObj->BorderColor[2]);
+   CLAMPED_FLOAT_TO_UBYTE(border[3], tObj->BorderColor[3]);
 
-   state[I830_TEXREG_TM0S4] = INTEL_PACKCOLOR8888(tObj->_BorderChan[0],
-                                                  tObj->_BorderChan[1],
-                                                  tObj->_BorderChan[2],
-                                                  tObj->_BorderChan[3]);
+   state[I830_TEXREG_TM0S4] = INTEL_PACKCOLOR8888(border[0],
+                                                  border[1],
+                                                  border[2],
+                                                  border[3]);
 
 
    I830_ACTIVESTATE(i830, I830_UPLOAD_TEX(unit), GL_TRUE);
diff --git a/src/mesa/drivers/dri/i915/i830_vtbl.c b/src/mesa/drivers/dri/i915/i830_vtbl.c
index 8fc8aa5f90..983f6724c9 100644
--- a/src/mesa/drivers/dri/i915/i830_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i830_vtbl.c
@@ -26,12 +26,14 @@
  **************************************************************************/
 
 #include "glapi/glapi.h"
+#include "main/texformat.h"
 
 #include "i830_context.h"
 #include "i830_reg.h"
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
 #include "intel_tris.h"
+#include "intel_fbo.h"
 #include "tnl/t_context.h"
 #include "tnl/t_vertex.h"
 
@@ -297,7 +299,7 @@ i830_emit_invarient_state(struct intel_context *intel)
 {
    BATCH_LOCALS;
 
-   BEGIN_BATCH(30, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(29, IGNORE_CLIPRECTS);
 
    OUT_BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
    OUT_BATCH(0);
@@ -349,15 +351,6 @@ i830_emit_invarient_state(struct intel_context *intel)
    OUT_BATCH(_3DSTATE_MAP_COORD_TRANSFORM);
    OUT_BATCH(DISABLE_TEX_TRANSFORM | TEXTURE_SET(3));
 
-   OUT_BATCH(_3DSTATE_RASTER_RULES_CMD |
-             ENABLE_POINT_RASTER_RULE |
-             OGL_POINT_RASTER_RULE |
-             ENABLE_LINE_STRIP_PROVOKE_VRTX |
-             ENABLE_TRI_FAN_PROVOKE_VRTX |
-             ENABLE_TRI_STRIP_PROVOKE_VRTX |
-             LINE_STRIP_PROVOKE_VRTX(1) |
-             TRI_FAN_PROVOKE_VRTX(2) | TRI_STRIP_PROVOKE_VRTX(2));
-
    OUT_BATCH(_3DSTATE_VERTEX_TRANSFORM);
    OUT_BATCH(DISABLE_VIEWPORT_TRANSFORM | DISABLE_PERSPECTIVE_DIVIDE);
 
@@ -392,6 +385,9 @@ get_state_size(struct i830_hw_state *state)
    if (dirty & I830_UPLOAD_INVARIENT)
       sz += 40 * sizeof(int);
 
+   if (dirty & I830_UPLOAD_RASTER_RULES)
+      sz += sizeof(state->RasterRules);
+
    if (dirty & I830_UPLOAD_CTX)
       sz += sizeof(state->Ctx);
 
@@ -422,10 +418,10 @@ i830_emit_state(struct intel_context *intel)
    struct i830_hw_state *state = i830->current;
    int i, count;
    GLuint dirty;
-   GET_CURRENT_CONTEXT(ctx);
-   BATCH_LOCALS;
    dri_bo *aper_array[3 + I830_TEX_UNITS];
    int aper_count;
+   GET_CURRENT_CONTEXT(ctx);
+   BATCH_LOCALS;
 
    /* We don't hold the lock at this point, so want to make sure that
     * there won't be a buffer wrap between the state emits and the primitive
@@ -484,6 +480,11 @@ i830_emit_state(struct intel_context *intel)
       i830_emit_invarient_state(intel);
    }
 
+   if (dirty & I830_UPLOAD_RASTER_RULES) {
+      DBG("I830_UPLOAD_RASTER_RULES:\n");
+      emit(intel, state->RasterRules, sizeof(state->RasterRules));
+   }
+
    if (dirty & I830_UPLOAD_CTX) {
       DBG("I830_UPLOAD_CTX:\n");
       emit(intel, state->Ctx, sizeof(state->Ctx));
@@ -550,7 +551,7 @@ i830_emit_state(struct intel_context *intel)
          if (state->tex_buffer[i]) {
             OUT_RELOC(state->tex_buffer[i],
 		      I915_GEM_DOMAIN_SAMPLER, 0,
-                      state->tex_offset[i] | TM0S0_USE_FENCE);
+                      state->tex_offset[i]);
          }
 	 else if (state == &i830->meta) {
 	    assert(i == 0);
@@ -614,6 +615,8 @@ i830_state_draw_region(struct intel_context *intel,
 {
    struct i830_context *i830 = i830_context(&intel->ctx);
    GLcontext *ctx = &intel->ctx;
+   struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
+   struct intel_renderbuffer *irb = intel_renderbuffer(rb);
    GLuint value;
 
    ASSERT(state == &i830->state || state == &i830->meta);
@@ -630,34 +633,38 @@ i830_state_draw_region(struct intel_context *intel,
    /*
     * Set stride/cpp values
     */
-   if (color_region) {
-      state->Buffer[I830_DESTREG_CBUFADDR0] = _3DSTATE_BUF_INFO_CMD;
-      state->Buffer[I830_DESTREG_CBUFADDR1] =
-         (BUF_3D_ID_COLOR_BACK |
-          BUF_3D_PITCH(color_region->pitch * color_region->cpp) |
-          BUF_3D_USE_FENCE);
-   }
+   i915_set_buf_info_for_region(&state->Buffer[I830_DESTREG_CBUFADDR0],
+				color_region, BUF_3D_ID_COLOR_BACK);
 
-   if (depth_region) {
-      state->Buffer[I830_DESTREG_DBUFADDR0] = _3DSTATE_BUF_INFO_CMD;
-      state->Buffer[I830_DESTREG_DBUFADDR1] =
-         (BUF_3D_ID_DEPTH |
-          BUF_3D_PITCH(depth_region->pitch * depth_region->cpp) |
-          BUF_3D_USE_FENCE);
-   }
+   i915_set_buf_info_for_region(&state->Buffer[I830_DESTREG_DBUFADDR0],
+				depth_region, BUF_3D_ID_DEPTH);
 
    /*
     * Compute/set I830_DESTREG_DV1 value
     */
    value = (DSTORG_HORT_BIAS(0x8) |     /* .5 */
             DSTORG_VERT_BIAS(0x8) | DEPTH_IS_Z);    /* .5 */
-            
-   if (color_region && color_region->cpp == 4) {
-      value |= DV_PF_8888;
-   }
-   else {
-      value |= DV_PF_565;
+
+   if (irb != NULL) {
+      switch (irb->texformat->MesaFormat) {
+      case MESA_FORMAT_ARGB8888:
+	 value |= DV_PF_8888;
+	 break;
+      case MESA_FORMAT_RGB565:
+	 value |= DV_PF_565;
+	 break;
+      case MESA_FORMAT_ARGB1555:
+	 value |= DV_PF_1555;
+	 break;
+      case MESA_FORMAT_ARGB4444:
+	 value |= DV_PF_4444;
+	 break;
+      default:
+	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n",
+		       irb->texformat->MesaFormat);
+      }
    }
+
    if (depth_region && depth_region->cpp == 4) {
       value |= DEPTH_FRMT_24_FIXED_8_OTHER;
    }
@@ -700,26 +707,6 @@ i830_set_draw_region(struct intel_context *intel,
    i830_state_draw_region(intel, &i830->state, color_regions[0], depth_region);
 }
 
-#if 0
-static void
-i830_update_color_z_regions(intelContextPtr intel,
-                            const intelRegion * colorRegion,
-                            const intelRegion * depthRegion)
-{
-   i830ContextPtr i830 = I830_CONTEXT(intel);
-
-   i830->state.Buffer[I830_DESTREG_CBUFADDR1] =
-      (BUF_3D_ID_COLOR_BACK | BUF_3D_PITCH(colorRegion->pitch) |
-       BUF_3D_USE_FENCE);
-   i830->state.Buffer[I830_DESTREG_CBUFADDR2] = colorRegion->offset;
-
-   i830->state.Buffer[I830_DESTREG_DBUFADDR1] =
-      (BUF_3D_ID_DEPTH | BUF_3D_PITCH(depthRegion->pitch) | BUF_3D_USE_FENCE);
-   i830->state.Buffer[I830_DESTREG_DBUFADDR2] = depthRegion->offset;
-}
-#endif
-
-
 /* This isn't really handled at the moment.
  */
 static void
@@ -750,9 +737,10 @@ i830_assert_not_dirty( struct intel_context *intel )
 }
 
 static void
-i830_note_unlock( struct intel_context *intel )
+i830_invalidate_state(struct intel_context *intel, GLuint new_state)
 {
-    /* nothing */
+   if (new_state & _NEW_LIGHT)
+      i830_update_provoking_vertex(&intel->ctx);
 }
 
 void
@@ -769,6 +757,6 @@ i830InitVtbl(struct i830_context *i830)
    i830->intel.vtbl.render_start = i830_render_start;
    i830->intel.vtbl.render_prevalidate = i830_render_prevalidate;
    i830->intel.vtbl.assert_not_dirty = i830_assert_not_dirty;
-   i830->intel.vtbl.note_unlock = i830_note_unlock; 
    i830->intel.vtbl.finish_batch = intel_finish_vb;
+   i830->intel.vtbl.invalidate_state = i830_invalidate_state;
 }
diff --git a/src/mesa/drivers/dri/i915/i915_context.c b/src/mesa/drivers/dri/i915/i915_context.c
index 3d6af38057..3ab7d682ee 100644
--- a/src/mesa/drivers/dri/i915/i915_context.c
+++ b/src/mesa/drivers/dri/i915/i915_context.c
@@ -27,6 +27,7 @@
 
 #include "i915_context.h"
 #include "main/imports.h"
+#include "main/macros.h"
 #include "intel_tex.h"
 #include "intel_tris.h"
 #include "tnl/t_context.h"
@@ -50,16 +51,6 @@
  * Mesa's Driver Functions
  ***************************************/
 
-static const struct dri_extension i915_extensions[] = {
-   {"GL_ARB_depth_texture", NULL},
-   {"GL_ARB_fragment_program", NULL},
-   {"GL_ARB_shadow", NULL},
-   {"GL_ARB_texture_non_power_of_two", NULL},
-   {"GL_ATI_texture_env_combine3",       NULL},
-   {"GL_EXT_shadow_funcs", NULL},
-   {NULL, NULL}
-};
-
 /* Override intel default.
  */
 static void
@@ -83,8 +74,12 @@ i915InvalidateState(GLcontext * ctx, GLuint new_state)
          p->params_uptodate = 0;
    }
 
-   if (new_state & (_NEW_FOG | _NEW_HINT | _NEW_PROGRAM))
+   if (new_state & (_NEW_FOG | _NEW_HINT | _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS))
       i915_update_fog(ctx);
+   if (new_state & (_NEW_STENCIL | _NEW_BUFFERS | _NEW_POLYGON))
+      i915_update_stencil(ctx);
+   if (new_state & (_NEW_LIGHT))
+       i915_update_provoking_vertex(ctx);
 }
 
 
@@ -93,7 +88,6 @@ i915InitDriverFunctions(struct dd_function_table *functions)
 {
    intelInitDriverFunctions(functions);
    i915InitStateFunctions(functions);
-   i915InitTextureFuncs(functions);
    i915InitFragProgFuncs(functions);
    functions->UpdateState = i915InvalidateState;
 }
@@ -129,6 +123,8 @@ i915CreateContext(const __GLcontextModes * mesaVis,
       return GL_FALSE;
    }
 
+   _math_matrix_ctr(&intel->ViewportMatrix);
+
    /* Initialize swrast, tnl driver tables: */
    intelInitSpanFuncs(ctx);
    intelInitTriFuncs(ctx);
@@ -154,6 +150,8 @@ i915CreateContext(const __GLcontextModes * mesaVis,
    ctx->Const.MaxTextureRectSize = (1 << 11);
    ctx->Const.MaxTextureUnits = I915_TEX_UNITS;
 
+   ctx->Const.MaxTextureMaxAnisotropy = 4.0;
+
    /* GL_ARB_fragment_program limits - don't think Mesa actually
     * validates programs against these, and in any case one ARB
     * instruction can translate to more than one HW instruction, so
@@ -169,11 +167,13 @@ i915CreateContext(const __GLcontextModes * mesaVis,
    ctx->Const.FragmentProgram.MaxNativeTexIndirections =
       I915_MAX_TEX_INDIRECT;
    ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0; /* I don't think we have one */
+   ctx->Const.FragmentProgram.MaxEnvParams =
+      MIN2(ctx->Const.FragmentProgram.MaxNativeParameters,
+	   ctx->Const.FragmentProgram.MaxEnvParams);
 
    ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
 
-   driInitExtensions(ctx, i915_extensions, GL_FALSE);
-
+   ctx->Const.MaxDrawBuffers = 1;
 
    _tnl_init_vertices(ctx, ctx->Const.MaxArrayLockSize + 12,
                       36 * sizeof(GLfloat));
diff --git a/src/mesa/drivers/dri/i915/i915_context.h b/src/mesa/drivers/dri/i915/i915_context.h
index 87bbf5f927..8de4a9d0d3 100644
--- a/src/mesa/drivers/dri/i915/i915_context.h
+++ b/src/mesa/drivers/dri/i915/i915_context.h
@@ -48,6 +48,7 @@
 #define I915_UPLOAD_FOG              0x20
 #define I915_UPLOAD_INVARIENT        0x40
 #define I915_UPLOAD_DEFAULTS         0x80
+#define I915_UPLOAD_RASTER_RULES     0x100
 #define I915_UPLOAD_TEX(i)           (0x00010000<<(i))
 #define I915_UPLOAD_TEX_ALL          (0x00ff0000)
 #define I915_UPLOAD_TEX_0_SHIFT      16
@@ -82,7 +83,9 @@
 #define I915_CTXREG_IAB   	 	6
 #define I915_CTXREG_BLENDCOLOR0		7
 #define I915_CTXREG_BLENDCOLOR1		8
-#define I915_CTX_SETUP_SIZE		9
+#define I915_CTXREG_BF_STENCIL_OPS	9
+#define I915_CTXREG_BF_STENCIL_MASKS	10
+#define I915_CTX_SETUP_SIZE		11
 
 #define I915_FOGREG_COLOR		0
 #define I915_FOGREG_MODE0		1
@@ -110,6 +113,10 @@
 #define I915_DEFREG_Z1    5
 #define I915_DEF_SETUP_SIZE    6
 
+enum {
+   I915_RASTER_RULES,             /* index of the single raster-rules dword */
+   I915_RASTER_RULES_SETUP_SIZE,  /* element count of i915_hw_state.RasterRules[] */
+};
 
 #define I915_MAX_CONSTANT      32
 #define I915_CONSTANT_SIZE     (2+(4*I915_MAX_CONSTANT))
@@ -206,6 +213,7 @@ struct i915_hw_state
    GLuint Stipple[I915_STP_SETUP_SIZE];
    GLuint Fog[I915_FOG_SETUP_SIZE];
    GLuint Defaults[I915_DEF_SETUP_SIZE];
+   GLuint RasterRules[I915_RASTER_RULES_SETUP_SIZE];
    GLuint Tex[I915_TEX_UNITS][I915_TEX_SETUP_SIZE];
    GLuint Constant[I915_CONSTANT_SIZE];
    GLuint ConstantSize;
@@ -321,6 +329,8 @@ extern void i915_print_ureg(const char *msg, GLuint ureg);
 extern void i915InitStateFunctions(struct dd_function_table *functions);
 extern void i915InitState(struct i915_context *i915);
 extern void i915_update_fog(GLcontext * ctx);
+extern void i915_update_stencil(GLcontext * ctx);
+extern void i915_update_provoking_vertex(GLcontext *ctx);
 
 
 /*======================================================================
diff --git a/src/mesa/drivers/dri/i915/i915_fragprog.c b/src/mesa/drivers/dri/i915/i915_fragprog.c
index f091d600c3..2db10c60e9 100644
--- a/src/mesa/drivers/dri/i915/i915_fragprog.c
+++ b/src/mesa/drivers/dri/i915/i915_fragprog.c
@@ -162,12 +162,12 @@ src_vector(struct i915_fragment_program *p,
                  GET_SWZ(source->Swizzle, 1),
                  GET_SWZ(source->Swizzle, 2), GET_SWZ(source->Swizzle, 3));
 
-   if (source->NegateBase)
+   if (source->Negate)
       src = negate(src,
-                   GET_BIT(source->NegateBase, 0),
-                   GET_BIT(source->NegateBase, 1),
-                   GET_BIT(source->NegateBase, 2),
-                   GET_BIT(source->NegateBase, 3));
+                   GET_BIT(source->Negate, 0),
+                   GET_BIT(source->Negate, 1),
+                   GET_BIT(source->Negate, 2),
+                   GET_BIT(source->Negate, 3));
 
    return src;
 }
@@ -180,9 +180,9 @@ get_result_vector(struct i915_fragment_program *p,
    switch (inst->DstReg.File) {
    case PROGRAM_OUTPUT:
       switch (inst->DstReg.Index) {
-      case FRAG_RESULT_COLR:
+      case FRAG_RESULT_COLOR:
          return UREG(REG_TYPE_OC, 0);
-      case FRAG_RESULT_DEPR:
+      case FRAG_RESULT_DEPTH:
          p->depth_written = 1;
          return UREG(REG_TYPE_OD, 0);
       default:
@@ -323,7 +323,8 @@ upload_program(struct i915_fragment_program *p)
       p->ctx->FragmentProgram._Current;
    const struct prog_instruction *inst = program->Base.Instructions;
 
-/*    _mesa_debug_fp_inst(program->Base.NumInstructions, inst); */
+   if (INTEL_DEBUG & DEBUG_WM)
+      _mesa_print_program(&program->Base);
 
    /* Is this a parse-failed program?  Ensure a valid program is
     * loaded, as the flagging of an error isn't sufficient to stop
@@ -1049,9 +1050,6 @@ i915ProgramStringNotify(GLcontext * ctx,
          _mesa_append_fog_code(ctx, &p->FragProg);
          p->FragProg.FogOption = GL_NONE;
       }
-
-      if (INTEL_DEBUG & DEBUG_STATE)
-	 _mesa_print_program(prog);
    }
 
    _tnl_program_string(ctx, target, prog);
diff --git a/src/mesa/drivers/dri/i915/i915_reg.h b/src/mesa/drivers/dri/i915/i915_reg.h
index 8891e11c6f..b5fa7fddb9 100644
--- a/src/mesa/drivers/dri/i915/i915_reg.h
+++ b/src/mesa/drivers/dri/i915/i915_reg.h
@@ -86,27 +86,15 @@
 #define BFM_ENABLE_STENCIL_WRITE_MASK     (1<<16)
 #define BFM_STENCIL_TEST_MASK_SHIFT       8
 #define BFM_STENCIL_TEST_MASK_MASK        (0xff<<8)
+#define BFM_STENCIL_TEST_MASK(x)	  (((x)&0xff) << 8)
 #define BFM_STENCIL_WRITE_MASK_SHIFT      0
 #define BFM_STENCIL_WRITE_MASK_MASK       (0xff<<0)
+#define BFM_STENCIL_WRITE_MASK(x)	  ((x)&0xff)
 
 
 
 /* 3DSTATE_BIN_CONTROL p141 */
 
-/* p143 */
-#define _3DSTATE_BUF_INFO_CMD	(CMD_3D | (0x1d<<24) | (0x8e<<16) | 1)
-/* Dword 1 */
-#define BUF_3D_ID_COLOR_BACK	(0x3<<24)
-#define BUF_3D_ID_DEPTH 	(0x7<<24)
-#define BUF_3D_USE_FENCE	(1<<23)
-#define BUF_3D_TILED_SURFACE	(1<<22)
-#define BUF_3D_TILE_WALK_X	0
-#define BUF_3D_TILE_WALK_Y	(1<<21)
-#define BUF_3D_PITCH(x)         (((x)/4)<<2)
-/* Dword 2 */
-#define BUF_3D_ADDR(x)		((x) & ~0x3)
-
-
 /* 3DSTATE_CHROMA_KEY */
 
 /* 3DSTATE_CLEAR_PARAMETERS, p150 */
@@ -155,6 +143,7 @@
 /* p161 */
 #define _3DSTATE_DST_BUF_VARS_CMD	(CMD_3D | (0x1d<<24) | (0x85<<16))
 /* Dword 1 */
+#define CLASSIC_EARLY_DEPTH		(1<<31)
 #define TEX_DEFAULT_COLOR_OGL           (0<<30)
 #define TEX_DEFAULT_COLOR_D3D           (1<<30)
 #define ZR_EARLY_DEPTH                  (1<<29)
@@ -308,7 +297,9 @@
 #define TEXKILL_4D                      (1<<9)
 #define ENABLE_LINE_STRIP_PROVOKE_VRTX	(1<<8)
 #define ENABLE_TRI_FAN_PROVOKE_VRTX	(1<<5)
+#define LINE_STRIP_PROVOKE_VRTX_MASK	(3 << 6)
 #define LINE_STRIP_PROVOKE_VRTX(x)	((x)<<6)
+#define TRI_FAN_PROVOKE_VRTX_MASK	(3 << 3)
 #define TRI_FAN_PROVOKE_VRTX(x) 	((x)<<3)
 
 /* _3DSTATE_SCISSOR_ENABLE, p256 */
diff --git a/src/mesa/drivers/dri/i915/i915_state.c b/src/mesa/drivers/dri/i915/i915_state.c
index a53f120a81..b60efea75b 100644
--- a/src/mesa/drivers/dri/i915/i915_state.c
+++ b/src/mesa/drivers/dri/i915/i915_state.c
@@ -41,79 +41,126 @@
 #include "intel_fbo.h"
 #include "intel_screen.h"
 #include "intel_batchbuffer.h"
+#include "intel_buffers.h"
 
 #include "i915_context.h"
 #include "i915_reg.h"
 
 #define FILE_DEBUG_FLAG DEBUG_STATE
 
-static void
-i915StencilFuncSeparate(GLcontext * ctx, GLenum face, GLenum func, GLint ref,
-                        GLuint mask)
+void
+i915_update_stencil(GLcontext * ctx)
 {
    struct i915_context *i915 = I915_CONTEXT(ctx);
-   int test = intel_translate_compare_func(func);
-
-   mask = mask & 0xff;
+   GLuint front_ref, front_writemask, front_mask;
+   GLenum front_func, front_fail, front_pass_z_fail, front_pass_z_pass;
+   GLuint back_ref, back_writemask, back_mask;
+   GLenum back_func, back_fail, back_pass_z_fail, back_pass_z_pass;
 
-   DBG("%s : func: %s, ref : 0x%x, mask: 0x%x\n", __FUNCTION__,
-       _mesa_lookup_enum_by_nr(func), ref, mask);
+   I915_STATECHANGE(i915, I915_UPLOAD_CTX);
 
+   /* The 915 considers CW to be "front" for two-sided stencil, so choose
+    * appropriately.
+    */
+   /* _NEW_POLYGON | _NEW_STENCIL */
+   if (ctx->Polygon.FrontFace == GL_CW) {
+      front_ref = ctx->Stencil.Ref[0];
+      front_mask = ctx->Stencil.ValueMask[0];
+      front_writemask = ctx->Stencil.WriteMask[0];
+      front_func = ctx->Stencil.Function[0];
+      front_fail = ctx->Stencil.FailFunc[0];
+      front_pass_z_fail = ctx->Stencil.ZFailFunc[0];
+      front_pass_z_pass = ctx->Stencil.ZPassFunc[0];
+      back_ref = ctx->Stencil.Ref[ctx->Stencil._BackFace];
+      back_mask = ctx->Stencil.ValueMask[ctx->Stencil._BackFace];
+      back_writemask = ctx->Stencil.WriteMask[ctx->Stencil._BackFace];
+      back_func = ctx->Stencil.Function[ctx->Stencil._BackFace];
+      back_fail = ctx->Stencil.FailFunc[ctx->Stencil._BackFace];
+      back_pass_z_fail = ctx->Stencil.ZFailFunc[ctx->Stencil._BackFace];
+      back_pass_z_pass = ctx->Stencil.ZPassFunc[ctx->Stencil._BackFace];
+   } else {
+      front_ref = ctx->Stencil.Ref[ctx->Stencil._BackFace];
+      front_mask = ctx->Stencil.ValueMask[ctx->Stencil._BackFace];
+      front_writemask = ctx->Stencil.WriteMask[ctx->Stencil._BackFace];
+      front_func = ctx->Stencil.Function[ctx->Stencil._BackFace];
+      front_fail = ctx->Stencil.FailFunc[ctx->Stencil._BackFace];
+      front_pass_z_fail = ctx->Stencil.ZFailFunc[ctx->Stencil._BackFace];
+      front_pass_z_pass = ctx->Stencil.ZPassFunc[ctx->Stencil._BackFace];
+      back_ref = ctx->Stencil.Ref[0];
+      back_mask = ctx->Stencil.ValueMask[0];
+      back_writemask = ctx->Stencil.WriteMask[0];
+      back_func = ctx->Stencil.Function[0];
+      back_fail = ctx->Stencil.FailFunc[0];
+      back_pass_z_fail = ctx->Stencil.ZFailFunc[0];
+      back_pass_z_pass = ctx->Stencil.ZPassFunc[0];
+   }
 
-   I915_STATECHANGE(i915, I915_UPLOAD_CTX);
-   i915->state.Ctx[I915_CTXREG_STATE4] &= ~MODE4_ENABLE_STENCIL_TEST_MASK;
+   /* Set front state. */
+   i915->state.Ctx[I915_CTXREG_STATE4] &= ~(MODE4_ENABLE_STENCIL_TEST_MASK |
+					    MODE4_ENABLE_STENCIL_WRITE_MASK);
    i915->state.Ctx[I915_CTXREG_STATE4] |= (ENABLE_STENCIL_TEST_MASK |
-                                           STENCIL_TEST_MASK(mask));
+					   ENABLE_STENCIL_WRITE_MASK |
+					   STENCIL_TEST_MASK(front_mask) |
+					   STENCIL_WRITE_MASK(front_writemask));
 
    i915->state.Ctx[I915_CTXREG_LIS5] &= ~(S5_STENCIL_REF_MASK |
-                                          S5_STENCIL_TEST_FUNC_MASK);
+					  S5_STENCIL_TEST_FUNC_MASK |
+					  S5_STENCIL_FAIL_MASK |
+					  S5_STENCIL_PASS_Z_FAIL_MASK |
+					  S5_STENCIL_PASS_Z_PASS_MASK);
+
+   i915->state.Ctx[I915_CTXREG_LIS5] |=
+      (front_ref << S5_STENCIL_REF_SHIFT) |
+      (intel_translate_compare_func(front_func) << S5_STENCIL_TEST_FUNC_SHIFT) |
+      (intel_translate_stencil_op(front_fail) << S5_STENCIL_FAIL_SHIFT) |
+      (intel_translate_stencil_op(front_pass_z_fail) <<
+       S5_STENCIL_PASS_Z_FAIL_SHIFT) |
+      (intel_translate_stencil_op(front_pass_z_pass) <<
+       S5_STENCIL_PASS_Z_PASS_SHIFT);
+
+   /* Set back state if different from front. */
+   if (ctx->Stencil._TestTwoSide) {
+      i915->state.Ctx[I915_CTXREG_BF_STENCIL_OPS] &=
+	 ~(BFO_STENCIL_REF_MASK |
+	   BFO_STENCIL_TEST_MASK |
+	   BFO_STENCIL_FAIL_MASK |
+	   BFO_STENCIL_PASS_Z_FAIL_MASK |
+	   BFO_STENCIL_PASS_Z_PASS_MASK);
+      i915->state.Ctx[I915_CTXREG_BF_STENCIL_OPS] |= BFO_STENCIL_TWO_SIDE |
+	 (back_ref << BFO_STENCIL_REF_SHIFT) |
+	 (intel_translate_compare_func(back_func) << BFO_STENCIL_TEST_SHIFT) |
+	 (intel_translate_stencil_op(back_fail) << BFO_STENCIL_FAIL_SHIFT) |
+	 (intel_translate_stencil_op(back_pass_z_fail) <<
+	  BFO_STENCIL_PASS_Z_FAIL_SHIFT) |
+	 (intel_translate_stencil_op(back_pass_z_pass) <<
+	  BFO_STENCIL_PASS_Z_PASS_SHIFT);
+
+      i915->state.Ctx[I915_CTXREG_BF_STENCIL_MASKS] &=
+	 ~(BFM_STENCIL_TEST_MASK_MASK |
+	   BFM_STENCIL_WRITE_MASK_MASK);
+      i915->state.Ctx[I915_CTXREG_BF_STENCIL_MASKS] |=
+	 BFM_STENCIL_TEST_MASK(back_mask) |
+	 BFM_STENCIL_WRITE_MASK(back_writemask);
+   } else {
+      i915->state.Ctx[I915_CTXREG_BF_STENCIL_OPS] &= ~BFO_STENCIL_TWO_SIDE;
+   }
+}
 
-   i915->state.Ctx[I915_CTXREG_LIS5] |= ((ref << S5_STENCIL_REF_SHIFT) |
-                                         (test <<
-                                          S5_STENCIL_TEST_FUNC_SHIFT));
+static void
+i915StencilFuncSeparate(GLcontext * ctx, GLenum face, GLenum func, GLint ref,
+                        GLuint mask)
+{
 }
 
 static void
 i915StencilMaskSeparate(GLcontext * ctx, GLenum face, GLuint mask)
 {
-   struct i915_context *i915 = I915_CONTEXT(ctx);
-
-   DBG("%s : mask 0x%x\n", __FUNCTION__, mask);
-   
-   mask = mask & 0xff;
-
-   I915_STATECHANGE(i915, I915_UPLOAD_CTX);
-   i915->state.Ctx[I915_CTXREG_STATE4] &= ~MODE4_ENABLE_STENCIL_WRITE_MASK;
-   i915->state.Ctx[I915_CTXREG_STATE4] |= (ENABLE_STENCIL_WRITE_MASK |
-                                           STENCIL_WRITE_MASK(mask));
 }
 
-
 static void
 i915StencilOpSeparate(GLcontext * ctx, GLenum face, GLenum fail, GLenum zfail,
                       GLenum zpass)
 {
-   struct i915_context *i915 = I915_CONTEXT(ctx);
-   int fop = intel_translate_stencil_op(fail);
-   int dfop = intel_translate_stencil_op(zfail);
-   int dpop = intel_translate_stencil_op(zpass);
-
-
-   DBG("%s: fail : %s, zfail: %s, zpass : %s\n", __FUNCTION__,
-       _mesa_lookup_enum_by_nr(fail),
-       _mesa_lookup_enum_by_nr(zfail), _mesa_lookup_enum_by_nr(zpass));
-
-   I915_STATECHANGE(i915, I915_UPLOAD_CTX);
-
-   i915->state.Ctx[I915_CTXREG_LIS5] &= ~(S5_STENCIL_FAIL_MASK |
-                                          S5_STENCIL_PASS_Z_FAIL_MASK |
-                                          S5_STENCIL_PASS_Z_PASS_MASK);
-
-   i915->state.Ctx[I915_CTXREG_LIS5] |= ((fop << S5_STENCIL_FAIL_SHIFT) |
-                                         (dfop <<
-                                          S5_STENCIL_PASS_Z_FAIL_SHIFT) |
-                                         (dpop <<
-                                          S5_STENCIL_PASS_Z_PASS_SHIFT));
 }
 
 static void
@@ -301,6 +348,65 @@ i915DepthMask(GLcontext * ctx, GLboolean flag)
       i915->state.Ctx[I915_CTXREG_LIS6] &= ~S6_DEPTH_WRITE_ENABLE;
 }
 
+
+
+/**
+ * Update the viewport transformation matrix.  Depends on:
+ *  - viewport pos/size
+ *  - depthrange
+ *  - window pos/size or FBO size
+ */
+void
+intelCalcViewport(GLcontext * ctx)
+{
+   struct intel_context *intel = intel_context(ctx);
+   const GLfloat *v = ctx->Viewport._WindowMap.m;
+   const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+   GLfloat *m = intel->ViewportMatrix.m;
+   GLfloat yScale, yBias;
+
+   if (ctx->DrawBuffer->Name) {
+      /* User created FBO */
+      /* y=0=bottom */
+      yScale = 1.0;
+      yBias = 0.0;
+   }
+   else {
+      /* window buffer, y=0=top */
+      yScale = -1.0;
+      yBias = (intel->driDrawable) ? intel->driDrawable->h : 0.0F;
+   }
+
+   m[MAT_SX] = v[MAT_SX];
+   m[MAT_TX] = v[MAT_TX];
+
+   m[MAT_SY] = v[MAT_SY] * yScale;
+   m[MAT_TY] = v[MAT_TY] * yScale + yBias;
+
+   m[MAT_SZ] = v[MAT_SZ] * depthScale;
+   m[MAT_TZ] = v[MAT_TZ] * depthScale;
+}
+
+
+/** Called from ctx->Driver.Viewport() */
+static void
+i915Viewport(GLcontext * ctx,
+              GLint x, GLint y, GLsizei width, GLsizei height)
+{
+   intelCalcViewport(ctx);
+
+   intel_viewport(ctx, x, y, width, height);
+}
+
+
+/** Called from ctx->Driver.DepthRange() */
+static void
+i915DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
+{
+   intelCalcViewport(ctx);
+}
+
+
 /* =============================================================
  * Polygon stipple
  *
@@ -885,6 +991,17 @@ i915_init_packets(struct i915_context *i915)
          _3DSTATE_CONST_BLEND_COLOR_CMD;
       i915->state.Ctx[I915_CTXREG_BLENDCOLOR1] = 0;
 
+      i915->state.Ctx[I915_CTXREG_BF_STENCIL_MASKS] =
+	 _3DSTATE_BACKFACE_STENCIL_MASKS |
+	 BFM_ENABLE_STENCIL_TEST_MASK |
+	 BFM_ENABLE_STENCIL_WRITE_MASK |
+	 (0xff << BFM_STENCIL_WRITE_MASK_SHIFT) |
+	 (0xff << BFM_STENCIL_TEST_MASK_SHIFT);
+      i915->state.Ctx[I915_CTXREG_BF_STENCIL_OPS] =
+	 _3DSTATE_BACKFACE_STENCIL_OPS |
+	 BFO_ENABLE_STENCIL_REF |
+	 BFO_ENABLE_STENCIL_FUNCS |
+	 BFO_ENABLE_STENCIL_TWO_SIDE;
    }
 
    {
@@ -916,6 +1033,13 @@ i915_init_packets(struct i915_context *i915)
       i915->state.Buffer[I915_DESTREG_SR2] = 0;
    }
 
+   i915->state.RasterRules[I915_RASTER_RULES] = _3DSTATE_RASTER_RULES_CMD |
+      ENABLE_POINT_RASTER_RULE |
+      OGL_POINT_RASTER_RULE |
+      ENABLE_LINE_STRIP_PROVOKE_VRTX |
+      ENABLE_TRI_FAN_PROVOKE_VRTX |
+      LINE_STRIP_PROVOKE_VRTX(1) |
+      TRI_FAN_PROVOKE_VRTX(2) | ENABLE_TEXKILL_3D_4D | TEXKILL_4D;
 
 #if 0
    {
@@ -936,7 +1060,33 @@ i915_init_packets(struct i915_context *i915)
    i915->state.active = (I915_UPLOAD_PROGRAM |
                          I915_UPLOAD_STIPPLE |
                          I915_UPLOAD_CTX |
-                         I915_UPLOAD_BUFFERS | I915_UPLOAD_INVARIENT);
+                         I915_UPLOAD_BUFFERS |
+			 I915_UPLOAD_INVARIENT |
+			 I915_UPLOAD_RASTER_RULES);
+}
+
+void
+i915_update_provoking_vertex(GLcontext * ctx)
+{
+   struct i915_context *i915 = I915_CONTEXT(ctx);
+
+   I915_STATECHANGE(i915, I915_UPLOAD_CTX);
+   i915->state.Ctx[I915_CTXREG_LIS6] &= ~(S6_TRISTRIP_PV_MASK);
+
+   I915_STATECHANGE(i915, I915_UPLOAD_RASTER_RULES);
+   i915->state.RasterRules[I915_RASTER_RULES] &= ~(LINE_STRIP_PROVOKE_VRTX_MASK |
+						   TRI_FAN_PROVOKE_VRTX_MASK);
+
+   /* _NEW_LIGHT */
+   if (ctx->Light.ProvokingVertex == GL_LAST_VERTEX_CONVENTION) {
+      i915->state.RasterRules[I915_RASTER_RULES] |= (LINE_STRIP_PROVOKE_VRTX(1) |
+						     TRI_FAN_PROVOKE_VRTX(2));
+      i915->state.Ctx[I915_CTXREG_LIS6] |= (2 << S6_TRISTRIP_PV_SHIFT);
+   } else {
+      i915->state.RasterRules[I915_RASTER_RULES] |= (LINE_STRIP_PROVOKE_VRTX(0) |
+						     TRI_FAN_PROVOKE_VRTX(1));
+      i915->state.Ctx[I915_CTXREG_LIS6] |= (0 << S6_TRISTRIP_PV_SHIFT);
+    }
 }
 
 void
@@ -964,6 +1114,8 @@ i915InitStateFunctions(struct dd_function_table *functions)
    functions->StencilFuncSeparate = i915StencilFuncSeparate;
    functions->StencilMaskSeparate = i915StencilMaskSeparate;
    functions->StencilOpSeparate = i915StencilOpSeparate;
+   functions->DepthRange = i915DepthRange;
+   functions->Viewport = i915Viewport;
 }
 
 
diff --git a/src/mesa/drivers/dri/i915/i915_tex_layout.c b/src/mesa/drivers/dri/i915/i915_tex_layout.c
index d44a2f47b3..d9588e5b56 100644
--- a/src/mesa/drivers/dri/i915/i915_tex_layout.c
+++ b/src/mesa/drivers/dri/i915/i915_tex_layout.c
@@ -55,6 +55,17 @@ static GLint step_offsets[6][2] = {
    [FACE_NEG_Z] = {-1, 1},
 };
 
+
+static GLint bottom_offsets[6] = {
+   [FACE_POS_X] = 16 + 0 * 8,
+   [FACE_POS_Y] = 16 + 1 * 8,
+   [FACE_POS_Z] = 16 + 2 * 8,
+   [FACE_NEG_X] = 16 + 3 * 8,
+   [FACE_NEG_Y] = 16 + 4 * 8,
+   [FACE_NEG_Z] = 16 + 5 * 8,
+};
+
+
 /**
  * Cube texture map layout for i830M-GM915.
  *
@@ -101,7 +112,8 @@ static GLint step_offsets[6][2] = {
  */
 static void
 i915_miptree_layout_cube(struct intel_context *intel,
-			 struct intel_mipmap_tree * mt)
+			 struct intel_mipmap_tree * mt,
+			 uint32_t tiling)
 {
    const GLuint dim = mt->width0;
    GLuint face;
@@ -111,7 +123,7 @@ i915_miptree_layout_cube(struct intel_context *intel,
    assert(lvlWidth == lvlHeight); /* cubemap images are square */
 
    /* double pitch for cube layouts */
-   mt->pitch = intel_miptree_pitch_align (intel, mt, dim * 2);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, dim * 2);
    mt->total_height = dim * 4;
 
    for (level = mt->first_level; level <= mt->last_level; level++) {
@@ -145,7 +157,8 @@ i915_miptree_layout_cube(struct intel_context *intel,
 
 static void
 i915_miptree_layout_3d(struct intel_context *intel,
-		       struct intel_mipmap_tree * mt)
+		       struct intel_mipmap_tree * mt,
+		       uint32_t tiling)
 {
    GLuint width = mt->width0;
    GLuint height = mt->height0;
@@ -154,7 +167,7 @@ i915_miptree_layout_3d(struct intel_context *intel,
    GLint level;
 
    /* Calculate the size of a single slice. */
-   mt->pitch = intel_miptree_pitch_align (intel, mt, mt->width0);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
 
    /* XXX: hardware expects/requires 9 levels at minimum. */
    for (level = mt->first_level; level <= MAX2(8, mt->last_level); level++) {
@@ -189,14 +202,15 @@ i915_miptree_layout_3d(struct intel_context *intel,
 
 static void
 i915_miptree_layout_2d(struct intel_context *intel,
-		       struct intel_mipmap_tree * mt)
+		       struct intel_mipmap_tree * mt,
+		       uint32_t tiling)
 {
    GLuint width = mt->width0;
    GLuint height = mt->height0;
    GLuint img_height;
    GLint level;
 
-   mt->pitch = intel_miptree_pitch_align (intel, mt, mt->width0);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
    mt->total_height = 0;
 
    for (level = mt->first_level; level <= mt->last_level; level++) {
@@ -217,19 +231,20 @@ i915_miptree_layout_2d(struct intel_context *intel,
 }
 
 GLboolean
-i915_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
+i915_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt,
+		    uint32_t tiling)
 {
    switch (mt->target) {
    case GL_TEXTURE_CUBE_MAP:
-      i915_miptree_layout_cube(intel, mt);
+      i915_miptree_layout_cube(intel, mt, tiling);
       break;
    case GL_TEXTURE_3D:
-      i915_miptree_layout_3d(intel, mt);
+      i915_miptree_layout_3d(intel, mt, tiling);
       break;
    case GL_TEXTURE_1D:
    case GL_TEXTURE_2D:
    case GL_TEXTURE_RECTANGLE_ARB:
-      i915_miptree_layout_2d(intel, mt);
+      i915_miptree_layout_2d(intel, mt, tiling);
       break;
    default:
       _mesa_problem(NULL, "Unexpected tex target in i915_miptree_layout()");
@@ -297,7 +312,7 @@ i915_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
  * +---+   +---+   +---+   +---+   +---+   +---+
  *
  * The bottom row continues with the remaining 2x2 then the 1x1 mip contents
- * in order, with each of them aligned to a 4x4 block boundary.  Thus, for
+ * in order, with each of them aligned to an 8x8 block boundary.  Thus, for
  * 32x32 cube maps and smaller, the bottom row layout is going to dictate the
  * pitch of the tree.  For a tree with 4x4 images, the pitch is at least
  * 14 * 8 = 112 texels, for 2x2 it is at least 12 * 8 texels, and for 1x1
@@ -306,7 +321,8 @@ i915_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
 
 static void
 i945_miptree_layout_cube(struct intel_context *intel,
-			 struct intel_mipmap_tree * mt)
+			 struct intel_mipmap_tree * mt,
+			 uint32_t tiling)
 {
    const GLuint dim = mt->width0;
    GLuint face;
@@ -320,9 +336,9 @@ i945_miptree_layout_cube(struct intel_context *intel,
     * or the final row of 4x4, 2x2 and 1x1 faces below this.
     */
    if (dim > 32)
-      mt->pitch = intel_miptree_pitch_align (intel, mt, dim * 2);
+      mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, dim * 2);
    else
-      mt->pitch = intel_miptree_pitch_align (intel, mt, 14 * 8);
+      mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, 14 * 8);
 
    if (dim >= 4)
       mt->total_height = dim * 4 + 4;
@@ -375,10 +391,11 @@ i945_miptree_layout_cube(struct intel_context *intel,
 	       x = (face - 4) * 8;
 	       break;
 	    }
+	    break;
 
 	 case 2:
 	    y = mt->total_height - 4;
-	    x = 16 + face * 8;
+	    x = bottom_offsets[face];
 	    break;
 
 	 case 1:
@@ -396,7 +413,8 @@ i945_miptree_layout_cube(struct intel_context *intel,
 
 static void
 i945_miptree_layout_3d(struct intel_context *intel,
-		       struct intel_mipmap_tree * mt)
+		       struct intel_mipmap_tree * mt,
+		       uint32_t tiling)
 {
    GLuint width = mt->width0;
    GLuint height = mt->height0;
@@ -405,7 +423,7 @@ i945_miptree_layout_3d(struct intel_context *intel,
    GLuint pack_y_pitch;
    GLuint level;
 
-   mt->pitch = intel_miptree_pitch_align (intel, mt, mt->width0);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
    mt->total_height = 0;
 
    pack_y_pitch = MAX2(mt->height0, 2);
@@ -450,19 +468,23 @@ i945_miptree_layout_3d(struct intel_context *intel,
 }
 
 GLboolean
-i945_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
+i945_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt,
+		    uint32_t tiling)
 {
    switch (mt->target) {
    case GL_TEXTURE_CUBE_MAP:
-      i945_miptree_layout_cube(intel, mt);
+      if (mt->compressed)
+	 i945_miptree_layout_cube(intel, mt, tiling);
+      else
+	 i915_miptree_layout_cube(intel, mt, tiling);
       break;
    case GL_TEXTURE_3D:
-      i945_miptree_layout_3d(intel, mt);
+      i945_miptree_layout_3d(intel, mt, tiling);
       break;
    case GL_TEXTURE_1D:
    case GL_TEXTURE_2D:
    case GL_TEXTURE_RECTANGLE_ARB:
-      i945_miptree_layout_2d(intel, mt);
+      i945_miptree_layout_2d(intel, mt, tiling);
       break;
    default:
       _mesa_problem(NULL, "Unexpected tex target in i945_miptree_layout()");
diff --git a/src/mesa/drivers/dri/i915/i915_texstate.c b/src/mesa/drivers/dri/i915/i915_texstate.c
index adbb52a3a3..32d4b30cf9 100644
--- a/src/mesa/drivers/dri/i915/i915_texstate.c
+++ b/src/mesa/drivers/dri/i915/i915_texstate.c
@@ -37,7 +37,8 @@
 
 
 static GLuint
-translate_texture_format(GLuint mesa_format, GLenum DepthMode)
+translate_texture_format(GLuint mesa_format, GLuint internal_format,
+			 GLenum DepthMode)
 {
    switch (mesa_format) {
    case MESA_FORMAT_L8:
@@ -55,7 +56,10 @@ translate_texture_format(GLuint mesa_format, GLenum DepthMode)
    case MESA_FORMAT_ARGB4444:
       return MAPSURF_16BIT | MT_16BIT_ARGB4444;
    case MESA_FORMAT_ARGB8888:
-      return MAPSURF_32BIT | MT_32BIT_ARGB8888;
+      if (internal_format == GL_RGB)
+	 return MAPSURF_32BIT | MT_32BIT_XRGB8888;
+      else
+	 return MAPSURF_32BIT | MT_32BIT_ARGB8888;
    case MESA_FORMAT_YCBCR_REV:
       return (MAPSURF_422 | MT_422_YCRCB_NORMAL);
    case MESA_FORMAT_YCBCR:
@@ -128,7 +132,8 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    struct intel_texture_object *intelObj = intel_texture_object(tObj);
    struct gl_texture_image *firstImage;
    GLuint *state = i915->state.Tex[unit], format, pitch;
-   GLint lodbias;
+   GLint lodbias, aniso = 0;
+   GLubyte border[4];
 
    memset(state, 0, sizeof(state));
 
@@ -173,14 +178,20 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
 								 firstLevel);
 
       format = translate_texture_format(firstImage->TexFormat->MesaFormat, 
-		tObj->DepthMode);
+					firstImage->InternalFormat,
+					tObj->DepthMode);
       pitch = intelObj->mt->pitch * intelObj->mt->cpp;
    }
 
    state[I915_TEXREG_MS3] =
       (((firstImage->Height - 1) << MS3_HEIGHT_SHIFT) |
-       ((firstImage->Width - 1) << MS3_WIDTH_SHIFT) | format |
-       MS3_USE_FENCE_REGS);
+       ((firstImage->Width - 1) << MS3_WIDTH_SHIFT) | format);
+
+   if (intelObj->mt->region->tiling != I915_TILING_NONE) {
+      state[I915_TEXREG_MS3] |= MS3_TILED_SURFACE;
+      if (intelObj->mt->region->tiling == I915_TILING_Y)
+	 state[I915_TEXREG_MS3] |= MS3_TILE_WALK;
+   }
 
    state[I915_TEXREG_MS4] =
      ((((pitch / 4) - 1) << MS4_PITCH_SHIFT) | MS4_CUBE_FACE_ENA_MASK |
@@ -224,6 +235,10 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
       if (tObj->MaxAnisotropy > 1.0) {
          minFilt = FILTER_ANISOTROPIC;
          magFilt = FILTER_ANISOTROPIC;
+         if (tObj->MaxAnisotropy > 2.0)
+            aniso = SS2_MAX_ANISO_4;
+         else
+            aniso = SS2_MAX_ANISO_2;
       }
       else {
          switch (tObj->MagFilter) {
@@ -269,7 +284,8 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
 
       state[I915_TEXREG_SS2] |= ((minFilt << SS2_MIN_FILTER_SHIFT) |
                                  (mipFilt << SS2_MIP_FILTER_SHIFT) |
-                                 (magFilt << SS2_MAG_FILTER_SHIFT));
+                                 (magFilt << SS2_MAG_FILTER_SHIFT) |
+                                 aniso);
    }
 
    {
@@ -313,21 +329,26 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
       state[I915_TEXREG_SS3] |= (unit << SS3_TEXTUREMAP_INDEX_SHIFT);
    }
 
+   /* convert border color from float to ubyte */
+   CLAMPED_FLOAT_TO_UBYTE(border[0], tObj->BorderColor[0]);
+   CLAMPED_FLOAT_TO_UBYTE(border[1], tObj->BorderColor[1]);
+   CLAMPED_FLOAT_TO_UBYTE(border[2], tObj->BorderColor[2]);
+   CLAMPED_FLOAT_TO_UBYTE(border[3], tObj->BorderColor[3]);
 
    if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
       /* GL specs that border color for depth textures is taken from the
        * R channel, while the hardware uses A.  Spam R into all the channels
        * for safety.
        */
-      state[I915_TEXREG_SS4] = INTEL_PACKCOLOR8888(tObj->_BorderChan[0],
-						   tObj->_BorderChan[0],
-						   tObj->_BorderChan[0],
-						   tObj->_BorderChan[0]);
+      state[I915_TEXREG_SS4] = INTEL_PACKCOLOR8888(border[0],
+						   border[0],
+						   border[0],
+						   border[0]);
    } else {
-      state[I915_TEXREG_SS4] = INTEL_PACKCOLOR8888(tObj->_BorderChan[0],
-						   tObj->_BorderChan[1],
-						   tObj->_BorderChan[2],
-						   tObj->_BorderChan[3]);
+      state[I915_TEXREG_SS4] = INTEL_PACKCOLOR8888(border[0],
+						   border[1],
+						   border[2],
+						   border[3]);
    }
 
 
diff --git a/src/mesa/drivers/dri/i915/i915_vtbl.c b/src/mesa/drivers/dri/i915/i915_vtbl.c
index 3f6d282d34..9a723d3cd7 100644
--- a/src/mesa/drivers/dri/i915/i915_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i915_vtbl.c
@@ -32,6 +32,7 @@
 #include "main/imports.h"
 #include "main/macros.h"
 #include "main/colormac.h"
+#include "main/texformat.h"
 
 #include "tnl/t_context.h"
 #include "tnl/t_vertex.h"
@@ -40,6 +41,8 @@
 #include "intel_tex.h"
 #include "intel_regions.h"
 #include "intel_tris.h"
+#include "intel_fbo.h"
+#include "intel_chipset.h"
 
 #include "i915_reg.h"
 #include "i915_context.h"
@@ -173,7 +176,7 @@ i915_emit_invarient_state(struct intel_context *intel)
 {
    BATCH_LOCALS;
 
-   BEGIN_BATCH(20, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(17, IGNORE_CLIPRECTS);
 
    OUT_BATCH(_3DSTATE_AA_CMD |
              AA_LINE_ECAAR_WIDTH_ENABLE |
@@ -197,14 +200,6 @@ i915_emit_invarient_state(struct intel_context *intel)
              CSB_TCB(3, 3) |
              CSB_TCB(4, 4) | CSB_TCB(5, 5) | CSB_TCB(6, 6) | CSB_TCB(7, 7));
 
-   OUT_BATCH(_3DSTATE_RASTER_RULES_CMD |
-             ENABLE_POINT_RASTER_RULE |
-             OGL_POINT_RASTER_RULE |
-             ENABLE_LINE_STRIP_PROVOKE_VRTX |
-             ENABLE_TRI_FAN_PROVOKE_VRTX |
-             LINE_STRIP_PROVOKE_VRTX(1) |
-             TRI_FAN_PROVOKE_VRTX(2) | ENABLE_TEXKILL_3D_4D | TEXKILL_4D);
-
    /* Need to initialize this to zero.
     */
    OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(3) | (0));
@@ -222,11 +217,6 @@ i915_emit_invarient_state(struct intel_context *intel)
    OUT_BATCH(_3DSTATE_LOAD_INDIRECT | 0);       /* disable indirect state */
    OUT_BATCH(0);
 
-
-   /* Don't support twosided stencil yet */
-   OUT_BATCH(_3DSTATE_BACKFACE_STENCIL_OPS | BFO_ENABLE_STENCIL_TWO_SIDE | 0);
-   OUT_BATCH(0);
-
    ADVANCE_BATCH();
 }
 
@@ -260,6 +250,9 @@ get_state_size(struct i915_hw_state *state)
    if (dirty & I915_UPLOAD_INVARIENT)
       sz += 30 * 4;
 
+   if (dirty & I915_UPLOAD_RASTER_RULES)
+      sz += sizeof(state->RasterRules);
+
    if (dirty & I915_UPLOAD_CTX)
       sz += sizeof(state->Ctx);
 
@@ -368,6 +361,12 @@ i915_emit_state(struct intel_context *intel)
       i915_emit_invarient_state(intel);
    }
 
+   if (dirty & I915_UPLOAD_RASTER_RULES) {
+      if (INTEL_DEBUG & DEBUG_STATE)
+         fprintf(stderr, "I915_UPLOAD_RASTER_RULES:\n");
+      emit(intel, state->RasterRules, sizeof(state->RasterRules));
+   }
+
    if (dirty & I915_UPLOAD_CTX) {
       if (INTEL_DEBUG & DEBUG_STATE)
          fprintf(stderr, "I915_UPLOAD_CTX:\n");
@@ -527,6 +526,23 @@ i915_destroy_context(struct intel_context *intel)
    _tnl_free_vertices(&intel->ctx);
 }
 
+void
+i915_set_buf_info_for_region(uint32_t *state, struct intel_region *region,
+			     uint32_t buffer_id)
+{
+   state[0] = _3DSTATE_BUF_INFO_CMD;
+   state[1] = buffer_id;
+
+   if (region != NULL) {
+      state[1] |= BUF_3D_PITCH(region->pitch * region->cpp);
+
+      if (region->tiling != I915_TILING_NONE) {
+	 state[1] |= BUF_3D_TILED_SURFACE;
+	 if (region->tiling == I915_TILING_Y)
+	    state[1] |= BUF_3D_TILE_WALK_Y;
+      }
+   }
+}
 
 /**
  * Set the drawing regions for the color and depth/stencil buffers.
@@ -542,6 +558,8 @@ i915_state_draw_region(struct intel_context *intel,
 {
    struct i915_context *i915 = i915_context(&intel->ctx);
    GLcontext *ctx = &intel->ctx;
+   struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
+   struct intel_renderbuffer *irb = intel_renderbuffer(rb);
    GLuint value;
 
    ASSERT(state == &i915->state || state == &i915->meta);
@@ -558,21 +576,11 @@ i915_state_draw_region(struct intel_context *intel,
    /*
     * Set stride/cpp values
     */
-   if (color_region) {
-      state->Buffer[I915_DESTREG_CBUFADDR0] = _3DSTATE_BUF_INFO_CMD;
-      state->Buffer[I915_DESTREG_CBUFADDR1] =
-         (BUF_3D_ID_COLOR_BACK |
-          BUF_3D_PITCH(color_region->pitch * color_region->cpp) |
-          BUF_3D_USE_FENCE);
-   }
+   i915_set_buf_info_for_region(&state->Buffer[I915_DESTREG_CBUFADDR0],
+				color_region, BUF_3D_ID_COLOR_BACK);
 
-   if (depth_region) {
-      state->Buffer[I915_DESTREG_DBUFADDR0] = _3DSTATE_BUF_INFO_CMD;
-      state->Buffer[I915_DESTREG_DBUFADDR1] =
-         (BUF_3D_ID_DEPTH |
-          BUF_3D_PITCH(depth_region->pitch * depth_region->cpp) |
-          BUF_3D_USE_FENCE);
-   }
+   i915_set_buf_info_for_region(&state->Buffer[I915_DESTREG_DBUFADDR0],
+				depth_region, BUF_3D_ID_DEPTH);
 
    /*
     * Compute/set I915_DESTREG_DV1 value
@@ -580,12 +588,34 @@ i915_state_draw_region(struct intel_context *intel,
    value = (DSTORG_HORT_BIAS(0x8) |     /* .5 */
             DSTORG_VERT_BIAS(0x8) |     /* .5 */
             LOD_PRECLAMP_OGL | TEX_DEFAULT_COLOR_OGL);
-   if (color_region && color_region->cpp == 4) {
-      value |= DV_PF_8888;
-   }
-   else {
-      value |= (DITHER_FULL_ALWAYS | DV_PF_565);
+   if (irb != NULL) {
+      switch (irb->texformat->MesaFormat) {
+      case MESA_FORMAT_ARGB8888:
+	 value |= DV_PF_8888;
+	 break;
+      case MESA_FORMAT_RGB565:
+	 value |= DV_PF_565 | DITHER_FULL_ALWAYS;
+	 break;
+      case MESA_FORMAT_ARGB1555:
+	 value |= DV_PF_1555 | DITHER_FULL_ALWAYS;
+	 break;
+      case MESA_FORMAT_ARGB4444:
+	 value |= DV_PF_4444 | DITHER_FULL_ALWAYS;
+	 break;
+      default:
+	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n",
+		       irb->texformat->MesaFormat);
+      }
    }
+
+   /* Not quite safe, hence hidden behind an option: changing this bit requires
+    * an MI_FLUSH of the pipeline, and it can only be set when a depth buffer is
+    * already defined.  NOTE(review): depth_region is dereferenced unchecked.
+    */
+   if (IS_945(intel->intelScreen->deviceID) && intel->use_early_z &&
+       depth_region->tiling != I915_TILING_NONE)
+      value |= CLASSIC_EARLY_DEPTH;
+
    if (depth_region && depth_region->cpp == 4) {
       value |= DEPTH_FRMT_24_FIXED_8_OTHER;
    }
@@ -658,13 +688,6 @@ i915_assert_not_dirty( struct intel_context *intel )
    assert(!dirty);
 }
 
-static void
-i915_note_unlock( struct intel_context *intel )
-{
-    /* nothing */
-}
-
-
 void
 i915InitVtbl(struct i915_context *i915)
 {
@@ -679,6 +702,5 @@ i915InitVtbl(struct i915_context *i915)
    i915->intel.vtbl.update_texture_state = i915UpdateTextureState;
    i915->intel.vtbl.flush_cmd = i915_flush_cmd;
    i915->intel.vtbl.assert_not_dirty = i915_assert_not_dirty;
-   i915->intel.vtbl.note_unlock = i915_note_unlock; 
    i915->intel.vtbl.finish_batch = intel_finish_vb;
 }
diff --git a/src/mesa/drivers/dri/i915/intel_clear.c b/src/mesa/drivers/dri/i915/intel_clear.c
new file mode 120000
index 0000000000..9a2a742a0d
--- /dev/null
+++ b/src/mesa/drivers/dri/i915/intel_clear.c
@@ -0,0 +1 @@
+../intel/intel_clear.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i915/intel_depthstencil.c b/src/mesa/drivers/dri/i915/intel_depthstencil.c
deleted file mode 120000
index 4ac4ae690a..0000000000
--- a/src/mesa/drivers/dri/i915/intel_depthstencil.c
+++ /dev/null
@@ -1 +0,0 @@
-../intel/intel_depthstencil.c
-\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i915/intel_eglimage.c b/src/mesa/drivers/dri/i915/intel_eglimage.c
new file mode 120000
index 0000000000..6047b254a0
--- /dev/null
+++ b/src/mesa/drivers/dri/i915/intel_eglimage.c
@@ -0,0 +1 @@
+../intel/intel_eglimage.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i915/intel_eglimage.h b/src/mesa/drivers/dri/i915/intel_eglimage.h
new file mode 120000
index 0000000000..a2cde85a28
--- /dev/null
+++ b/src/mesa/drivers/dri/i915/intel_eglimage.h
@@ -0,0 +1 @@
+../intel/intel_eglimage.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i915/intel_extensions.c b/src/mesa/drivers/dri/i915/intel_extensions.c
new file mode 120000
index 0000000000..a2f3e8cd20
--- /dev/null
+++ b/src/mesa/drivers/dri/i915/intel_extensions.c
@@ -0,0 +1 @@
+../intel/intel_extensions.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i915/intel_generatemipmap.c b/src/mesa/drivers/dri/i915/intel_generatemipmap.c
new file mode 120000
index 0000000000..4c6b37ada0
--- /dev/null
+++ b/src/mesa/drivers/dri/i915/intel_generatemipmap.c
@@ -0,0 +1 @@
+../intel/intel_generatemipmap.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i915/intel_pixel_read.c b/src/mesa/drivers/dri/i915/intel_pixel_read.c
index 56087aacd4..cc4589f4d4 100644..120000
--- a/src/mesa/drivers/dri/i915/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/i915/intel_pixel_read.c
@@ -1,306 +1 @@
-/**************************************************************************
- * 
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "main/glheader.h"
-#include "main/enums.h"
-#include "main/mtypes.h"
-#include "main/macros.h"
-#include "main/image.h"
-#include "main/bufferobj.h"
-#include "swrast/swrast.h"
-
-#include "intel_screen.h"
-#include "intel_context.h"
-#include "intel_batchbuffer.h"
-#include "intel_blit.h"
-#include "intel_buffers.h"
-#include "intel_regions.h"
-#include "intel_pixel.h"
-#include "intel_buffer_objects.h"
-
-/* For many applications, the new ability to pull the source buffers
- * back out of the GTT and then do the packing/conversion operations
- * in software will be as much of an improvement as trying to get the
- * blitter and/or texture engine to do the work. 
- *
- * This step is gated on private backbuffers.
- * 
- * Obviously the frontbuffer can't be pulled back, so that is either
- * an argument for blit/texture readpixels, or for blitting to a
- * temporary and then pulling that back.
- *
- * When the destination is a pbo, however, it's not clear if it is
- * ever going to be pulled to main memory (though the access param
- * will be a good hint).  So it sounds like we do want to be able to
- * choose between blit/texture implementation on the gpu and pullback
- * and cpu-based copying.
- *
- * Unless you can magically turn client memory into a PBO for the
- * duration of this call, there will be a cpu-based copying step in
- * any case.
- */
-
-
-static GLboolean
-do_texture_readpixels(GLcontext * ctx,
-                      GLint x, GLint y, GLsizei width, GLsizei height,
-                      GLenum format, GLenum type,
-                      const struct gl_pixelstore_attrib *pack,
-                      struct intel_region *dest_region)
-{
-#if 0
-   struct intel_context *intel = intel_context(ctx);
-   intelScreenPrivate *screen = intel->intelScreen;
-   GLint pitch = pack->RowLength ? pack->RowLength : width;
-   __DRIdrawablePrivate *dPriv = intel->driDrawable;
-   int textureFormat;
-   GLenum glTextureFormat;
-   int destFormat, depthFormat, destPitch;
-   drm_clip_rect_t tmp;
-
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-
-   if (ctx->_ImageTransferState ||
-       pack->SwapBytes || pack->LsbFirst || !pack->Invert) {
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         fprintf(stderr, "%s: check_color failed\n", __FUNCTION__);
-      return GL_FALSE;
-   }
-
-   intel->vtbl.meta_texrect_source(intel, intel_readbuf_region(intel));
-
-   if (!intel->vtbl.meta_render_dest(intel, dest_region, type, format)) {
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         fprintf(stderr, "%s: couldn't set dest %s/%s\n",
-                 __FUNCTION__,
-                 _mesa_lookup_enum_by_nr(type),
-                 _mesa_lookup_enum_by_nr(format));
-      return GL_FALSE;
-   }
-
-   LOCK_HARDWARE(intel);
-
-   if (intel->driDrawable->numClipRects) {
-      intel->vtbl.install_meta_state(intel);
-      intel->vtbl.meta_no_depth_write(intel);
-      intel->vtbl.meta_no_stencil_write(intel);
-
-      if (!driClipRectToFramebuffer(ctx->ReadBuffer, &x, &y, &width, &height)) {
-         UNLOCK_HARDWARE(intel);
-         SET_STATE(i830, state);
-         if (INTEL_DEBUG & DEBUG_PIXEL)
-            fprintf(stderr, "%s: cliprect failed\n", __FUNCTION__);
-         return GL_TRUE;
-      }
-
-      y = dPriv->h - y - height;
-      x += dPriv->x;
-      y += dPriv->y;
-
-
-      /* Set the frontbuffer up as a large rectangular texture.
-       */
-      intel->vtbl.meta_tex_rect_source(intel, src_region, textureFormat);
-
-
-      intel->vtbl.meta_texture_blend_replace(i830, glTextureFormat);
-
-
-      /* Set the 3d engine to draw into the destination region:
-       */
-
-      intel->vtbl.meta_draw_region(intel, dest_region);
-      intel->vtbl.meta_draw_format(intel, destFormat, depthFormat);     /* ?? */
-
-
-      /* Draw a single quad, no cliprects:
-       */
-      intel->vtbl.meta_disable_cliprects(intel);
-
-      intel->vtbl.draw_quad(intel,
-                            0, width, 0, height,
-                            0x00ff00ff, x, x + width, y, y + height);
-
-      intel->vtbl.leave_meta_state(intel);
-   }
-   UNLOCK_HARDWARE(intel);
-
-   intel_region_wait_fence(ctx, dest_region);   /* required by GL */
-   return GL_TRUE;
-#endif
-
-   return GL_FALSE;
-}
-
-
-
-
-static GLboolean
-do_blit_readpixels(GLcontext * ctx,
-                   GLint x, GLint y, GLsizei width, GLsizei height,
-                   GLenum format, GLenum type,
-                   const struct gl_pixelstore_attrib *pack, GLvoid * pixels)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct intel_region *src = intel_readbuf_region(intel);
-   struct intel_buffer_object *dst = intel_buffer_object(pack->BufferObj);
-   GLuint dst_offset;
-   GLuint rowLength;
-
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      _mesa_printf("%s\n", __FUNCTION__);
-
-   if (!src)
-      return GL_FALSE;
-
-   if (dst) {
-      /* XXX This validation should be done by core mesa:
-       */
-      if (!_mesa_validate_pbo_access(2, pack, width, height, 1,
-                                     format, type, pixels)) {
-         _mesa_error(ctx, GL_INVALID_OPERATION, "glDrawPixels");
-         return GL_TRUE;
-      }
-   }
-   else {
-      /* PBO only for now:
-       */
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         _mesa_printf("%s - not PBO\n", __FUNCTION__);
-      return GL_FALSE;
-   }
-
-
-   if (ctx->_ImageTransferState ||
-       !intel_check_blit_format(src, format, type)) {
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         _mesa_printf("%s - bad format for blit\n", __FUNCTION__);
-      return GL_FALSE;
-   }
-
-   if (pack->Alignment != 1 || pack->SwapBytes || pack->LsbFirst) {
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         _mesa_printf("%s: bad packing params\n", __FUNCTION__);
-      return GL_FALSE;
-   }
-
-   if (pack->RowLength > 0)
-      rowLength = pack->RowLength;
-   else
-      rowLength = width;
-
-   if (pack->Invert) {
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         _mesa_printf("%s: MESA_PACK_INVERT not done yet\n", __FUNCTION__);
-      return GL_FALSE;
-   }
-   else {
-      rowLength = -rowLength;
-   }
-
-   /* XXX 64-bit cast? */
-   dst_offset = (GLuint) _mesa_image_address(2, pack, pixels, width, height,
-                                             format, type, 0, 0, 0);
-
-
-   /* Although the blits go on the command buffer, need to do this and
-    * fire with lock held to guarentee cliprects are correct.
-    */
-   intelFlush(&intel->ctx);
-   LOCK_HARDWARE(intel);
-
-   if (intel->driDrawable->numClipRects) {
-      GLboolean all = (width * height * src->cpp == dst->Base.Size &&
-                       x == 0 && dst_offset == 0);
-
-      dri_bo *dst_buffer = intel_bufferobj_buffer(intel, dst,
-						  all ? INTEL_WRITE_FULL :
-						  INTEL_WRITE_PART);
-      __DRIdrawablePrivate *dPriv = intel->driDrawable;
-      int nbox = dPriv->numClipRects;
-      drm_clip_rect_t *box = dPriv->pClipRects;
-      drm_clip_rect_t rect;
-      drm_clip_rect_t src_rect;
-      int i;
-
-      src_rect.x1 = dPriv->x + x;
-      src_rect.y1 = dPriv->y + dPriv->h - (y + height);
-      src_rect.x2 = src_rect.x1 + width;
-      src_rect.y2 = src_rect.y1 + height;
-
-
-
-      for (i = 0; i < nbox; i++) {
-         if (!intel_intersect_cliprects(&rect, &src_rect, &box[i]))
-            continue;
-
-         intelEmitCopyBlit(intel,
-                           src->cpp,
-                           src->pitch, src->buffer, 0, src->tiling,
-                           rowLength, dst_buffer, dst_offset, GL_FALSE,
-                           rect.x1,
-                           rect.y1,
-                           rect.x1 - src_rect.x1,
-                           rect.y2 - src_rect.y2,
-                           rect.x2 - rect.x1, rect.y2 - rect.y1,
-			   GL_COPY);
-      }
-   }
-   UNLOCK_HARDWARE(intel);
-
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      _mesa_printf("%s - DONE\n", __FUNCTION__);
-
-   return GL_TRUE;
-}
-
-void
-intelReadPixels(GLcontext * ctx,
-                GLint x, GLint y, GLsizei width, GLsizei height,
-                GLenum format, GLenum type,
-                const struct gl_pixelstore_attrib *pack, GLvoid * pixels)
-{
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   intelFlush(ctx);
-
-   if (do_blit_readpixels
-       (ctx, x, y, width, height, format, type, pack, pixels))
-      return;
-
-   if (do_texture_readpixels
-       (ctx, x, y, width, height, format, type, pack, pixels))
-      return;
-
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      _mesa_printf("%s: fallback to swrast\n", __FUNCTION__);
-
-   _swrast_ReadPixels(ctx, x, y, width, height, format, type, pack, pixels);
-}
+../intel/intel_pixel_read.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i915/intel_state.c b/src/mesa/drivers/dri/i915/intel_state.c
index 4aa43e5f3a..519672fc35 100644..120000
--- a/src/mesa/drivers/dri/i915/intel_state.c
+++ b/src/mesa/drivers/dri/i915/intel_state.c
@@ -1,297 +1 @@
-/**************************************************************************
- * 
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#include "main/glheader.h"
-#include "main/context.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "main/colormac.h"
-#include "main/dd.h"
-
-#include "intel_screen.h"
-#include "intel_context.h"
-#include "intel_fbo.h"
-#include "intel_regions.h"
-#include "swrast/swrast.h"
-
-int 
-intel_translate_shadow_compare_func( GLenum func )
-{
-   switch(func) {
-   case GL_NEVER: 
-       return COMPAREFUNC_ALWAYS; 
-   case GL_LESS: 
-       return COMPAREFUNC_LEQUAL; 
-   case GL_LEQUAL: 
-       return COMPAREFUNC_LESS;
-   case GL_GREATER: 
-       return COMPAREFUNC_GEQUAL; 
-   case GL_GEQUAL: 
-      return COMPAREFUNC_GREATER; 
-   case GL_NOTEQUAL: 
-      return COMPAREFUNC_EQUAL; 
-   case GL_EQUAL: 
-      return COMPAREFUNC_NOTEQUAL; 
-   case GL_ALWAYS: 
-       return COMPAREFUNC_NEVER; 
-   }
-
-   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, func);
-   return COMPAREFUNC_NEVER; 
-}
-
-int
-intel_translate_compare_func(GLenum func)
-{
-   switch (func) {
-   case GL_NEVER:
-      return COMPAREFUNC_NEVER;
-   case GL_LESS:
-      return COMPAREFUNC_LESS;
-   case GL_LEQUAL:
-      return COMPAREFUNC_LEQUAL;
-   case GL_GREATER:
-      return COMPAREFUNC_GREATER;
-   case GL_GEQUAL:
-      return COMPAREFUNC_GEQUAL;
-   case GL_NOTEQUAL:
-      return COMPAREFUNC_NOTEQUAL;
-   case GL_EQUAL:
-      return COMPAREFUNC_EQUAL;
-   case GL_ALWAYS:
-      return COMPAREFUNC_ALWAYS;
-   }
-
-   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, func);
-   return COMPAREFUNC_ALWAYS;
-}
-
-int
-intel_translate_stencil_op(GLenum op)
-{
-   switch (op) {
-   case GL_KEEP:
-      return STENCILOP_KEEP;
-   case GL_ZERO:
-      return STENCILOP_ZERO;
-   case GL_REPLACE:
-      return STENCILOP_REPLACE;
-   case GL_INCR:
-      return STENCILOP_INCRSAT;
-   case GL_DECR:
-      return STENCILOP_DECRSAT;
-   case GL_INCR_WRAP:
-      return STENCILOP_INCR;
-   case GL_DECR_WRAP:
-      return STENCILOP_DECR;
-   case GL_INVERT:
-      return STENCILOP_INVERT;
-   default:
-      return STENCILOP_ZERO;
-   }
-}
-
-int
-intel_translate_blend_factor(GLenum factor)
-{
-   switch (factor) {
-   case GL_ZERO:
-      return BLENDFACT_ZERO;
-   case GL_SRC_ALPHA:
-      return BLENDFACT_SRC_ALPHA;
-   case GL_ONE:
-      return BLENDFACT_ONE;
-   case GL_SRC_COLOR:
-      return BLENDFACT_SRC_COLR;
-   case GL_ONE_MINUS_SRC_COLOR:
-      return BLENDFACT_INV_SRC_COLR;
-   case GL_DST_COLOR:
-      return BLENDFACT_DST_COLR;
-   case GL_ONE_MINUS_DST_COLOR:
-      return BLENDFACT_INV_DST_COLR;
-   case GL_ONE_MINUS_SRC_ALPHA:
-      return BLENDFACT_INV_SRC_ALPHA;
-   case GL_DST_ALPHA:
-      return BLENDFACT_DST_ALPHA;
-   case GL_ONE_MINUS_DST_ALPHA:
-      return BLENDFACT_INV_DST_ALPHA;
-   case GL_SRC_ALPHA_SATURATE:
-      return BLENDFACT_SRC_ALPHA_SATURATE;
-   case GL_CONSTANT_COLOR:
-      return BLENDFACT_CONST_COLOR;
-   case GL_ONE_MINUS_CONSTANT_COLOR:
-      return BLENDFACT_INV_CONST_COLOR;
-   case GL_CONSTANT_ALPHA:
-      return BLENDFACT_CONST_ALPHA;
-   case GL_ONE_MINUS_CONSTANT_ALPHA:
-      return BLENDFACT_INV_CONST_ALPHA;
-   }
-
-   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, factor);
-   return BLENDFACT_ZERO;
-}
-
-int
-intel_translate_logic_op(GLenum opcode)
-{
-   switch (opcode) {
-   case GL_CLEAR:
-      return LOGICOP_CLEAR;
-   case GL_AND:
-      return LOGICOP_AND;
-   case GL_AND_REVERSE:
-      return LOGICOP_AND_RVRSE;
-   case GL_COPY:
-      return LOGICOP_COPY;
-   case GL_COPY_INVERTED:
-      return LOGICOP_COPY_INV;
-   case GL_AND_INVERTED:
-      return LOGICOP_AND_INV;
-   case GL_NOOP:
-      return LOGICOP_NOOP;
-   case GL_XOR:
-      return LOGICOP_XOR;
-   case GL_OR:
-      return LOGICOP_OR;
-   case GL_OR_INVERTED:
-      return LOGICOP_OR_INV;
-   case GL_NOR:
-      return LOGICOP_NOR;
-   case GL_EQUIV:
-      return LOGICOP_EQUIV;
-   case GL_INVERT:
-      return LOGICOP_INV;
-   case GL_OR_REVERSE:
-      return LOGICOP_OR_RVRSE;
-   case GL_NAND:
-      return LOGICOP_NAND;
-   case GL_SET:
-      return LOGICOP_SET;
-   default:
-      return LOGICOP_SET;
-   }
-}
-
-
-static void
-intelClearColor(GLcontext * ctx, const GLfloat color[4])
-{
-   struct intel_context *intel = intel_context(ctx);
-   GLubyte clear[4];
-
-   CLAMPED_FLOAT_TO_UBYTE(clear[0], color[0]);
-   CLAMPED_FLOAT_TO_UBYTE(clear[1], color[1]);
-   CLAMPED_FLOAT_TO_UBYTE(clear[2], color[2]);
-   CLAMPED_FLOAT_TO_UBYTE(clear[3], color[3]);
-
-   /* compute both 32 and 16-bit clear values */
-   intel->ClearColor8888 = INTEL_PACKCOLOR8888(clear[0], clear[1],
-                                               clear[2], clear[3]);
-   intel->ClearColor565 = INTEL_PACKCOLOR565(clear[0], clear[1], clear[2]);
-}
-
-
-/**
- * Update the viewport transformation matrix.  Depends on:
- *  - viewport pos/size
- *  - depthrange
- *  - window pos/size or FBO size
- */
-static void
-intelCalcViewport(GLcontext * ctx)
-{
-   struct intel_context *intel = intel_context(ctx);
-   const GLfloat *v = ctx->Viewport._WindowMap.m;
-   const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
-   GLfloat *m = intel->ViewportMatrix.m;
-   GLfloat yScale, yBias;
-
-   if (ctx->DrawBuffer->Name) {
-      /* User created FBO */
-      struct intel_renderbuffer *irb
-         = intel_renderbuffer(ctx->DrawBuffer->_ColorDrawBuffers[0]);
-      if (irb && !irb->RenderToTexture) {
-         /* y=0=top */
-         yScale = -1.0;
-         yBias = irb->Base.Height;
-      }
-      else {
-         /* y=0=bottom */
-         yScale = 1.0;
-         yBias = 0.0;
-      }
-   }
-   else {
-      /* window buffer, y=0=top */
-      yScale = -1.0;
-      yBias = (intel->driDrawable) ? intel->driDrawable->h : 0.0F;
-   }
-
-   m[MAT_SX] = v[MAT_SX];
-   m[MAT_TX] = v[MAT_TX];
-
-   m[MAT_SY] = v[MAT_SY] * yScale;
-   m[MAT_TY] = v[MAT_TY] * yScale + yBias;
-
-   m[MAT_SZ] = v[MAT_SZ] * depthScale;
-   m[MAT_TZ] = v[MAT_TZ] * depthScale;
-}
-
-static void
-intelViewport(GLcontext * ctx,
-              GLint x, GLint y, GLsizei width, GLsizei height)
-{
-   intelCalcViewport(ctx);
-
-   intel_viewport(ctx, x, y, width, height);
-}
-
-static void
-intelDepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
-{
-   intelCalcViewport(ctx);
-}
-
-/* Fallback to swrast for select and feedback.
- */
-static void
-intelRenderMode(GLcontext * ctx, GLenum mode)
-{
-   struct intel_context *intel = intel_context(ctx);
-   FALLBACK(intel, INTEL_FALLBACK_RENDERMODE, (mode != GL_RENDER));
-}
-
-
-void
-intelInitStateFuncs(struct dd_function_table *functions)
-{
-   functions->RenderMode = intelRenderMode;
-   functions->Viewport = intelViewport;
-   functions->DepthRange = intelDepthRange;
-   functions->ClearColor = intelClearColor;
-}
+../intel/intel_state.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i915/intel_swapbuffers.c b/src/mesa/drivers/dri/i915/intel_swapbuffers.c
new file mode 120000
index 0000000000..148d5215aa
--- /dev/null
+++ b/src/mesa/drivers/dri/i915/intel_swapbuffers.c
@@ -0,0 +1 @@
+../intel/intel_swapbuffers.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i915/intel_syncobj.c b/src/mesa/drivers/dri/i915/intel_syncobj.c
new file mode 120000
index 0000000000..0b2e56ab24
--- /dev/null
+++ b/src/mesa/drivers/dri/i915/intel_syncobj.c
@@ -0,0 +1 @@
+../intel/intel_syncobj.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i915/intel_tris.c b/src/mesa/drivers/dri/i915/intel_tris.c
index e80996580c..a905455342 100644
--- a/src/mesa/drivers/dri/i915/intel_tris.c
+++ b/src/mesa/drivers/dri/i915/intel_tris.c
@@ -89,8 +89,8 @@ intel_flush_inline_primitive(struct intel_context *intel)
 
 static void intel_start_inline(struct intel_context *intel, uint32_t prim)
 {
-   BATCH_LOCALS;
    uint32_t batch_flags = LOOP_CLIPRECTS;
+   BATCH_LOCALS;
 
    intel->vtbl.emit_state(intel);
 
@@ -201,10 +201,10 @@ uint32_t *intel_get_prim_space(struct intel_context *intel, unsigned int count)
 /** Dispatches the accumulated primitive to the batchbuffer. */
 void intel_flush_prim(struct intel_context *intel)
 {
-   BATCH_LOCALS;
    dri_bo *aper_array[2];
    dri_bo *vb_bo;
    unsigned int offset, count;
+   BATCH_LOCALS;
 
    /* Must be called after an intel_start_prim. */
    assert(intel->prim.primitive != ~0);
@@ -989,7 +989,7 @@ intelChooseRenderState(GLcontext * ctx)
             intel->draw_tri = intel_fallback_tri;
 
          if (flags & DD_TRI_SMOOTH) {
-	    if (intel->strict_conformance)
+	    if (intel->conformance_mode > 0)
 	       intel->draw_tri = intel_fallback_tri;
 	 }
 
@@ -1001,7 +1001,7 @@ intelChooseRenderState(GLcontext * ctx)
 	 }
 
 	 if (flags & DD_POINT_SMOOTH) {
-	    if (intel->strict_conformance)
+	    if (intel->conformance_mode > 0)
 	       intel->draw_point = intel_fallback_point;
 	 }
 
@@ -1255,11 +1255,9 @@ intel_meta_draw_poly(struct intel_context *intel,
 {
    union fi *vb;
    GLint i;
-   GLboolean was_locked = intel->locked;
    unsigned int saved_vertex_size = intel->vertex_size;
 
-   if (!was_locked)
-       LOCK_HARDWARE(intel);
+   LOCK_HARDWARE(intel);
 
    intel->vertex_size = 6;
 
@@ -1283,8 +1281,7 @@ intel_meta_draw_poly(struct intel_context *intel,
 
    intel->vertex_size = saved_vertex_size;
 
-   if (!was_locked)
-       UNLOCK_HARDWARE(intel);
+   UNLOCK_HARDWARE(intel);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index 37a470f2e2..6e9a9a29a3 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -9,10 +9,12 @@ DRIVER_SOURCES = \
 	intel_blit.c \
 	intel_buffer_objects.c \
 	intel_buffers.c \
+	intel_clear.c \
 	intel_context.c \
 	intel_decode.c \
-	intel_depthstencil.c \
+	intel_extensions.c \
 	intel_fbo.c \
+	intel_generatemipmap.c \
 	intel_mipmap_tree.c \
 	intel_regions.c \
 	intel_screen.c \
@@ -21,7 +23,10 @@ DRIVER_SOURCES = \
 	intel_pixel_bitmap.c \
 	intel_pixel_copy.c \
 	intel_pixel_draw.c \
+	intel_pixel_read.c \
 	intel_state.c \
+	intel_swapbuffers.c \
+	intel_syncobj.c \
 	intel_tex.c \
 	intel_tex_copy.c \
 	intel_tex_format.c \
@@ -39,6 +44,7 @@ DRIVER_SOURCES = \
 	brw_clip_util.c \
 	brw_context.c \
 	brw_curbe.c \
+	brw_disasm.c \
 	brw_draw.c \
 	brw_draw_upload.c \
 	brw_eu.c \
@@ -49,7 +55,6 @@ DRIVER_SOURCES = \
 	brw_gs.c \
 	brw_gs_emit.c \
 	brw_gs_state.c \
-	brw_metaops.c \
 	brw_misc_state.c \
 	brw_program.c \
 	brw_queryobj.c \
@@ -68,6 +73,7 @@ DRIVER_SOURCES = \
 	brw_vs_constval.c \
 	brw_vs_emit.c \
 	brw_vs_state.c \
+	brw_vs_surface_state.c \
 	brw_vtbl.c \
 	brw_wm.c \
 	brw_wm_debug.c \
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index fa8121e02d..1088a7a607 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -39,12 +39,14 @@
 
 static void prepare_cc_vp( struct brw_context *brw )
 {
+   GLcontext *ctx = &brw->intel.ctx;
    struct brw_cc_viewport ccv;
 
    memset(&ccv, 0, sizeof(ccv));
 
-   ccv.min_depth = 0.0;
-   ccv.max_depth = 1.0;
+   /* _NEW_VIEWPORT */
+   ccv.min_depth = ctx->Viewport.Near;
+   ccv.max_depth = ctx->Viewport.Far;
 
    dri_bo_unreference(brw->cc.vp_bo);
    brw->cc.vp_bo = brw_cache_data( &brw->cache, BRW_CC_VP, &ccv, NULL, 0 );
@@ -52,7 +54,7 @@ static void prepare_cc_vp( struct brw_context *brw )
 
 const struct brw_tracked_state brw_cc_vp = {
    .dirty = {
-      .mesa = 0,
+      .mesa = _NEW_VIEWPORT,
       .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
@@ -83,59 +85,60 @@ struct brw_cc_unit_key {
 static void
 cc_unit_populate_key(struct brw_context *brw, struct brw_cc_unit_key *key)
 {
-   struct gl_stencil_attrib *stencil = brw->attribs.Stencil;
+   GLcontext *ctx = &brw->intel.ctx;
+   const unsigned back = ctx->Stencil._BackFace;
 
    memset(key, 0, sizeof(*key));
 
-   key->stencil = stencil->Enabled;
-   key->stencil_two_side = stencil->_TestTwoSide;
+   key->stencil = ctx->Stencil._Enabled;
+   key->stencil_two_side = ctx->Stencil._TestTwoSide;
 
    if (key->stencil) {
-      key->stencil_func[0] = stencil->Function[0];
-      key->stencil_fail_op[0] = stencil->FailFunc[0];
-      key->stencil_pass_depth_fail_op[0] = stencil->ZFailFunc[0];
-      key->stencil_pass_depth_pass_op[0] = stencil->ZPassFunc[0];
-      key->stencil_ref[0] = stencil->Ref[0];
-      key->stencil_write_mask[0] = stencil->WriteMask[0];
-      key->stencil_test_mask[0] = stencil->ValueMask[0];
+      key->stencil_func[0] = ctx->Stencil.Function[0];
+      key->stencil_fail_op[0] = ctx->Stencil.FailFunc[0];
+      key->stencil_pass_depth_fail_op[0] = ctx->Stencil.ZFailFunc[0];
+      key->stencil_pass_depth_pass_op[0] = ctx->Stencil.ZPassFunc[0];
+      key->stencil_ref[0] = ctx->Stencil.Ref[0];
+      key->stencil_write_mask[0] = ctx->Stencil.WriteMask[0];
+      key->stencil_test_mask[0] = ctx->Stencil.ValueMask[0];
    }
    if (key->stencil_two_side) {
-      key->stencil_func[1] = stencil->Function[1];
-      key->stencil_fail_op[1] = stencil->FailFunc[1];
-      key->stencil_pass_depth_fail_op[1] = stencil->ZFailFunc[1];
-      key->stencil_pass_depth_pass_op[1] = stencil->ZPassFunc[1];
-      key->stencil_ref[1] = stencil->Ref[1];
-      key->stencil_write_mask[1] = stencil->WriteMask[1];
-      key->stencil_test_mask[1] = stencil->ValueMask[1];
+      key->stencil_func[1] = ctx->Stencil.Function[back];
+      key->stencil_fail_op[1] = ctx->Stencil.FailFunc[back];
+      key->stencil_pass_depth_fail_op[1] = ctx->Stencil.ZFailFunc[back];
+      key->stencil_pass_depth_pass_op[1] = ctx->Stencil.ZPassFunc[back];
+      key->stencil_ref[1] = ctx->Stencil.Ref[back];
+      key->stencil_write_mask[1] = ctx->Stencil.WriteMask[back];
+      key->stencil_test_mask[1] = ctx->Stencil.ValueMask[back];
    }
 
-   if (brw->attribs.Color->_LogicOpEnabled)
-      key->logic_op = brw->attribs.Color->LogicOp;
+   if (ctx->Color._LogicOpEnabled)
+      key->logic_op = ctx->Color.LogicOp;
    else
       key->logic_op = GL_COPY;
 
-   key->color_blend = brw->attribs.Color->BlendEnabled;
+   key->color_blend = ctx->Color.BlendEnabled;
    if (key->color_blend) {
-      key->blend_eq_rgb = brw->attribs.Color->BlendEquationRGB;
-      key->blend_eq_a = brw->attribs.Color->BlendEquationA;
-      key->blend_src_rgb = brw->attribs.Color->BlendSrcRGB;
-      key->blend_dst_rgb = brw->attribs.Color->BlendDstRGB;
-      key->blend_src_a = brw->attribs.Color->BlendSrcA;
-      key->blend_dst_a = brw->attribs.Color->BlendDstA;
+      key->blend_eq_rgb = ctx->Color.BlendEquationRGB;
+      key->blend_eq_a = ctx->Color.BlendEquationA;
+      key->blend_src_rgb = ctx->Color.BlendSrcRGB;
+      key->blend_dst_rgb = ctx->Color.BlendDstRGB;
+      key->blend_src_a = ctx->Color.BlendSrcA;
+      key->blend_dst_a = ctx->Color.BlendDstA;
    }
 
-   key->alpha_enabled = brw->attribs.Color->AlphaEnabled;
+   key->alpha_enabled = ctx->Color.AlphaEnabled;
    if (key->alpha_enabled) {
-      key->alpha_func = brw->attribs.Color->AlphaFunc;
-      key->alpha_ref = brw->attribs.Color->AlphaRef;
+      key->alpha_func = ctx->Color.AlphaFunc;
+      key->alpha_ref = ctx->Color.AlphaRef;
    }
 
-   key->dither = brw->attribs.Color->DitherFlag;
+   key->dither = ctx->Color.DitherFlag;
 
-   key->depth_test = brw->attribs.Depth->Test;
+   key->depth_test = ctx->Depth.Test;
    if (key->depth_test) {
-      key->depth_func = brw->attribs.Depth->Func;
-      key->depth_write = brw->attribs.Depth->Mask;
+      key->depth_func = ctx->Depth.Func;
+      key->depth_write = ctx->Depth.Mask;
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
index 38d8b704d7..20a927cf38 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -65,21 +65,31 @@ static void compile_clip_prog( struct brw_context *brw,
    c.func.single_program_flow = 1;
 
    c.key = *key;
-
+   c.need_ff_sync = BRW_IS_IGDNG(brw);
 
    /* Need to locate the two positions present in vertex + header.
     * These are currently hardcoded:
     */
    c.header_position_offset = ATTR_SIZE;
 
-   for (i = 0, delta = REG_SIZE; i < VERT_RESULT_MAX; i++)
+   if (BRW_IS_IGDNG(brw))
+       delta = 3 * REG_SIZE;
+   else
+       delta = REG_SIZE;
+
+   for (i = 0; i < VERT_RESULT_MAX; i++)
       if (c.key.attrs & (1<<i)) {
 	 c.offset[i] = delta;
 	 delta += ATTR_SIZE;
       }
 
    c.nr_attrs = brw_count_bits(c.key.attrs);
-   c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+   
+   if (BRW_IS_IGDNG(brw))
+       c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
+   else
+       c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+
    c.nr_bytes = c.nr_regs * REG_SIZE;
 
    c.prog_data.clip_mode = c.key.clip_mode; /* XXX */
@@ -145,14 +155,19 @@ static void upload_clip_prog(struct brw_context *brw)
    /* CACHE_NEW_VS_PROG */
    key.attrs = brw->vs.prog_data->outputs_written;
    /* _NEW_LIGHT */
-   key.do_flat_shading = (brw->attribs.Light->ShadeModel == GL_FLAT);
+   key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT);
    /* _NEW_TRANSFORM */
-   key.nr_userclip = brw_count_bits(brw->attribs.Transform->ClipPlanesEnabled);
-   key.clip_mode = BRW_CLIPMODE_NORMAL;
+   key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
+
+   if (BRW_IS_IGDNG(brw))
+       key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP;
+   else
+       key.clip_mode = BRW_CLIPMODE_NORMAL;
 
    /* _NEW_POLYGON */
    if (key.primitive == GL_TRIANGLES) {
-      if (brw->attribs.Polygon->CullFaceMode == GL_FRONT_AND_BACK) 
+      if (ctx->Polygon.CullFlag &&
+	  ctx->Polygon.CullFaceMode == GL_FRONT_AND_BACK)
 	 key.clip_mode = BRW_CLIPMODE_REJECT_ALL;
       else {
 	 GLuint fill_front = CLIP_CULL;
@@ -160,44 +175,44 @@ static void upload_clip_prog(struct brw_context *brw)
 	 GLuint offset_front = 0;
 	 GLuint offset_back = 0;
 
-	 if (!brw->attribs.Polygon->CullFlag ||
-	     brw->attribs.Polygon->CullFaceMode != GL_FRONT) {
-	    switch (brw->attribs.Polygon->FrontMode) {
+	 if (!ctx->Polygon.CullFlag ||
+	     ctx->Polygon.CullFaceMode != GL_FRONT) {
+	    switch (ctx->Polygon.FrontMode) {
 	    case GL_FILL: 
 	       fill_front = CLIP_FILL; 
 	       offset_front = 0;
 	       break;
 	    case GL_LINE:
 	       fill_front = CLIP_LINE;
-	       offset_front = brw->attribs.Polygon->OffsetLine;
+	       offset_front = ctx->Polygon.OffsetLine;
 	       break;
 	    case GL_POINT:
 	       fill_front = CLIP_POINT;
-	       offset_front = brw->attribs.Polygon->OffsetPoint;
+	       offset_front = ctx->Polygon.OffsetPoint;
 	       break;
 	    }
 	 }
 
-	 if (!brw->attribs.Polygon->CullFlag ||
-	     brw->attribs.Polygon->CullFaceMode != GL_BACK) {
-	    switch (brw->attribs.Polygon->BackMode) {
+	 if (!ctx->Polygon.CullFlag ||
+	     ctx->Polygon.CullFaceMode != GL_BACK) {
+	    switch (ctx->Polygon.BackMode) {
 	    case GL_FILL: 
 	       fill_back = CLIP_FILL; 
 	       offset_back = 0;
 	       break;
 	    case GL_LINE:
 	       fill_back = CLIP_LINE;
-	       offset_back = brw->attribs.Polygon->OffsetLine;
+	       offset_back = ctx->Polygon.OffsetLine;
 	       break;
 	    case GL_POINT:
 	       fill_back = CLIP_POINT;
-	       offset_back = brw->attribs.Polygon->OffsetPoint;
+	       offset_back = ctx->Polygon.OffsetPoint;
 	       break;
 	    }
 	 }
 
-	 if (brw->attribs.Polygon->BackMode != GL_FILL ||
-	     brw->attribs.Polygon->FrontMode != GL_FILL) {
+	 if (ctx->Polygon.BackMode != GL_FILL ||
+	     ctx->Polygon.FrontMode != GL_FILL) {
 	    key.do_unfilled = 1;
 
 	    /* Most cases the fixed function units will handle.  Cases where
@@ -207,17 +222,17 @@ static void upload_clip_prog(struct brw_context *brw)
 
 	    if (offset_back || offset_front) {
 	       /* _NEW_POLYGON, _NEW_BUFFERS */
-	       key.offset_units = brw->attribs.Polygon->OffsetUnits * brw->intel.polygon_offset_scale;
-	       key.offset_factor = brw->attribs.Polygon->OffsetFactor * ctx->DrawBuffer->_MRD;
+	       key.offset_units = ctx->Polygon.OffsetUnits * brw->intel.polygon_offset_scale;
+	       key.offset_factor = ctx->Polygon.OffsetFactor * ctx->DrawBuffer->_MRD;
 	    }
 
-	    switch (brw->attribs.Polygon->FrontFace) {
+	    switch (ctx->Polygon.FrontFace) {
 	    case GL_CCW:
 	       key.fill_ccw = fill_front;
 	       key.fill_cw = fill_back;
 	       key.offset_ccw = offset_front;
 	       key.offset_cw = offset_back;
-	       if (brw->attribs.Light->Model.TwoSide &&
+	       if (ctx->Light.Model.TwoSide &&
 		   key.fill_cw != CLIP_CULL) 
 		  key.copy_bfc_cw = 1;
 	       break;
@@ -226,7 +241,7 @@ static void upload_clip_prog(struct brw_context *brw)
 	       key.fill_ccw = fill_back;
 	       key.offset_cw = offset_front;
 	       key.offset_ccw = offset_back;
-	       if (brw->attribs.Light->Model.TwoSide &&
+	       if (ctx->Light.Model.TwoSide &&
 		   key.fill_ccw != CLIP_CULL) 
 		  key.copy_bfc_ccw = 1;
 	       break;
diff --git a/src/mesa/drivers/dri/i965/brw_clip.h b/src/mesa/drivers/dri/i965/brw_clip.h
index e06747864b..957df441ab 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.h
+++ b/src/mesa/drivers/dri/i965/brw_clip.h
@@ -100,6 +100,8 @@ struct brw_clip_compile {
       
       struct brw_reg fixed_planes;
       struct brw_reg plane_equation;
+       
+      struct brw_reg ff_sync;
    } reg;
 
    /* 3 different ways of expressing vertex size:
@@ -117,6 +119,7 @@ struct brw_clip_compile {
 
    GLuint header_position_offset;
    GLuint offset[VERT_ATTRIB_MAX];
+   GLboolean need_ff_sync;
 };
 
 #define ATTR_SIZE  (4*4)
@@ -171,5 +174,6 @@ struct brw_reg get_tmp( struct brw_clip_compile *c );
 
 void brw_clip_project_position(struct brw_clip_compile *c,
              struct brw_reg pos );
-
+void brw_clip_ff_sync(struct brw_clip_compile *c);
+void brw_clip_init_ff_sync(struct brw_clip_compile *c);
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_clip_line.c b/src/mesa/drivers/dri/i965/brw_clip_line.c
index c45d48dff8..048ca620fa 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_line.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_line.c
@@ -85,6 +85,10 @@ static void brw_clip_line_alloc_regs( struct brw_clip_compile *c )
       i++;
    }
 
+   if (c->need_ff_sync) {
+      c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
+      i++;
+   }
 
    c->first_tmp = i;
    c->last_tmp = i;
@@ -130,7 +134,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
    struct brw_instruction *plane_loop;
    struct brw_instruction *plane_active;
    struct brw_instruction *is_negative;
-   struct brw_instruction *is_neg2;
+   struct brw_instruction *is_neg2 = NULL;
    struct brw_instruction *not_culled;
    struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD);
 
@@ -148,7 +152,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
    brw_clip_init_clipmask(c);
 
    /* -ve rhw workaround */
-   if (!BRW_IS_G4X(p->brw)) {
+   if (BRW_IS_965(p->brw)) {
       brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
       brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
               brw_imm_ud(1<<20));
@@ -181,34 +185,54 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
 	 brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation);
 	 is_negative = brw_IF(p, BRW_EXECUTE_1);
 	 {
-	    brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0));
-	    brw_math_invert(p, c->reg.t, c->reg.t);
-	    brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1);
-
-	    brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 );
-	    brw_MOV(p, c->reg.t1, c->reg.t);
-	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+             /*
+              * Both can be negative on GM965/G965 due to the RHW workaround;
+              * if so, this object should be rejected.
+              */
+             if (BRW_IS_965(p->brw)) {
+                 brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0));
+                 is_neg2 = brw_IF(p, BRW_EXECUTE_1);
+                 {
+                     brw_clip_kill_thread(c);
+                 }
+                 brw_ENDIF(p, is_neg2);
+             }
+
+             brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0));
+             brw_math_invert(p, c->reg.t, c->reg.t);
+             brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1);
+
+             brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 );
+             brw_MOV(p, c->reg.t1, c->reg.t);
+             brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 	 } 
 	 is_negative = brw_ELSE(p, is_negative);
 	 {
-	    /* Coming back in.  We know that both cannot be negative
-	     * because the line would have been culled in that case.
-	     */
+             /* Coming back in.  We know that both cannot be negative
+              * because the line would have been culled in that case.
+              */
+
+             /* If both are positive, do nothing */
+             /* Only on GM965/G965 */
+             if (BRW_IS_965(p->brw)) {
+                 brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
+                 is_neg2 = brw_IF(p, BRW_EXECUTE_1);
+             }
 
-	    /* If both are positive, do nothing */
-             brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
-             is_neg2 = brw_IF(p, BRW_EXECUTE_1);
              {
-		brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1));
-		brw_math_invert(p, c->reg.t, c->reg.t);
-		brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0);
-
-		brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 );
-		brw_MOV(p, c->reg.t0, c->reg.t);
-		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-	     }
-	     brw_ENDIF(p, is_neg2);
-	 }
+                 brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1));
+                 brw_math_invert(p, c->reg.t, c->reg.t);
+                 brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0);
+
+                 brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 );
+                 brw_MOV(p, c->reg.t0, c->reg.t);
+                 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+             }
+
+             if (BRW_IS_965(p->brw)) {
+                 brw_ENDIF(p, is_neg2);
+             }
+         }
 	 brw_ENDIF(p, is_negative);	 
       }
       brw_ENDIF(p, plane_active);
@@ -243,6 +267,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
 void brw_emit_line_clip( struct brw_clip_compile *c )
 {
    brw_clip_line_alloc_regs(c);
+   brw_clip_init_ff_sync(c);
 
    if (c->key.do_flat_shading)
       brw_clip_copy_colors(c, 0, 1);
diff --git a/src/mesa/drivers/dri/i965/brw_clip_point.c b/src/mesa/drivers/dri/i965/brw_clip_point.c
index d17b199b89..8458f61c5a 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_point.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_point.c
@@ -50,5 +50,7 @@ void brw_emit_point_clip( struct brw_clip_compile *c )
    /* Send an empty message to kill the thread:
     */
    brw_clip_tri_alloc_regs(c, 0);
+   brw_clip_init_ff_sync(c);
+
    brw_clip_kill_thread(c);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c b/src/mesa/drivers/dri/i965/brw_clip_state.c
index 9b0d7eab7b..234b3744bf 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_state.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_state.c
@@ -43,11 +43,14 @@ struct brw_clip_unit_key {
    unsigned int curbe_offset;
 
    unsigned int nr_urb_entries, urb_size;
+
+   GLboolean depth_clamp;
 };
 
 static void
 clip_unit_populate_key(struct brw_context *brw, struct brw_clip_unit_key *key)
 {
+   GLcontext *ctx = &brw->intel.ctx;
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_CLIP_PROG */
@@ -62,6 +65,9 @@ clip_unit_populate_key(struct brw_context *brw, struct brw_clip_unit_key *key)
    /* BRW_NEW_URB_FENCE */
    key->nr_urb_entries = brw->urb.nr_clip_entries;
    key->urb_size = brw->urb.vsize;
+
+   /* _NEW_TRANSFORM */
+   key->depth_clamp = ctx->Transform.DepthClamp;
 }
 
 static dri_bo *
@@ -95,7 +101,14 @@ clip_unit_create_from_key(struct brw_context *brw,
        * even number.
        */
       assert(key->nr_urb_entries % 2 == 0);
-      clip.thread4.max_threads = 2 - 1;
+      
+      /* Although up to 16 concurrent Clip threads are allowed on IGDNG, 
+       * only 2 threads can output VUEs at a time.
+       */
+      if (BRW_IS_IGDNG(brw))
+         clip.thread4.max_threads = 16 - 1;        
+      else
+         clip.thread4.max_threads = 2 - 1;
    } else {
       assert(key->nr_urb_entries >= 5);
       clip.thread4.max_threads = 1 - 1;
@@ -110,7 +123,8 @@ clip_unit_create_from_key(struct brw_context *brw,
    clip.clip5.userclip_enable_flags = 0x7f;
    clip.clip5.userclip_must_clip = 1;
    clip.clip5.guard_band_enable = 0;
-   clip.clip5.viewport_z_clip_enable = 1;
+   if (!key->depth_clamp)
+      clip.clip5.viewport_z_clip_enable = 1;
    clip.clip5.viewport_xy_clip_enable = 1;
    clip.clip5.vertex_position_space = BRW_CLIP_NDCSPACE;
    clip.clip5.api_mode = BRW_CLIP_API_OGL;
@@ -161,7 +175,7 @@ static void upload_clip_unit( struct brw_context *brw )
 
 const struct brw_tracked_state brw_clip_unit = {
    .dirty = {
-      .mesa  = 0,
+      .mesa  = _NEW_TRANSFORM,
       .brw   = (BRW_NEW_CURBE_OFFSETS |
 		BRW_NEW_URB_FENCE),
       .cache = CACHE_NEW_CLIP_PROG
diff --git a/src/mesa/drivers/dri/i965/brw_clip_tri.c b/src/mesa/drivers/dri/i965/brw_clip_tri.c
index 1dbba37fe7..0efd77225e 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_tri.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_tri.c
@@ -77,6 +77,10 @@ void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
    if (c->nr_attrs & 1) {
       for (j = 0; j < 3; j++) {
 	 GLuint delta = c->nr_attrs*16 + 32;
+
+         if (BRW_IS_IGDNG(c->func.brw))
+             delta = c->nr_attrs * 16 + 32 * 3;
+
 	 brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0));
       }
    }
@@ -115,6 +119,11 @@ void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
       i++;
    }
 
+   if (c->need_ff_sync) {
+      c->reg.ff_sync = retype(brw_vec1_grf(i, 0), BRW_REGISTER_TYPE_UD);
+      i++;
+   }
+
    c->first_tmp = i;
    c->last_tmp = i;
 
@@ -455,6 +464,8 @@ static void brw_clip_test( struct brw_clip_compile *c )
     struct brw_indirect vt2 = brw_indirect(2, 0);
 
     struct brw_compile *p = &c->func;
+    struct brw_instruction *is_outside;
+    struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */
 
     brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0]));
     brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1]));
@@ -462,53 +473,87 @@ static void brw_clip_test( struct brw_clip_compile *c )
     brw_MOV(p, v0, deref_4f(vt0, c->offset[VERT_RESULT_HPOS]));
     brw_MOV(p, v1, deref_4f(vt1, c->offset[VERT_RESULT_HPOS]));
     brw_MOV(p, v2, deref_4f(vt2, c->offset[VERT_RESULT_HPOS]));
+    brw_AND(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(~0x3f));
 
     /* test nearz, xmin, ymin plane */
-    brw_CMP(p, t1, BRW_CONDITIONAL_LE, negate(v0), get_element(v0, 3)); 
+    /* clip.xyz < -clip.w */
+    brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, negate(get_element(v0, 3))); 
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, negate(get_element(v1, 3))); 
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, t2, BRW_CONDITIONAL_LE, negate(v1), get_element(v1, 3)); 
+    brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, negate(get_element(v2, 3))); 
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, t3, BRW_CONDITIONAL_LE, negate(v2), get_element(v2, 3)); 
+
+    /* All vertices are outside of a plane, rejected */
+    brw_AND(p, t, t1, t2);
+    brw_AND(p, t, t, t3);
+    brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1));
+    brw_OR(p, tmp0, tmp0, get_element(t, 2));
+    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+    brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1));
+    is_outside = brw_IF(p, BRW_EXECUTE_1);
+    {
+        brw_clip_kill_thread(c);
+    }
+    brw_ENDIF(p, is_outside);
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+    /* some vertices are inside a plane, some are outside, need to clip */
     brw_XOR(p, t, t1, t2);
     brw_XOR(p, t1, t2, t3);
     brw_OR(p, t, t, t1);
-
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
-	    get_element(t, 0), brw_imm_ud(0));
+    brw_AND(p, t, t, brw_imm_ud(0x1));
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 0), brw_imm_ud(0));
     brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<5)));
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
-	    get_element(t, 1), brw_imm_ud(0));
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 1), brw_imm_ud(0));
     brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<3)));
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
-	    get_element(t, 2), brw_imm_ud(0));
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 2), brw_imm_ud(0));
     brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<1)));
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 
     /* test farz, xmax, ymax plane */
-    brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, get_element(v0, 3)); 
+    /* clip.xyz > clip.w */
+    brw_CMP(p, t1, BRW_CONDITIONAL_G, v0, get_element(v0, 3)); 
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, get_element(v1, 3)); 
+    brw_CMP(p, t2, BRW_CONDITIONAL_G, v1, get_element(v1, 3)); 
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, get_element(v2, 3)); 
+    brw_CMP(p, t3, BRW_CONDITIONAL_G, v2, get_element(v2, 3)); 
+    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+
+    /* All vertices are outside of a plane, rejected */
+    brw_AND(p, t, t1, t2);
+    brw_AND(p, t, t, t3);
+    brw_OR(p, tmp0, get_element(t, 0), get_element(t, 1));
+    brw_OR(p, tmp0, tmp0, get_element(t, 2));
+    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
+    brw_AND(p, brw_null_reg(), tmp0, brw_imm_ud(0x1));
+    is_outside = brw_IF(p, BRW_EXECUTE_1);
+    {
+        brw_clip_kill_thread(c);
+    }
+    brw_ENDIF(p, is_outside);
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 
+    /* some vertices are inside a plane, some are outside, need to clip */
     brw_XOR(p, t, t1, t2);
     brw_XOR(p, t1, t2, t3);
     brw_OR(p, t, t, t1);
-
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
-	    get_element(t, 0), brw_imm_ud(0));
+    brw_AND(p, t, t, brw_imm_ud(0x1));
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 0), brw_imm_ud(0));
     brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<4)));
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
-	    get_element(t, 1), brw_imm_ud(0));
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 1), brw_imm_ud(0));
     brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<2)));
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, 
-	    get_element(t, 2), brw_imm_ud(0));
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
+            get_element(t, 2), brw_imm_ud(0));
     brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<0)));
     brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 
@@ -523,10 +568,11 @@ void brw_emit_tri_clip( struct brw_clip_compile *c )
    brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
    brw_clip_tri_init_vertices(c);
    brw_clip_init_clipmask(c);
+   brw_clip_init_ff_sync(c);
 
    /* if -ve rhw workaround bit is set, 
       do cliptest */
-   if (!BRW_IS_G4X(p->brw)) {
+   if (BRW_IS_965(p->brw)) {
       brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
       brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), 
               brw_imm_ud(1<<20));
@@ -543,11 +589,12 @@ void brw_emit_tri_clip( struct brw_clip_compile *c )
    if (c->key.do_flat_shading) 
       brw_clip_tri_flat_shade(c); 
       
-   if (c->key.clip_mode == BRW_CLIPMODE_NORMAL)
+   if ((c->key.clip_mode == BRW_CLIPMODE_NORMAL) ||
+       (c->key.clip_mode == BRW_CLIPMODE_KERNEL_CLIP))
       do_clip_tri(c);
    else 
       maybe_do_clip_tri(c);
-      
+
    brw_clip_tri_emit_polygon(c);
 
    /* Send an empty message to kill the thread:
diff --git a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
index d7ca517927..ad1bfa435f 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
@@ -453,6 +453,7 @@ void brw_emit_unfilled_clip( struct brw_clip_compile *c )
 
    brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6);
    brw_clip_tri_init_vertices(c);
+   brw_clip_init_ff_sync(c);
 
    assert(c->offset[VERT_RESULT_EDGE]);
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/mesa/drivers/dri/i965/brw_clip_util.c
index 9d3b0be694..5a73abdfee 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_util.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_util.c
@@ -140,6 +140,10 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
 
    /* Just copy the vertex header:
     */
+   /*
+    * After CLIP stage, only first 256 bits of the VUE are read
+    * back on IGDNG, so needn't change it
+    */
    brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1);
       
    /* Iterate over each attribute (could be done in pairs?)
@@ -147,6 +151,9 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
    for (i = 0; i < c->nr_attrs; i++) {
       GLuint delta = i*16 + 32;
 
+      if (BRW_IS_IGDNG(p->brw))
+          delta = i * 16 + 32 * 3;
+
       if (delta == c->offset[VERT_RESULT_EDGE]) {
 	 if (force_edgeflag) 
 	    brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1));
@@ -177,6 +184,10 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
 
    if (i & 1) {
       GLuint delta = i*16 + 32;
+
+      if (BRW_IS_IGDNG(p->brw))
+          delta = i * 16 + 32 * 3;
+
       brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0));
    }
 
@@ -202,6 +213,8 @@ void brw_clip_emit_vue(struct brw_clip_compile *c,
    struct brw_compile *p = &c->func;
    GLuint start = c->last_mrf;
 
+   brw_clip_ff_sync(c);
+
    assert(!(allocate && eot));
    
    /* Cycle through mrf regs - probably futile as we have to wait for
@@ -252,6 +265,7 @@ void brw_clip_kill_thread(struct brw_clip_compile *c)
 {
    struct brw_compile *p = &c->func;
 
+   brw_clip_ff_sync(c);
    /* Send an empty message to kill the thread and release any
     * allocated urb entry:
     */
@@ -343,3 +357,40 @@ void brw_clip_init_clipmask( struct brw_clip_compile *c )
    }
 }
 
+void brw_clip_ff_sync(struct brw_clip_compile *c)
+{
+    if (c->need_ff_sync) {
+        struct brw_compile *p = &c->func;
+        struct brw_instruction *need_ff_sync;
+
+        brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
+        brw_AND(p, brw_null_reg(), c->reg.ff_sync, brw_imm_ud(0x1));
+        need_ff_sync = brw_IF(p, BRW_EXECUTE_1);
+        {
+            brw_OR(p, c->reg.ff_sync, c->reg.ff_sync, brw_imm_ud(0x1));
+            brw_ff_sync(p, 
+                    c->reg.R0,
+                    0,
+                    c->reg.R0,
+                    1,	
+                    1,		/* used */
+                    1,  	/* msg length */
+                    1,		/* response length */
+                    0,		/* eot */
+                    1,		/* write complete */
+                    0,		/* urb offset */
+                    BRW_URB_SWIZZLE_NONE);
+        }
+        brw_ENDIF(p, need_ff_sync);
+        brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+    }
+}
+
+void brw_clip_init_ff_sync(struct brw_clip_compile *c)
+{
+    if (c->need_ff_sync) {
+	struct brw_compile *p = &c->func;
+        
+        brw_MOV(p, c->reg.ff_sync, brw_imm_ud(0));
+    }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index d7a2bd95ee..3c5b848319 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -76,30 +76,6 @@ static void brwInitDriverFunctions( struct dd_function_table *functions )
    functions->Viewport = intel_viewport;
 }
 
-
-static void brw_init_attribs( struct brw_context *brw )
-{
-   GLcontext *ctx = &brw->intel.ctx;
-
-   brw->attribs.Color = &ctx->Color;
-   brw->attribs.Depth = &ctx->Depth;
-   brw->attribs.Fog = &ctx->Fog;
-   brw->attribs.Hint = &ctx->Hint;
-   brw->attribs.Light = &ctx->Light;
-   brw->attribs.Line = &ctx->Line;
-   brw->attribs.Point = &ctx->Point;
-   brw->attribs.Polygon = &ctx->Polygon;
-   brw->attribs.Scissor = &ctx->Scissor;
-   brw->attribs.Stencil = &ctx->Stencil;
-   brw->attribs.Texture = &ctx->Texture;
-   brw->attribs.Transform = &ctx->Transform;
-   brw->attribs.Viewport = &ctx->Viewport;
-   brw->attribs.VertexProgram = &ctx->VertexProgram;
-   brw->attribs.FragmentProgram = &ctx->FragmentProgram;
-   brw->attribs.PolygonStipple = &ctx->PolygonStipple[0];
-}
-
-
 GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
 			    __DRIcontextPrivate *driContextPriv,
 			    void *sharedContextPrivate)
@@ -135,21 +111,48 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
                                      ctx->Const.MaxTextureImageUnits);
    ctx->Const.MaxVertexTextureImageUnits = 0; /* no vertex shader textures */
 
-   /* Advertise the full hardware capabilities.  The new memory
-    * manager should cope much better with overload situations:
+   /* Mesa limits textures to 4kx4k; it would be nice to fix that someday
     */
-   ctx->Const.MaxTextureLevels = 12;
+   ctx->Const.MaxTextureLevels = 13;
    ctx->Const.Max3DTextureLevels = 9;
    ctx->Const.MaxCubeTextureLevels = 12;
-   ctx->Const.MaxTextureRectSize = (1<<11);
+   ctx->Const.MaxTextureRectSize = (1<<12);
    
+   ctx->Const.MaxTextureMaxAnisotropy = 16.0;
+
    /* if conformance mode is set, swrast can handle any size AA point */
    ctx->Const.MaxPointSizeAA = 255.0;
 
-/*    ctx->Const.MaxNativeVertexProgramTemps = 32; */
+   /* We want the GLSL compiler to emit code that uses condition codes */
+   ctx->Shader.EmitCondCodes = GL_TRUE;
+
+   ctx->Const.VertexProgram.MaxNativeInstructions = (16 * 1024);
+   ctx->Const.VertexProgram.MaxAluInstructions = 0;
+   ctx->Const.VertexProgram.MaxTexInstructions = 0;
+   ctx->Const.VertexProgram.MaxTexIndirections = 0;
+   ctx->Const.VertexProgram.MaxNativeAluInstructions = 0;
+   ctx->Const.VertexProgram.MaxNativeTexInstructions = 0;
+   ctx->Const.VertexProgram.MaxNativeTexIndirections = 0;
+   ctx->Const.VertexProgram.MaxNativeAttribs = 16;
+   ctx->Const.VertexProgram.MaxNativeTemps = 256;
+   ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
+   ctx->Const.VertexProgram.MaxNativeParameters = 1024;
+   ctx->Const.VertexProgram.MaxEnvParams =
+      MIN2(ctx->Const.VertexProgram.MaxNativeParameters,
+	   ctx->Const.VertexProgram.MaxEnvParams);
+
+   ctx->Const.FragmentProgram.MaxNativeInstructions = (16 * 1024);
+   ctx->Const.FragmentProgram.MaxNativeAluInstructions = (16 * 1024);
+   ctx->Const.FragmentProgram.MaxNativeTexInstructions = (16 * 1024);
+   ctx->Const.FragmentProgram.MaxNativeTexIndirections = (16 * 1024);
+   ctx->Const.FragmentProgram.MaxNativeAttribs = 12;
+   ctx->Const.FragmentProgram.MaxNativeTemps = 256;
+   ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;
+   ctx->Const.FragmentProgram.MaxNativeParameters = 1024;
+   ctx->Const.FragmentProgram.MaxEnvParams =
+      MIN2(ctx->Const.FragmentProgram.MaxNativeParameters,
+	   ctx->Const.FragmentProgram.MaxEnvParams);
 
-   brw_init_attribs( brw );
-   brw_init_metaops( brw );
    brw_init_state( brw );
 
    brw->state.dirty.mesa = ~0;
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 5d3f99e025..a5209ac41b 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -46,7 +46,7 @@
  *
  * CURBE - constant URB entry.  An urb region (entry) used to hold
  * constant values which the fixed function units can be instructed to
- * preload into the GRF when spawining a thread.
+ * preload into the GRF when spawning a thread.
  *
  * VUE - vertex URB entry.  An urb entry holding a vertex and usually
  * a vertex header.  The header contains control information and
@@ -63,7 +63,7 @@
  * special and may be overwritten.
  *
  * MRF - message register file.  Threads communicate (and terminate)
- * by sending messages.  Message parameters are placed in contigous
+ * by sending messages.  Message parameters are placed in contiguous
  * MRF registers.  All program output is via these messages.  URB
  * entries are populated by sending a message to the shared URB
  * function containing the new data, together with a control word,
@@ -129,9 +129,8 @@ struct brw_context;
 #define BRW_NEW_PRIMITIVE               0x40
 #define BRW_NEW_CONTEXT                 0x80
 #define BRW_NEW_WM_INPUT_DIMENSIONS     0x100
-#define BRW_NEW_INPUT_VARYING           0x200
 #define BRW_NEW_PSP                     0x800
-#define BRW_NEW_METAOPS                 0x1000
+#define BRW_NEW_WM_SURFACES		0x1000
 #define BRW_NEW_FENCE                   0x2000
 #define BRW_NEW_INDICES			0x4000
 #define BRW_NEW_VERTICES		0x8000
@@ -142,7 +141,9 @@ struct brw_context;
 #define BRW_NEW_BATCH			0x10000
 /** brw->depth_region updated */
 #define BRW_NEW_DEPTH_BUFFER		0x20000
-#define BRW_NEW_NR_SURFACES		0x40000
+#define BRW_NEW_NR_WM_SURFACES		0x40000
+#define BRW_NEW_NR_VS_SURFACES		0x80000
+#define BRW_NEW_INDEX_BUFFER		0x100000
 
 struct brw_state_flags {
    /** State update flags signalled by mesa internals */
@@ -155,19 +156,28 @@ struct brw_state_flags {
    GLuint cache;
 };
 
+
+/** Subclass of Mesa vertex program */
 struct brw_vertex_program {
    struct gl_vertex_program program;
    GLuint id;
+   dri_bo *const_buffer;    /**< Program constant buffer/surface */
+   GLboolean use_const_buffer;
 };
 
 
-
+/** Subclass of Mesa fragment program */
 struct brw_fragment_program {
    struct gl_fragment_program program;
-   GLuint id;
-};
+   GLuint id;  /**< serial no. to identify frag progs, never re-used */
+   GLboolean isGLSL;  /**< really, any IF/LOOP/CONT/BREAK instructions */
 
+   dri_bo *const_buffer;    /**< Program constant buffer/surface */
+   GLboolean use_const_buffer;
 
+   /** for debugging, which texture units are referenced */
+   GLbitfield tex_units_used;
+};
 
 
 /* Data about a particular attempt to compile a program.  Note that
@@ -183,7 +193,7 @@ struct brw_wm_prog_data {
    GLuint total_grf;
    GLuint total_scratch;
 
-   GLuint nr_params;
+   GLuint nr_params;       /**< number of float params/constants */
    GLboolean error;
 
    /* Pointer to tracked values (only valid once
@@ -222,6 +232,7 @@ struct brw_vs_prog_data {
    GLuint urb_read_length;
    GLuint total_grf;
    GLuint outputs_written;
+   GLuint nr_params;       /**< number of float params/constants */
 
    GLuint inputs_read;
 
@@ -238,8 +249,35 @@ struct brw_vs_ouput_sizes {
 };
 
 
+/** Number of texture sampler units */
 #define BRW_MAX_TEX_UNIT 16
-#define BRW_WM_MAX_SURF BRW_MAX_TEX_UNIT + MAX_DRAW_BUFFERS
+
+/**
+ * Size of our surface binding table for the WM.
+ * This contains pointers to the drawing surfaces and current texture
+ * objects, plus the fragment shader constant buffer (+1).
+ */
+#define BRW_WM_MAX_SURF (MAX_DRAW_BUFFERS + BRW_MAX_TEX_UNIT + 1)
+
+/**
+ * Helpers to convert drawing buffers, textures and constant buffers
+ * to surface binding table indexes, for WM.
+ */
+#define SURF_INDEX_DRAW(d)           (d)
+#define SURF_INDEX_FRAG_CONST_BUFFER (MAX_DRAW_BUFFERS) 
+#define SURF_INDEX_TEXTURE(t)        (MAX_DRAW_BUFFERS + 1 + (t))
+
+/**
+ * Size of surface binding table for the VS.
+ * Only one constant buffer for now.
+ */
+#define BRW_VS_MAX_SURF 1
+
+/**
+ * Only a VS constant buffer
+ */
+#define SURF_INDEX_VERT_CONST_BUFFER 0
+
 
 enum brw_cache_id {
    BRW_CC_VP,
@@ -303,26 +341,6 @@ struct brw_cache {
 };
 
 
-
-struct brw_state_pointers {
-   struct gl_colorbuffer_attrib	*Color;
-   struct gl_depthbuffer_attrib	*Depth;
-   struct gl_fog_attrib		*Fog;
-   struct gl_hint_attrib	*Hint;
-   struct gl_light_attrib	*Light;
-   struct gl_line_attrib	*Line;
-   struct gl_point_attrib	*Point;
-   struct gl_polygon_attrib	*Polygon;
-   GLuint                       *PolygonStipple;
-   struct gl_scissor_attrib	*Scissor;
-   struct gl_stencil_attrib	*Stencil;
-   struct gl_texture_attrib	*Texture;
-   struct gl_transform_attrib	*Transform;
-   struct gl_viewport_attrib	*Viewport;
-   struct gl_vertex_program_state *VertexProgram; 
-   struct gl_fragment_program_state *FragmentProgram;
-};
-
 /* Considered adding a member to this struct to document which flags
  * an update might raise so that ordering of the state atoms can be
  * checked or derived at runtime.  Dropped the idea in favor of having
@@ -372,6 +390,8 @@ struct brw_cached_batch_item {
 struct brw_vertex_element {
    const struct gl_client_array *glarray;
 
+   /** The corresponding Mesa vertex attribute */
+   gl_vert_attrib attrib;
    /** Size of a complete element */
    GLuint element_size;
    /** Number of uploaded elements for this input. */
@@ -387,7 +407,6 @@ struct brw_vertex_element {
 
 
 struct brw_vertex_info {
-   GLuint varying;  /* varying:1[VERT_ATTRIB_MAX] */
    GLuint sizes[ATTRIB_BIT_DWORDS * 2]; /* sizes:2[VERT_ATTRIB_MAX] */
 };
 
@@ -425,9 +444,13 @@ struct brw_query_object {
    unsigned int count;
 };
 
+
+/**
+ * brw_context is derived from intel_context.
+ */
 struct brw_context 
 {
-   struct intel_context intel;
+   struct intel_context intel;  /**< base class, must be first field */
    GLuint primitive;
 
    GLboolean emit_state_always;
@@ -436,11 +459,9 @@ struct brw_context
 
    struct {
       struct brw_state_flags dirty;
-      struct brw_tracked_state **atoms;
-      GLuint nr_atoms;
 
-      GLuint nr_draw_regions;
-      struct intel_region *draw_regions[MAX_DRAW_BUFFERS];
+      GLuint nr_color_regions;
+      struct intel_region *color_regions[MAX_DRAW_BUFFERS];
       struct intel_region *depth_region;
 
       /**
@@ -457,13 +478,16 @@ struct brw_context
       int validated_bo_count;
    } state;
 
-   struct brw_state_pointers attribs;
-   struct brw_cache cache;
+   struct brw_cache cache;  /**< non-surface items */
+   struct brw_cache surface_cache;  /**< surface items */
    struct brw_cached_batch_item *cached_batch_items;
 
    struct {
       struct brw_vertex_element inputs[VERT_ATTRIB_MAX];
 
+      struct brw_vertex_element *enabled[VERT_ATTRIB_MAX];
+      GLuint nr_enabled;
+
 #define BRW_NR_UPLOAD_BUFS 17
 #define BRW_UPLOAD_INIT_SIZE (128*1024)
 
@@ -487,31 +511,16 @@ struct brw_context
        */
       const struct _mesa_index_buffer *ib;
 
+      /* Updates to these fields are signaled by BRW_NEW_INDEX_BUFFER. */
       dri_bo *bo;
       unsigned int offset;
-   } ib;
-
-   struct {
-      /* Will be allocated on demand if needed.   
+      unsigned int size;
+      /* Offset to index buffer index to use in CMD_3D_PRIM so that we can
+       * avoid re-uploading the IB packet over and over if we're actually
+       * referencing the same index buffer.
        */
-      struct brw_state_pointers attribs;
-      struct gl_vertex_program *vp;
-      struct gl_fragment_program *fp, *fp_tex;
-
-      struct gl_buffer_object *vbo;
-
-      struct intel_region *saved_draw_region;
-      GLuint saved_nr_draw_regions;
-      struct intel_region *saved_depth_region;
-
-      GLuint restore_draw_buffers[MAX_DRAW_BUFFERS];
-      GLuint restore_num_draw_buffers;
-
-      struct gl_fragment_program *restore_fp;
-      
-      GLboolean active;
-   } metaops;
-
+      unsigned int start_vertex_offset;
+   } ib;
 
    /* Active vertex program: 
     */
@@ -556,19 +565,14 @@ struct brw_context
    /* BRW_NEW_CURBE_OFFSETS: 
     */
    struct {
-      GLuint wm_start;
-      GLuint wm_size;
+      GLuint wm_start;  /**< pos of first wm const in CURBE buffer */
+      GLuint wm_size;   /**< number of float[4] consts, multiple of 16 */
       GLuint clip_start;
       GLuint clip_size;
       GLuint vs_start;
       GLuint vs_size;
       GLuint total_size;
 
-      /* Dynamic tracker which changes to reflect the state referenced
-       * by active fp and vp program parameters:
-       */
-      struct brw_tracked_state tracked_state;
-
       dri_bo *curbe_bo;
       /** Offset within curbe_bo of space for current curbe entry */
       GLuint curbe_offset;
@@ -589,6 +593,11 @@ struct brw_context
 
       dri_bo *prog_bo;
       dri_bo *state_bo;
+
+      /** Binding table of pointers to surf_bo entries */
+      dri_bo *bind_bo;
+      dri_bo *surf_bo[BRW_VS_MAX_SURF];
+      GLuint nr_surfaces;      
    } vs;
 
    struct {
@@ -620,9 +629,10 @@ struct brw_context
       struct brw_wm_prog_data *prog_data;
       struct brw_wm_compile *compile_data;
 
-      /* Input sizes, calculated from active vertex program:
+      /** Input sizes, calculated from active vertex program.
+       * One bit per fragment program input attribute.
        */
-      GLuint input_size_masks[4];
+      GLbitfield input_size_masks[4];
 
       /** Array of surface default colors (texture border color) */
       dri_bo *sdc_bo[BRW_MAX_TEX_UNIT];
@@ -631,7 +641,7 @@ struct brw_context
       GLuint nr_surfaces;      
 
       GLuint max_threads;
-      dri_bo *scratch_buffer;
+      dri_bo *scratch_bo;
 
       GLuint sampler_count;
       dri_bo *sampler_bo;
@@ -671,8 +681,6 @@ struct brw_context
  * brw_vtbl.c
  */
 void brwInitVtbl( struct brw_context *brw );
-void brw_do_flush( struct brw_context *brw, 
-		   GLuint flags );
 
 /*======================================================================
  * brw_context.c
@@ -703,13 +711,6 @@ void brw_FrameBufferTexInit( struct brw_context *brw,
 void brw_FrameBufferTexDestroy( struct brw_context *brw );
 void brw_validate_textures( struct brw_context *brw );
 
-/*======================================================================
- * brw_metaops.c
- */
-
-void brw_init_metaops( struct brw_context *brw );
-void brw_destroy_metaops( struct brw_context *brw );
-
 
 /*======================================================================
  * brw_program.c
@@ -721,8 +722,12 @@ void brwInitFragProgFuncs( struct dd_function_table *functions );
  */
 void brw_upload_urb_fence(struct brw_context *brw);
 
-void brw_upload_constant_buffer_state(struct brw_context *brw);
+/* brw_curbe.c
+ */
+void brw_upload_cs_urb_state(struct brw_context *brw);
 
+/* brw_disasm.c */
+int brw_disasm (FILE *file, struct brw_instruction *inst);
 
 /*======================================================================
  * Inline conversion functions.  These are better-typed than the
@@ -734,6 +739,32 @@ brw_context( GLcontext *ctx )
    return (struct brw_context *)ctx;
 }
 
+static INLINE struct brw_vertex_program *
+brw_vertex_program(struct gl_vertex_program *p)
+{
+   return (struct brw_vertex_program *) p;
+}
+
+static INLINE const struct brw_vertex_program *
+brw_vertex_program_const(const struct gl_vertex_program *p)
+{
+   return (const struct brw_vertex_program *) p;
+}
+
+static INLINE struct brw_fragment_program *
+brw_fragment_program(struct gl_fragment_program *p)
+{
+   return (struct brw_fragment_program *) p;
+}
+
+static INLINE const struct brw_fragment_program *
+brw_fragment_program_const(const struct gl_fragment_program *p)
+{
+   return (const struct brw_fragment_program *) p;
+}
+
+
+
 #define DO_SETUP_BITS ((1<<(FRAG_ATTRIB_MAX)) - 1)
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index fbf473abf6..0b0e6931a0 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -36,30 +36,37 @@
 #include "main/macros.h"
 #include "main/enums.h"
 #include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
 #include "shader/prog_statevars.h"
 #include "intel_batchbuffer.h"
+#include "intel_regions.h"
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_state.h"
 #include "brw_util.h"
 
 
-/* Partition the CURBE between the various users of constant values:
+/**
+ * Partition the CURBE between the various users of constant values:
+ * Note that vertex and fragment shaders can now fetch constants out
+ * of constant buffers.  We no longer allocate a block of the GRF for
+ * constants.  That greatly reduces the demand for space in the CURBE.
+ * Some of the comments within are dated...
  */
 static void calculate_curbe_offsets( struct brw_context *brw )
 {
+   GLcontext *ctx = &brw->intel.ctx;
    /* CACHE_NEW_WM_PROG */
-   GLuint nr_fp_regs = (brw->wm.prog_data->nr_params + 15) / 16;
+   const GLuint nr_fp_regs = (brw->wm.prog_data->nr_params + 15) / 16;
    
    /* BRW_NEW_VERTEX_PROGRAM */
-   struct brw_vertex_program *vp = (struct brw_vertex_program *)brw->vertex_program;
-   GLuint nr_vp_regs = (vp->program.Base.Parameters->NumParameters * 4 + 15) / 16;
+   const GLuint nr_vp_regs = (brw->vs.prog_data->nr_params + 15) / 16;
    GLuint nr_clip_regs = 0;
    GLuint total_regs;
 
    /* _NEW_TRANSFORM */
-   if (brw->attribs.Transform->ClipPlanesEnabled) {
-      GLuint nr_planes = 6 + brw_count_bits(brw->attribs.Transform->ClipPlanesEnabled);
+   if (ctx->Transform.ClipPlanesEnabled) {
+      GLuint nr_planes = 6 + brw_count_bits(ctx->Transform.ClipPlanesEnabled);
       nr_clip_regs = (nr_planes * 4 + 15) / 16;
    }
 
@@ -137,24 +144,24 @@ const struct brw_tracked_state brw_curbe_offsets = {
  * fixed-function hardware in a double-buffering scheme to avoid a
  * pipeline stall each time the contents of the curbe is changed.
  */
-void brw_upload_constant_buffer_state(struct brw_context *brw)
+void brw_upload_cs_urb_state(struct brw_context *brw)
 {
-   struct brw_constant_buffer_state cbs; 
-   memset(&cbs, 0, sizeof(cbs));
+   struct brw_cs_urb_state cs_urb;
+   memset(&cs_urb, 0, sizeof(cs_urb));
 
    /* It appears that this is the state packet for the CS unit, ie. the
     * urb entries detailed here are housed in the CS range from the
     * URB_FENCE command.
     */
-   cbs.header.opcode = CMD_CONST_BUFFER_STATE;
-   cbs.header.length = sizeof(cbs)/4 - 2;
+   cs_urb.header.opcode = CMD_CS_URB_STATE;
+   cs_urb.header.length = sizeof(cs_urb)/4 - 2;
 
    /* BRW_NEW_URB_FENCE */
-   cbs.bits0.nr_urb_entries = brw->urb.nr_cs_entries;
-   cbs.bits0.urb_entry_size = brw->urb.csize - 1;
+   cs_urb.bits0.nr_urb_entries = brw->urb.nr_cs_entries;
+   cs_urb.bits0.urb_entry_size = brw->urb.csize - 1;
 
    assert(brw->urb.nr_cs_entries);
-   BRW_CACHED_BATCH_STRUCT(brw, &cbs);
+   BRW_CACHED_BATCH_STRUCT(brw, &cs_urb);
 }
 
 static GLfloat fixed_plane[6][4] = {
@@ -173,42 +180,35 @@ static GLfloat fixed_plane[6][4] = {
 static void prepare_constant_buffer(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
-   struct brw_vertex_program *vp = (struct brw_vertex_program *)brw->vertex_program;
-   struct brw_fragment_program *fp = (struct brw_fragment_program *)brw->fragment_program;
-   GLuint sz = brw->curbe.total_size;
-   GLuint bufsz = sz * 16 * sizeof(GLfloat);
+   const struct brw_vertex_program *vp =
+      brw_vertex_program_const(brw->vertex_program);
+   const struct brw_fragment_program *fp =
+      brw_fragment_program_const(brw->fragment_program);
+   const GLuint sz = brw->curbe.total_size;
+   const GLuint bufsz = sz * 16 * sizeof(GLfloat);
    GLfloat *buf;
    GLuint i;
 
-   /* Update our own dependency flags.  This works because this
-    * function will also be called whenever fp or vp changes.
-    */
-   brw->curbe.tracked_state.dirty.mesa = (_NEW_TRANSFORM|_NEW_PROJECTION);
-   brw->curbe.tracked_state.dirty.mesa |= vp->program.Base.Parameters->StateFlags;
-   brw->curbe.tracked_state.dirty.mesa |= fp->program.Base.Parameters->StateFlags;
-
    if (sz == 0) {
-
       if (brw->curbe.last_buf) {
 	 free(brw->curbe.last_buf);
 	 brw->curbe.last_buf = NULL;
 	 brw->curbe.last_bufsz  = 0;
       }
-
       return;
    }
 
-   buf = (GLfloat *)malloc(bufsz);
-
-   memset(buf, 0, bufsz);
+   buf = (GLfloat *) _mesa_calloc(bufsz);
 
+   /* fragment shader constants */
    if (brw->curbe.wm_size) {
       GLuint offset = brw->curbe.wm_start * 16;
 
       _mesa_load_state_parameters(ctx, fp->program.Base.Parameters); 
 
+      /* copy float constants */
       for (i = 0; i < brw->wm.prog_data->nr_params; i++) 
-	 buf[offset + i] = brw->wm.prog_data->param[i][0];
+	 buf[offset + i] = *brw->wm.prog_data->param[i];
    }
 
 
@@ -233,28 +233,33 @@ static void prepare_constant_buffer(struct brw_context *brw)
        */
       assert(MAX_CLIP_PLANES == 6);
       for (j = 0; j < MAX_CLIP_PLANES; j++) {
-	 if (brw->attribs.Transform->ClipPlanesEnabled & (1<<j)) {
-	    buf[offset + i * 4 + 0] = brw->attribs.Transform->_ClipUserPlane[j][0];
-	    buf[offset + i * 4 + 1] = brw->attribs.Transform->_ClipUserPlane[j][1];
-	    buf[offset + i * 4 + 2] = brw->attribs.Transform->_ClipUserPlane[j][2];
-	    buf[offset + i * 4 + 3] = brw->attribs.Transform->_ClipUserPlane[j][3];
+	 if (ctx->Transform.ClipPlanesEnabled & (1<<j)) {
+	    buf[offset + i * 4 + 0] = ctx->Transform._ClipUserPlane[j][0];
+	    buf[offset + i * 4 + 1] = ctx->Transform._ClipUserPlane[j][1];
+	    buf[offset + i * 4 + 2] = ctx->Transform._ClipUserPlane[j][2];
+	    buf[offset + i * 4 + 3] = ctx->Transform._ClipUserPlane[j][3];
 	    i++;
 	 }
       }
    }
 
-
+   /* vertex shader constants */
    if (brw->curbe.vs_size) {
       GLuint offset = brw->curbe.vs_start * 16;
-      GLuint nr = vp->program.Base.Parameters->NumParameters;
+      GLuint nr = brw->vs.prog_data->nr_params / 4;
 
+      /* Updates the ParameterValues[i] pointers for all parameters of the
+       * basic type of PROGRAM_STATE_VAR.
+       */
       _mesa_load_state_parameters(ctx, vp->program.Base.Parameters); 
 
+      /* XXX just use a memcpy here */
       for (i = 0; i < nr; i++) {
-	 buf[offset + i * 4 + 0] = vp->program.Base.Parameters->ParameterValues[i][0];
-	 buf[offset + i * 4 + 1] = vp->program.Base.Parameters->ParameterValues[i][1];
-	 buf[offset + i * 4 + 2] = vp->program.Base.Parameters->ParameterValues[i][2];
-	 buf[offset + i * 4 + 3] = vp->program.Base.Parameters->ParameterValues[i][3];
+         const GLfloat *value = vp->program.Base.Parameters->ParameterValues[i];
+	 buf[offset + i * 4 + 0] = value[0];
+	 buf[offset + i * 4 + 1] = value[1];
+	 buf[offset + i * 4 + 2] = value[2];
+	 buf[offset + i * 4 + 3] = value[3];
       }
    }
 
@@ -273,11 +278,14 @@ static void prepare_constant_buffer(struct brw_context *brw)
        brw->curbe.last_buf &&
        bufsz == brw->curbe.last_bufsz &&
        memcmp(buf, brw->curbe.last_buf, bufsz) == 0) {
-      free(buf);
+      /* constants have not changed */
+      _mesa_free(buf);
    } 
    else {
+      /* constants have changed */
       if (brw->curbe.last_buf)
-	 free(brw->curbe.last_buf);
+	 _mesa_free(brw->curbe.last_buf);
+
       brw->curbe.last_buf = buf;
       brw->curbe.last_bufsz = bufsz;
 
@@ -324,7 +332,6 @@ static void prepare_constant_buffer(struct brw_context *brw)
     */
 }
 
-
 static void emit_constant_buffer(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
@@ -351,7 +358,7 @@ static void emit_constant_buffer(struct brw_context *brw)
  */
 const struct brw_tracked_state brw_constant_buffer = {
    .dirty = {
-      .mesa = (_NEW_TRANSFORM|_NEW_PROJECTION),      /* plus fp and vp flags */
+      .mesa = _NEW_PROGRAM_CONSTANTS,
       .brw  = (BRW_NEW_FRAGMENT_PROGRAM |
 	       BRW_NEW_VERTEX_PROGRAM |
 	       BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 39c32255f8..78d457ad2b 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -139,6 +139,7 @@
 #define BRW_CLIPMODE_CLIP_NON_REJECTED   2
 #define BRW_CLIPMODE_REJECT_ALL          3
 #define BRW_CLIPMODE_ACCEPT_ALL          4
+#define BRW_CLIPMODE_KERNEL_CLIP         5
 
 #define BRW_CLIP_NDCSPACE     0
 #define BRW_CLIP_SCREENSPACE  1
@@ -225,6 +226,24 @@
 
 #define BRW_RASTRULE_UPPER_LEFT  0    
 #define BRW_RASTRULE_UPPER_RIGHT 1
+/* These are listed as "Reserved, but not seen as useful"
+ * in Intel documentation (page 212, "Point Rasterization Rule",
+ * section 7.4 "SF Pipeline State Summary", of document
+ * "Intel® 965 Express Chipset Family and Intel® G35 Express
+ * Chipset Graphics Controller Programmer's Reference Manual,
+ * Volume 2: 3D/Media", Revision 1.0b as of January 2008,
+ * available at 
+ *     http://intellinuxgraphics.org/documentation.html
+ * at the time of this writing).
+ *
+ * These appear to be supported on at least some
+ * i965-family devices, and the BRW_RASTRULE_LOWER_RIGHT
+ * is useful when using OpenGL to render to a FBO
+ * (which has the pixel coordinate Y orientation inverted
+ * with respect to the normal OpenGL pixel coordinate system).
+ */
+#define BRW_RASTRULE_LOWER_LEFT  2
+#define BRW_RASTRULE_LOWER_RIGHT 3
 
 #define BRW_RENDERTARGET_CLAMPRANGE_UNORM    0
 #define BRW_RENDERTARGET_CLAMPRANGE_SNORM    1
@@ -349,9 +368,10 @@
 #define BRW_SURFACEFORMAT_L8A8_UNORM                     0x114 
 #define BRW_SURFACEFORMAT_I16_FLOAT                      0x115
 #define BRW_SURFACEFORMAT_L16_FLOAT                      0x116
-#define BRW_SURFACEFORMAT_A16_FLOAT                      0x117 
-#define BRW_SURFACEFORMAT_R5G5_SNORM_B6_UNORM            0x119 
-#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM                 0x11A 
+#define BRW_SURFACEFORMAT_A16_FLOAT                      0x117
+#define BRW_SURFACEFORMAT_L8A8_UNORM_SRGB                0x118
+#define BRW_SURFACEFORMAT_R5G5_SNORM_B6_UNORM            0x119
+#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM                 0x11A
 #define BRW_SURFACEFORMAT_B5G5R5X1_UNORM_SRGB            0x11B
 #define BRW_SURFACEFORMAT_R8G8_SSCALED                   0x11C
 #define BRW_SURFACEFORMAT_R8G8_USCALED                   0x11D
@@ -368,6 +388,7 @@
 #define BRW_SURFACEFORMAT_A4P4_UNORM                     0x148
 #define BRW_SURFACEFORMAT_R8_SSCALED                     0x149
 #define BRW_SURFACEFORMAT_R8_USCALED                     0x14A
+#define BRW_SURFACEFORMAT_L8_UNORM_SRGB                  0x14C
 #define BRW_SURFACEFORMAT_R1_UINT                        0x181 
 #define BRW_SURFACEFORMAT_YCRCB_NORMAL                   0x182 
 #define BRW_SURFACEFORMAT_YCRCB_SWAPUVY                  0x183 
@@ -450,8 +471,9 @@
 #define BRW_CONDITIONAL_GE    4
 #define BRW_CONDITIONAL_L     5
 #define BRW_CONDITIONAL_LE    6
-#define BRW_CONDITIONAL_C     7
+#define BRW_CONDITIONAL_R     7
 #define BRW_CONDITIONAL_O     8
+#define BRW_CONDITIONAL_U     9
 
 #define BRW_DEBUG_NONE        0
 #define BRW_DEBUG_BREAKPOINT  1
@@ -491,6 +513,7 @@
 #define BRW_OPCODE_RSL        11
 #define BRW_OPCODE_ASR        12
 #define BRW_OPCODE_CMP        16
+#define BRW_OPCODE_CMPN       17
 #define BRW_OPCODE_JMPI       32
 #define BRW_OPCODE_IF         34
 #define BRW_OPCODE_IFF        35
@@ -650,6 +673,25 @@
 #define BRW_SAMPLER_MESSAGE_SIMD8_LD                  3
 #define BRW_SAMPLER_MESSAGE_SIMD16_LD                 3
 
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG            0
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_IGDNG          0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG           0
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG       1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_BIAS_IGDNG     1
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG      1
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_IGDNG        2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_IGDNG      2
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD_IGDNG       2
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG    3
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE_IGDNG  3
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG   3
+
+/* for IGDNG only */
+#define BRW_SAMPLER_SIMD_MODE_SIMD4X2                   0
+#define BRW_SAMPLER_SIMD_MODE_SIMD8                     1
+#define BRW_SAMPLER_SIMD_MODE_SIMD16                    2
+#define BRW_SAMPLER_SIMD_MODE_SIMD32_64                 3
+
 #define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW   0
 #define BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH  1
 #define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS     2
@@ -734,7 +776,7 @@
 
 
 #define CMD_URB_FENCE                 0x6000
-#define CMD_CONST_BUFFER_STATE        0x6001
+#define CMD_CS_URB_STATE              0x6001
 #define CMD_CONST_BUFFER              0x6002
 
 #define CMD_STATE_BASE_ADDRESS        0x6101
@@ -799,8 +841,11 @@
 #include "intel_chipset.h"
 
 #define BRW_IS_G4X(brw)         (IS_G4X((brw)->intel.intelScreen->deviceID))
-#define CMD_PIPELINE_SELECT(brw)        (BRW_IS_G4X(brw) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
-#define CMD_VF_STATISTICS(brw)          (BRW_IS_G4X(brw) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
-#define URB_SIZES(brw)                  (BRW_IS_G4X(brw) ? 384 : 256)  /* 512 bit units */
+#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->intel.intelScreen->deviceID))
+#define BRW_IS_965(brw)         (!(BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)))
+#define CMD_PIPELINE_SELECT(brw)        ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
+#define CMD_VF_STATISTICS(brw)          ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
+#define URB_SIZES(brw)                  (BRW_IS_IGDNG(brw) ? 1024 : \
+                                         (BRW_IS_G4X(brw) ? 384 : 256))  /* 512 bit units */
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
new file mode 100644
index 0000000000..9fef230507
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -0,0 +1,903 @@
+/*
+ * Copyright © 2008 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <stdarg.h>
+
+#include "main/mtypes.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+
+struct {
+    char    *name;
+    int	    nsrc;
+    int	    ndst;
+} opcode[128] = {
+    [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
+
+    [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 1, .ndst = 01 },
+    [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
+};
+
+char *conditional_modifier[16] = {
+    [BRW_CONDITIONAL_NONE] = "",
+    [BRW_CONDITIONAL_Z] = ".e",
+    [BRW_CONDITIONAL_NZ] = ".ne",
+    [BRW_CONDITIONAL_G] = ".g",
+    [BRW_CONDITIONAL_GE] = ".ge",
+    [BRW_CONDITIONAL_L] = ".l",
+    [BRW_CONDITIONAL_LE] = ".le",
+    [BRW_CONDITIONAL_R] = ".r",
+    [BRW_CONDITIONAL_O] = ".o",
+    [BRW_CONDITIONAL_U] = ".u",
+};
+
+char *negate[2] = {
+    [0] = "",
+    [1] = "-",
+};
+
+char *_abs[2] = {
+    [0] = "",
+    [1] = "(abs)",
+};
+
+char *vert_stride[16] = {
+    [0] = "0",
+    [1] = "1",
+    [2] = "2",
+    [3] = "4",
+    [4] = "8",
+    [5] = "16",
+    [6] = "32",
+    [15] = "VxH",
+};
+
+char *width[8] = {
+    [0] = "1",
+    [1] = "2",
+    [2] = "4",
+    [3] = "8",
+    [4] = "16",
+};
+
+char *horiz_stride[4] = {
+    [0] = "0",
+    [1] = "1",
+    [2] = "2",
+    [3] = "4"
+};
+
+char *chan_sel[4] = {
+    [0] = "x",
+    [1] = "y",
+    [2] = "z",
+    [3] = "w",
+};
+
+char *dest_condmod[16] = {
+};
+
+char *debug_ctrl[2] = {
+    [0] = "",
+    [1] = ".breakpoint"
+};
+
+char *saturate[2] = {
+    [0] = "",
+    [1] = ".sat"
+};
+
+char *exec_size[8] = {
+    [0] = "1",
+    [1] = "2",
+    [2] = "4",
+    [3] = "8",
+    [4] = "16",
+    [5] = "32"
+};
+
+char *pred_inv[2] = {
+    [0] = "+",
+    [1] = "-"
+};
+
+char *pred_ctrl_align16[16] = {
+    [1] = "",
+    [2] = ".x",
+    [3] = ".y",
+    [4] = ".z",
+    [5] = ".w",
+    [6] = ".any4h",
+    [7] = ".all4h",
+};
+
+char *pred_ctrl_align1[16] = {
+    [1] = "",
+    [2] = ".anyv",
+    [3] = ".allv",
+    [4] = ".any2h",
+    [5] = ".all2h",
+    [6] = ".any4h",
+    [7] = ".all4h",
+    [8] = ".any8h",
+    [9] = ".all8h",
+    [10] = ".any16h",
+    [11] = ".all16h",
+};
+
+char *thread_ctrl[4] = {
+    [0] = "",
+    [2] = "switch"
+};
+
+char *compr_ctrl[4] = {
+    [0] = "",
+    [1] = "sechalf",
+    [2] = "compr",
+};
+
+char *dep_ctrl[4] = {
+    [0] = "",
+    [1] = "NoDDClr",
+    [2] = "NoDDChk",
+    [3] = "NoDDClr,NoDDChk",
+};
+
+char *mask_ctrl[4] = {
+    [0] = "",
+    [1] = "nomask",
+};
+
+char *access_mode[2] = {
+    [0] = "align1",
+    [1] = "align16",
+};
+
+char *reg_encoding[8] = {
+    [0] = "UD",
+    [1] = "D",
+    [2] = "UW",
+    [3] = "W",
+    [4] = "UB",
+    [5] = "B",
+    [7] = "F"
+};
+
+char *imm_encoding[8] = {	/* type suffixes for immediate operands */
+    [0] = "UD",
+    [1] = "D",
+    [2] = "UW",
+    [3] = "W",
+    [5] = "VF",
+    [6] = "V",	/* distinct slot: imm() switches on VF and V separately */
+    [7] = "F"
+};
+
+char *reg_file[4] = {
+    [0] = "A",
+    [1] = "g",
+    [2] = "m",
+    [3] = "imm",
+};
+
+char *writemask[16] = {
+    [0x0] = ".",
+    [0x1] = ".x",
+    [0x2] = ".y",
+    [0x3] = ".xy",
+    [0x4] = ".z",
+    [0x5] = ".xz",
+    [0x6] = ".yz",
+    [0x7] = ".xyz",
+    [0x8] = ".w",
+    [0x9] = ".xw",
+    [0xa] = ".yw",
+    [0xb] = ".xyw",
+    [0xc] = ".zw",
+    [0xd] = ".xzw",
+    [0xe] = ".yzw",
+    [0xf] = "",
+};
+
+char *end_of_thread[2] = {
+    [0] = "",
+    [1] = "EOT"
+};
+
+char *target_function[16] = {
+    [BRW_MESSAGE_TARGET_NULL] = "null",
+    [BRW_MESSAGE_TARGET_MATH] = "math",
+    [BRW_MESSAGE_TARGET_SAMPLER] = "sampler",
+    [BRW_MESSAGE_TARGET_GATEWAY] = "gateway",
+    [BRW_MESSAGE_TARGET_DATAPORT_READ] = "read",
+    [BRW_MESSAGE_TARGET_DATAPORT_WRITE] = "write",
+    [BRW_MESSAGE_TARGET_URB] = "urb",
+    [BRW_MESSAGE_TARGET_THREAD_SPAWNER] = "thread_spawner"
+};
+
+char *math_function[16] = {	/* names printed for the math message function field */
+    [BRW_MATH_FUNCTION_INV] = "inv",
+    [BRW_MATH_FUNCTION_LOG] = "log",
+    [BRW_MATH_FUNCTION_EXP] = "exp",
+    [BRW_MATH_FUNCTION_SQRT] = "sqrt",
+    [BRW_MATH_FUNCTION_RSQ] = "rsq",
+    [BRW_MATH_FUNCTION_SIN] = "sin",
+    [BRW_MATH_FUNCTION_COS] = "cos",
+    [BRW_MATH_FUNCTION_SINCOS] = "sincos",
+    [BRW_MATH_FUNCTION_TAN] = "tan",
+    [BRW_MATH_FUNCTION_POW] = "pow",
+    [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod",
+    [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intdiv",	/* quotient only */
+    [BRW_MATH_FUNCTION_INT_DIV_REMAINDER] = "intmod",	/* remainder only */
+};
+
+char *math_saturate[2] = {
+    [0] = "",
+    [1] = "sat"
+};
+
+char *math_signed[2] = {
+    [0] = "",
+    [1] = "signed"
+};
+
+char *math_scalar[2] = {
+    [0] = "",
+    [1] = "scalar"
+};
+
+char *math_precision[2] = {
+    [0] = "",
+    [1] = "partial_precision"
+};
+
+char *urb_swizzle[4] = {
+    [BRW_URB_SWIZZLE_NONE] = "",
+    [BRW_URB_SWIZZLE_INTERLEAVE] = "interleave",
+    [BRW_URB_SWIZZLE_TRANSPOSE] = "transpose",
+};
+
+char *urb_allocate[2] = {
+    [0] = "",
+    [1] = "allocate"
+};
+
+char *urb_used[2] = {
+    [0] = "",
+    [1] = "used"
+};
+
+char *urb_complete[2] = {
+    [0] = "",
+    [1] = "complete"
+};
+
+char *sampler_target_format[4] = {
+    [0] = "F",
+    [2] = "UD",
+    [3] = "D"
+};
+
+
+static int column;
+
+static int string (FILE *file, char *string)
+{
+    fputs (string, file);
+    column += strlen (string);
+    return 0;
+}
+
+static int format (FILE *f, char *format, ...)	/* printf-style output routed through string() */
+{
+    char    buf[1024];	/* longer output is truncated by vsnprintf */
+    va_list	args;
+    va_start (args, format);
+    vsnprintf (buf, sizeof (buf) - 1, format, args);
+    va_end (args);	/* every va_start must be paired with va_end */
+    string (f, buf);	/* string() also advances 'column' */
+    return 0;
+}
+
+static int newline (FILE *f)
+{
+    putc ('\n', f);
+    column = 0;
+    return 0;
+}
+
+static int pad (FILE *f, int c)
+{
+    do
+	string (f, " ");
+    while (column < c);
+    return 0;
+}
+
+static int control (FILE *file, char *name, char *ctrl[], GLuint id, int *space)	/* print ctrl[id]; returns 1 if that slot is NULL */
+{
+    if (!ctrl[id]) {
+	format (file, "*** invalid %s value %u ",	/* format() keeps 'column' in sync, as print_opcode does */
+		 name, id);
+	return 1;
+    }
+    if (ctrl[id][0])	/* empty names print nothing and leave *space untouched */
+    {
+	if (space && *space)
+	    string (file, " ");
+	string (file, ctrl[id]);
+	if (space)
+	    *space = 1;	/* the next item printed needs a separator */
+    }
+    return 0;
+}
+
+static int print_opcode (FILE *file, int id)
+{
+    if (!opcode[id].name) {
+	format (file, "*** invalid opcode value %d ", id);
+	return 1;
+    }
+    string (file, opcode[id].name);
+    return 0;
+}
+
+static int reg (FILE *file, GLuint _reg_file, GLuint _reg_nr)
+{
+    int	err = 0;
+    if (_reg_file == BRW_ARCHITECTURE_REGISTER_FILE) {
+	switch (_reg_nr & 0xf0) {
+	case BRW_ARF_NULL:
+	    string (file, "null");
+	    return -1;
+	case BRW_ARF_ADDRESS:
+	    format (file, "a%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_ACCUMULATOR:
+	    format (file, "acc%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_MASK:
+	    format (file, "mask%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_MASK_STACK:
+	    format (file, "msd%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_STATE:
+	    format (file, "sr%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_CONTROL:
+	    format (file, "cr%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_NOTIFICATION_COUNT:
+	    format (file, "n%d", _reg_nr & 0x0f);
+	    break;
+	case BRW_ARF_IP:
+	    string (file, "ip");
+	    return -1;
+	    break;
+	default:
+	    format (file, "ARF%d", _reg_nr);
+	    break;
+	}
+    } else {
+	err  |= control (file, "src reg file", reg_file, _reg_file, NULL);
+	format (file, "%d", _reg_nr);
+    }
+    return err;
+}
+
+static int dest (FILE *file, struct brw_instruction *inst)	/* print the destination operand; nonzero on a decode error */
+{
+    int	err = 0;
+
+    if (inst->header.access_mode == BRW_ALIGN_1)
+    {
+	if (inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    err |= reg (file, inst->bits1.da1.dest_reg_file, inst->bits1.da1.dest_reg_nr);
+	    if (err == -1)	/* reg() returns -1 for "null"/"ip": nothing more to print */
+		return 0;
+	    if (inst->bits1.da1.dest_subreg_nr)
+		format (file, ".%d", inst->bits1.da1.dest_subreg_nr);
+	    format (file, "<%d>", inst->bits1.da1.dest_horiz_stride);
+	    err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da1.dest_reg_type, NULL);
+	}
+	else
+	{
+	    string (file, "g[a0");
+	    if (inst->bits1.ia1.dest_subreg_nr)
+		format (file, ".%d", inst->bits1.ia1.dest_subreg_nr);
+	    if (inst->bits1.ia1.dest_indirect_offset)
+		format (file, " %d", inst->bits1.ia1.dest_indirect_offset);
+	    string (file, "]");
+	    format (file, "<%d>", inst->bits1.ia1.dest_horiz_stride);
+	    err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.ia1.dest_reg_type, NULL);
+	}
+    }
+    else
+    {
+	if (inst->bits1.da16.dest_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    err |= reg (file, inst->bits1.da16.dest_reg_file, inst->bits1.da16.dest_reg_nr);
+	    if (err == -1)	/* same early exit as the align1 direct case */
+		return 0;
+	    if (inst->bits1.da16.dest_subreg_nr)
+		format (file, ".%d", inst->bits1.da16.dest_subreg_nr);
+	    string (file, "<1>");
+	    err |= control (file, "writemask", writemask, inst->bits1.da16.dest_writemask, NULL);
+	    err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da16.dest_reg_type, NULL);
+	}
+	else
+	{
+	    err = 1;
+	    string (file, "Indirect align16 address mode not supported");
+	}
+    }
+
+    return err;	/* propagate accumulated errors instead of always reporting success */
+}
+
+static int src_align1_region (FILE *file,
+			      GLuint _vert_stride, GLuint _width, GLuint _horiz_stride)
+{
+    int err = 0;
+    string (file, "<");
+    err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
+    string (file, ",");
+    err |= control (file, "width", width, _width, NULL);
+    string (file, ",");
+    err |= control (file, "horiz_stride", horiz_stride, _horiz_stride, NULL);
+    string (file, ">");
+    return err;
+}
+
+static int src_da1 (FILE *file, GLuint type, GLuint _reg_file,
+		    GLuint _vert_stride, GLuint _width, GLuint _horiz_stride,
+		    GLuint reg_num, GLuint sub_reg_num, GLuint __abs, GLuint _negate)
+{
+    int err = 0;
+    err |= control (file, "negate", negate, _negate, NULL);
+    err |= control (file, "abs", _abs, __abs, NULL);
+
+    err |= reg (file, _reg_file, reg_num);
+    if (err == -1)
+	return 0;
+    if (sub_reg_num)
+	format (file, ".%d", sub_reg_num);
+    src_align1_region (file, _vert_stride, _width, _horiz_stride);
+    err |= control (file, "src reg encoding", reg_encoding, type, NULL);
+    return err;
+}
+
+static int src_ia1 (FILE *file,
+		    GLuint type,
+		    GLuint _reg_file,
+		    GLint _addr_imm,
+		    GLuint _addr_subreg_nr,
+		    GLuint _negate,
+		    GLuint __abs,
+		    GLuint _addr_mode,
+		    GLuint _horiz_stride,
+		    GLuint _width,
+		    GLuint _vert_stride)
+{
+    int err = 0;
+    err |= control (file, "negate", negate, _negate, NULL);
+    err |= control (file, "abs", _abs, __abs, NULL);
+
+    string (file, "g[a0");
+    if (_addr_subreg_nr)
+	format (file, ".%d", _addr_subreg_nr);
+    if (_addr_imm)
+	format (file, " %d", _addr_imm);
+    string (file, "]");
+    src_align1_region (file, _vert_stride, _width, _horiz_stride);
+    err |= control (file, "src reg encoding", reg_encoding, type, NULL);
+    return err;
+}
+
+static int src_da16 (FILE *file,
+		     GLuint _reg_type,
+		     GLuint _reg_file,
+		     GLuint _vert_stride,
+		     GLuint _reg_nr,
+		     GLuint _subreg_nr,
+		     GLuint __abs,
+		     GLuint _negate,
+		     GLuint swz_x,
+		     GLuint swz_y,
+		     GLuint swz_z,
+		     GLuint swz_w)
+{
+    int err = 0;
+    err |= control (file, "negate", negate, _negate, NULL);
+    err |= control (file, "abs", _abs, __abs, NULL);
+
+    err |= reg (file, _reg_file, _reg_nr);
+    if (err == -1)
+	return 0;
+    if (_subreg_nr)
+	format (file, ".%d", _subreg_nr);
+    string (file, "<");
+    err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
+    string (file, ",1,1>");
+    err |= control (file, "src da16 reg type", reg_encoding, _reg_type, NULL);
+    /*
+     * Three kinds of swizzle display:
+     *  identity - nothing printed
+     *  1->all	 - print the single channel
+     *  1->1     - print the mapping
+     */
+    if (swz_x == BRW_CHANNEL_X &&
+	swz_y == BRW_CHANNEL_Y &&
+	swz_z == BRW_CHANNEL_Z &&
+	swz_w == BRW_CHANNEL_W)
+    {
+	;
+    }
+    else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
+    {
+	string (file, ".");
+	err |= control (file, "channel select", chan_sel, swz_x, NULL);
+    }
+    else
+    {
+	string (file, ".");
+	err |= control (file, "channel select", chan_sel, swz_x, NULL);
+	err |= control (file, "channel select", chan_sel, swz_y, NULL);
+	err |= control (file, "channel select", chan_sel, swz_z, NULL);
+	err |= control (file, "channel select", chan_sel, swz_w, NULL);
+    }
+    return err;
+}
+
+
+static int imm (FILE *file, GLuint type, struct brw_instruction *inst) {	/* print the immediate in bits3 according to its register type */
+    switch (type) {
+    case BRW_REGISTER_TYPE_UD:
+	format (file, "0x%08xUD", inst->bits3.ud);
+	break;
+    case BRW_REGISTER_TYPE_D:
+	format (file, "%dD", inst->bits3.d);
+	break;
+    case BRW_REGISTER_TYPE_UW:
+	format (file, "0x%04xUW", (uint16_t) inst->bits3.ud);
+	break;
+    case BRW_REGISTER_TYPE_W:
+	format (file, "%dW", (int16_t) inst->bits3.d);
+	break;
+    case BRW_REGISTER_TYPE_UB:
+	format (file, "0x%02xUB", (uint8_t) inst->bits3.ud);	/* unsigned cast: a signed one sign-extends values >= 0x80 */
+	break;
+    case BRW_REGISTER_TYPE_VF:
+	format (file, "Vector Float");
+	break;
+    case BRW_REGISTER_TYPE_V:
+	format (file, "0x%08xV", inst->bits3.ud);
+	break;
+    case BRW_REGISTER_TYPE_F:
+	format (file, "%-gF", inst->bits3.f);
+    }
+    return 0;	/* unrecognised types print nothing and are not reported as errors */
+}
+
+static int src0 (FILE *file, struct brw_instruction *inst)
+{
+    if (inst->bits1.da1.src0_reg_file == BRW_IMMEDIATE_VALUE)
+	return imm (file, inst->bits1.da1.src0_reg_type,
+		    inst);
+    else if (inst->header.access_mode == BRW_ALIGN_1)
+    {
+	if (inst->bits2.da1.src0_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    return src_da1 (file,
+			    inst->bits1.da1.src0_reg_type,
+			    inst->bits1.da1.src0_reg_file,
+			    inst->bits2.da1.src0_vert_stride,
+			    inst->bits2.da1.src0_width,
+			    inst->bits2.da1.src0_horiz_stride,
+			    inst->bits2.da1.src0_reg_nr,
+			    inst->bits2.da1.src0_subreg_nr,
+			    inst->bits2.da1.src0_abs,
+			    inst->bits2.da1.src0_negate);
+	}
+	else
+	{
+	    return src_ia1 (file,
+			    inst->bits1.ia1.src0_reg_type,
+			    inst->bits1.ia1.src0_reg_file,
+			    inst->bits2.ia1.src0_indirect_offset,
+			    inst->bits2.ia1.src0_subreg_nr,
+			    inst->bits2.ia1.src0_negate,
+			    inst->bits2.ia1.src0_abs,
+			    inst->bits2.ia1.src0_address_mode,
+			    inst->bits2.ia1.src0_horiz_stride,
+			    inst->bits2.ia1.src0_width,
+			    inst->bits2.ia1.src0_vert_stride);
+	}
+    }
+    else
+    {
+	if (inst->bits2.da16.src0_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    return src_da16 (file,
+			     inst->bits1.da16.src0_reg_type,
+			     inst->bits1.da16.src0_reg_file,
+			     inst->bits2.da16.src0_vert_stride,
+			     inst->bits2.da16.src0_reg_nr,
+			     inst->bits2.da16.src0_subreg_nr,
+			     inst->bits2.da16.src0_abs,
+			     inst->bits2.da16.src0_negate,
+			     inst->bits2.da16.src0_swz_x,
+			     inst->bits2.da16.src0_swz_y,
+			     inst->bits2.da16.src0_swz_z,
+			     inst->bits2.da16.src0_swz_w);
+	}
+	else
+	{
+	    string (file, "Indirect align16 address mode not supported");
+	    return 1;
+	}
+    }
+}
+
+static int src1 (FILE *file, struct brw_instruction *inst)
+{
+    if (inst->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE)
+	return imm (file, inst->bits1.da1.src1_reg_type,
+		    inst);
+    else if (inst->header.access_mode == BRW_ALIGN_1)
+    {
+	if (inst->bits3.da1.src1_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    return src_da1 (file,
+			    inst->bits1.da1.src1_reg_type,
+			    inst->bits1.da1.src1_reg_file,
+			    inst->bits3.da1.src1_vert_stride,
+			    inst->bits3.da1.src1_width,
+			    inst->bits3.da1.src1_horiz_stride,
+			    inst->bits3.da1.src1_reg_nr,
+			    inst->bits3.da1.src1_subreg_nr,
+			    inst->bits3.da1.src1_abs,
+			    inst->bits3.da1.src1_negate);
+	}
+	else
+	{
+	    return src_ia1 (file,
+			    inst->bits1.ia1.src1_reg_type,
+			    inst->bits1.ia1.src1_reg_file,
+			    inst->bits3.ia1.src1_indirect_offset,
+			    inst->bits3.ia1.src1_subreg_nr,
+			    inst->bits3.ia1.src1_negate,
+			    inst->bits3.ia1.src1_abs,
+			    inst->bits3.ia1.src1_address_mode,
+			    inst->bits3.ia1.src1_horiz_stride,
+			    inst->bits3.ia1.src1_width,
+			    inst->bits3.ia1.src1_vert_stride);
+	}
+    }
+    else
+    {
+	if (inst->bits3.da16.src1_address_mode == BRW_ADDRESS_DIRECT)
+	{
+	    return src_da16 (file,
+			     inst->bits1.da16.src1_reg_type,
+			     inst->bits1.da16.src1_reg_file,
+			     inst->bits3.da16.src1_vert_stride,
+			     inst->bits3.da16.src1_reg_nr,
+			     inst->bits3.da16.src1_subreg_nr,
+			     inst->bits3.da16.src1_abs,
+			     inst->bits3.da16.src1_negate,
+			     inst->bits3.da16.src1_swz_x,
+			     inst->bits3.da16.src1_swz_y,
+			     inst->bits3.da16.src1_swz_z,
+			     inst->bits3.da16.src1_swz_w);
+	}
+	else
+	{
+	    string (file, "Indirect align16 address mode not supported");
+	    return 1;
+	}
+    }
+}
+
+int brw_disasm (FILE *file, struct brw_instruction *inst)
+{
+    int	err = 0;
+    int space = 0;
+
+    if (inst->header.predicate_control) {
+	string (file, "(");
+	err |= control (file, "predicate inverse", pred_inv, inst->header.predicate_inverse, NULL);
+	string (file, "f0");
+	if (inst->bits2.da1.flag_reg_nr)
+	    format (file, ".%d", inst->bits2.da1.flag_reg_nr);
+	if (inst->header.access_mode == BRW_ALIGN_1)
+	    err |= control (file, "predicate control align1", pred_ctrl_align1,
+			    inst->header.predicate_control, NULL);
+	else
+	    err |= control (file, "predicate control align16", pred_ctrl_align16,
+			    inst->header.predicate_control, NULL);
+	string (file, ") ");
+    }
+
+    err |= print_opcode (file, inst->header.opcode);
+    err |= control (file, "saturate", saturate, inst->header.saturate, NULL);
+    err |= control (file, "debug control", debug_ctrl, inst->header.debug_control, NULL);
+
+    if (inst->header.opcode != BRW_OPCODE_SEND)
+	err |= control (file, "conditional modifier", conditional_modifier,
+			inst->header.destreg__conditionalmod, NULL);
+
+    if (inst->header.opcode != BRW_OPCODE_NOP) {
+	string (file, "(");
+	err |= control (file, "execution size", exec_size, inst->header.execution_size, NULL);
+	string (file, ")");
+    }
+
+    if (inst->header.opcode == BRW_OPCODE_SEND)
+	format (file, " %d", inst->header.destreg__conditionalmod);
+
+    if (opcode[inst->header.opcode].ndst > 0) {
+	pad (file, 16);
+	err |= dest (file, inst);
+    }
+    if (opcode[inst->header.opcode].nsrc > 0) {
+	pad (file, 32);
+	err |= src0 (file, inst);
+    }
+    if (opcode[inst->header.opcode].nsrc > 1) {
+	pad (file, 48);
+	err |= src1 (file, inst);
+    }
+
+    if (inst->header.opcode == BRW_OPCODE_SEND) {
+	newline (file);
+	pad (file, 16);
+	space = 0;
+	err |= control (file, "target function", target_function,
+			inst->bits3.generic.msg_target, &space);
+	switch (inst->bits3.generic.msg_target) {
+	case BRW_MESSAGE_TARGET_MATH:
+	    err |= control (file, "math function", math_function,
+			    inst->bits3.math.function, &space);
+	    err |= control (file, "math saturate", math_saturate,
+			    inst->bits3.math.saturate, &space);
+	    err |= control (file, "math signed", math_signed,
+			    inst->bits3.math.int_type, &space);
+	    err |= control (file, "math scalar", math_scalar,
+			    inst->bits3.math.data_type, &space);
+	    err |= control (file, "math precision", math_precision,
+			    inst->bits3.math.precision, &space);
+	    break;
+	case BRW_MESSAGE_TARGET_SAMPLER:
+	    format (file, " (%d, %d, ",
+		    inst->bits3.sampler.binding_table_index,
+		    inst->bits3.sampler.sampler);
+	    err |= control (file, "sampler target format", sampler_target_format,
+			    inst->bits3.sampler.return_format, NULL);
+	    string (file, ")");
+	    break;
+	case BRW_MESSAGE_TARGET_DATAPORT_WRITE:
+	    format (file, " (%d, %d, %d, %d)",
+		    inst->bits3.dp_write.binding_table_index,
+		    (inst->bits3.dp_write.pixel_scoreboard_clear << 3) |
+		    inst->bits3.dp_write.msg_control,
+		    inst->bits3.dp_write.msg_type,
+		    inst->bits3.dp_write.send_commit_msg);
+	    break;
+	case BRW_MESSAGE_TARGET_URB:
+	    format (file, " %d", inst->bits3.urb.offset);
+	    space = 1;
+	    err |= control (file, "urb swizzle", urb_swizzle,
+			    inst->bits3.urb.swizzle_control, &space);
+	    err |= control (file, "urb allocate", urb_allocate,
+			    inst->bits3.urb.allocate, &space);
+	    err |= control (file, "urb used", urb_used,
+			    inst->bits3.urb.used, &space);
+	    err |= control (file, "urb complete", urb_complete,
+			    inst->bits3.urb.complete, &space);
+	    break;
+	case BRW_MESSAGE_TARGET_THREAD_SPAWNER:
+	    break;
+	default:
+	    format (file, "unsupported target %d", inst->bits3.generic.msg_target);
+	    break;
+	}
+	if (space)
+	    string (file, " ");
+	format (file, "mlen %d",
+		inst->bits3.generic.msg_length);
+	format (file, " rlen %d",
+		inst->bits3.generic.response_length);
+    }
+    pad (file, 64);
+    if (inst->header.opcode != BRW_OPCODE_NOP) {
+	string (file, "{");
+	space = 1;
+	err |= control(file, "access mode", access_mode, inst->header.access_mode, &space);
+	err |= control (file, "mask control", mask_ctrl, inst->header.mask_control, &space);
+	err |= control (file, "dependency control", dep_ctrl, inst->header.dependency_control, &space);
+	err |= control (file, "compression control", compr_ctrl, inst->header.compression_control, &space);
+	err |= control (file, "thread control", thread_ctrl, inst->header.thread_control, &space);
+	if (inst->header.opcode == BRW_OPCODE_SEND)
+	    err |= control (file, "end of thread", end_of_thread,
+			    inst->bits3.generic.end_of_thread, &space);
+	if (space)
+	    string (file, " ");
+	string (file, "}");
+    }
+    string (file, ";");
+    newline (file);
+    return err;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 785fb784ca..44bb7bd588 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -25,13 +25,15 @@
  * 
  **************************************************************************/
 
-#include <stdlib.h>
 
 #include "main/glheader.h"
 #include "main/context.h"
 #include "main/state.h"
-#include "main/api_validate.h"
 #include "main/enums.h"
+#include "tnl/tnl.h"
+#include "vbo/vbo_context.h"
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
 
 #include "brw_draw.h"
 #include "brw_defines.h"
@@ -42,11 +44,6 @@
 #include "intel_batchbuffer.h"
 #include "intel_buffer_objects.h"
 
-#include "tnl/tnl.h"
-#include "vbo/vbo_context.h"
-#include "swrast/swrast.h"
-#include "swrast_setup/swrast_setup.h"
-
 #define FILE_DEBUG_FLAG DEBUG_BATCH
 
 static GLuint prim_to_hw_prim[GL_POLYGON+1] = {
@@ -84,15 +81,17 @@ static const GLenum reduced_prim[GL_POLYGON+1] = {
  */
 static GLuint brw_set_prim(struct brw_context *brw, GLenum prim)
 {
+   GLcontext *ctx = &brw->intel.ctx;
+
    if (INTEL_DEBUG & DEBUG_PRIMS)
       _mesa_printf("PRIM: %s\n", _mesa_lookup_enum_by_nr(prim));
    
    /* Slight optimization to avoid the GS program when not needed:
     */
    if (prim == GL_QUAD_STRIP &&
-       brw->attribs.Light->ShadeModel != GL_FLAT &&
-       brw->attribs.Polygon->FrontMode == GL_FILL &&
-       brw->attribs.Polygon->BackMode == GL_FILL)
+       ctx->Light.ShadeModel != GL_FLAT &&
+       ctx->Polygon.FrontMode == GL_FILL &&
+       ctx->Polygon.BackMode == GL_FILL)
       prim = GL_TRIANGLE_STRIP;
 
    if (prim != brw->primitive) {
@@ -125,6 +124,7 @@ static void brw_emit_prim(struct brw_context *brw,
 			  uint32_t hw_prim)
 {
    struct brw_3d_primitive prim_packet;
+   struct intel_context *intel = &brw->intel;
 
    if (INTEL_DEBUG & DEBUG_PRIMS)
       _mesa_printf("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode), 
@@ -138,16 +138,35 @@ static void brw_emit_prim(struct brw_context *brw,
 
    prim_packet.verts_per_instance = trim(prim->mode, prim->count);
    prim_packet.start_vert_location = prim->start;
+   if (prim->indexed)
+      prim_packet.start_vert_location += brw->ib.start_vertex_offset;
    prim_packet.instance_count = 1;
    prim_packet.start_instance_location = 0;
-   prim_packet.base_vert_location = 0;
+   prim_packet.base_vert_location = prim->basevertex;
 
    /* Can't wrap here, since we rely on the validated state. */
    brw->no_batch_wrap = GL_TRUE;
+
+   /* If we're set to always flush, do it before and after the primitive emit.
+    * We want to catch both missed flushes that hurt instruction/state cache
+    * and missed flushes of the render cache as it heads to other parts of
+    * the driver besides the draw code.
+    */
+   if (intel->always_flush_cache) {
+      BEGIN_BATCH(1, IGNORE_CLIPRECTS);
+      OUT_BATCH(intel->vtbl.flush_cmd());
+      ADVANCE_BATCH();
+   }
    if (prim_packet.verts_per_instance) {
       intel_batchbuffer_data( brw->intel.batch, &prim_packet,
 			      sizeof(prim_packet), LOOP_CLIPRECTS);
    }
+   if (intel->always_flush_cache) {
+      BEGIN_BATCH(1, IGNORE_CLIPRECTS);
+      OUT_BATCH(intel->vtbl.flush_cmd());
+      ADVANCE_BATCH();
+   }
+
    brw->no_batch_wrap = GL_FALSE;
 }
 
@@ -165,24 +184,16 @@ static void brw_merge_inputs( struct brw_context *brw,
 
    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
       brw->vb.inputs[i].glarray = arrays[i];
+      brw->vb.inputs[i].attrib = (gl_vert_attrib) i;
 
-      /* XXX: metaops passes null arrays */
-      if (arrays[i]) {
-	 if (arrays[i]->StrideB != 0)
-	    brw->vb.info.varying |= 1 << i;
-
+      if (arrays[i]->StrideB != 0)
 	 brw->vb.info.sizes[i/16] |= (brw->vb.inputs[i].glarray->Size - 1) <<
 	    ((i%16) * 2);
-      }
    }
 
-   /* Raise statechanges if input sizes and varying have changed: 
-    */
+   /* Raise statechanges if input sizes have changed. */
    if (memcmp(brw->vb.info.sizes, old.sizes, sizeof(old.sizes)) != 0)
       brw->state.dirty.brw |= BRW_NEW_INPUT_DIMENSIONS;
-
-   if (brw->vb.info.varying != old.varying)
-      brw->state.dirty.brw |= BRW_NEW_INPUT_VARYING;
 }
 
 /* XXX: could split the primitive list to fallback only on the
@@ -192,12 +203,20 @@ static GLboolean check_fallbacks( struct brw_context *brw,
 				  const struct _mesa_prim *prim,
 				  GLuint nr_prims )
 {
+   GLcontext *ctx = &brw->intel.ctx;
    GLuint i;
 
-   if (!brw->intel.strict_conformance)
+   /* If we don't require strict OpenGL conformance, never 
+    * use fallbacks.  If we're forcing fallbacks, always
+    * use fallbacks.
+    */
+   if (brw->intel.conformance_mode == 0)
       return GL_FALSE;
 
-   if (brw->attribs.Polygon->SmoothFlag) {
+   if (brw->intel.conformance_mode == 2)
+      return GL_TRUE;
+
+   if (ctx->Polygon.SmoothFlag) {
       for (i = 0; i < nr_prims; i++)
 	 if (reduced_prim[prim[i].mode] == GL_TRIANGLES) 
 	    return GL_TRUE;
@@ -206,7 +225,7 @@ static GLboolean check_fallbacks( struct brw_context *brw,
    /* BRW hardware will do AA lines, but they are non-conformant it
     * seems.  TBD whether we keep this fallback:
     */
-   if (brw->attribs.Line->SmoothFlag) {
+   if (ctx->Line.SmoothFlag) {
       for (i = 0; i < nr_prims; i++)
 	 if (reduced_prim[prim[i].mode] == GL_LINES) 
 	    return GL_TRUE;
@@ -215,28 +234,61 @@ static GLboolean check_fallbacks( struct brw_context *brw,
    /* Stipple -- these fallbacks could be resolved with a little
     * bit of work?
     */
-   if (brw->attribs.Line->StippleFlag) {
+   if (ctx->Line.StippleFlag) {
       for (i = 0; i < nr_prims; i++) {
 	 /* GS doesn't get enough information to know when to reset
 	  * the stipple counter?!?
 	  */
-	 if (prim[i].mode == GL_LINE_LOOP) 
+	 if (prim[i].mode == GL_LINE_LOOP || prim[i].mode == GL_LINE_STRIP) 
 	    return GL_TRUE;
 	    
 	 if (prim[i].mode == GL_POLYGON &&
-	     (brw->attribs.Polygon->FrontMode == GL_LINE ||
-	      brw->attribs.Polygon->BackMode == GL_LINE))
+	     (ctx->Polygon.FrontMode == GL_LINE ||
+	      ctx->Polygon.BackMode == GL_LINE))
 	    return GL_TRUE;
       }
    }
 
-
-   if (brw->attribs.Point->SmoothFlag) {
+   if (ctx->Point.SmoothFlag) {
       for (i = 0; i < nr_prims; i++)
 	 if (prim[i].mode == GL_POINTS) 
 	    return GL_TRUE;
    }
+
+   /* BRW hardware doesn't handle GL_CLAMP texturing correctly;
+    * brw_wm_sampler_state:translate_wrap_mode() treats GL_CLAMP
+    * as GL_CLAMP_TO_EDGE instead.  If we're using GL_CLAMP, and
+    * we want strict conformance, force the fallback.
+    * Right now, we only do this for 1D, 2D and 3D textures.
+    */
+   {
+      int u;
+      for (u = 0; u < ctx->Const.MaxTextureCoordUnits; u++) {
+         struct gl_texture_unit *texUnit = &ctx->Texture.Unit[u];
+         if (texUnit->Enabled) {
+            if (texUnit->Enabled & TEXTURE_1D_BIT) {
+               if (texUnit->CurrentTex[TEXTURE_1D_INDEX]->WrapS == GL_CLAMP) {
+                   return GL_TRUE;
+               }
+            }
+            if (texUnit->Enabled & TEXTURE_2D_BIT) {
+               if (texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapS == GL_CLAMP ||
+                   texUnit->CurrentTex[TEXTURE_2D_INDEX]->WrapT == GL_CLAMP) {
+                   return GL_TRUE;
+               }
+            }
+            if (texUnit->Enabled & TEXTURE_3D_BIT) {
+               if (texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapS == GL_CLAMP ||
+                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapT == GL_CLAMP ||
+                   texUnit->CurrentTex[TEXTURE_3D_INDEX]->WrapR == GL_CLAMP) {
+                   return GL_TRUE;
+               }
+            }
+         }
+      }
+   }
       
+   /* Nothing stopping us from the fast path now */
    return GL_FALSE;
 }
 
@@ -261,11 +313,18 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
    if (ctx->NewState)
       _mesa_update_state( ctx );
 
+   /* We have to validate the textures *before* checking for fallbacks;
+    * otherwise, the software fallback won't be able to rely on the
+    * texture state, the firstLevel and lastLevel fields won't be
+    * set in the intel texture object (they'll both be 0), and the 
+    * software fallback will segfault if it attempts to access any
+    * texture level other than level 0.
+    */
+   brw_validate_textures( brw );
+
    if (check_fallbacks(brw, prim, nr_prims))
       return GL_FALSE;
 
-   brw_validate_textures( brw );
-
    /* Bind all inputs, derive varying and size information:
     */
    brw_merge_inputs( brw, arrays );
@@ -346,9 +405,13 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
       retval = GL_TRUE;
    }
 
+   if (intel->always_flush_batch)
+      intel_batchbuffer_flush(intel->batch);
  out:
    UNLOCK_HARDWARE(intel);
 
+   brw_state_cache_check_size(brw);
+
    if (warn)
       fprintf(stderr, "i965: Single primitive emit potentially exceeded "
 	      "available aperture space\n");
@@ -359,54 +422,31 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
    return retval;
 }
 
-static GLboolean brw_need_rebase( GLcontext *ctx,
-				  const struct gl_client_array *arrays[],
-				  const struct _mesa_index_buffer *ib,
-				  GLuint min_index )
-{
-   if (min_index == 0) 
-      return GL_FALSE;
-
-   if (ib) {
-      if (!vbo_all_varyings_in_vbos(arrays))
-	 return GL_TRUE;
-      else
-	 return GL_FALSE;
-   }
-   else {
-      /* Hmm.  This isn't quite what I wanted.  BRW can actually
-       * handle the mixed case well enough that we shouldn't need to
-       * rebase.  However, it's probably not very common, nor hugely
-       * expensive to do it this way:
-       */
-      if (!vbo_all_varyings_in_vbos(arrays))
-	 return GL_TRUE;
-      else
-	 return GL_FALSE;
-   }
-}
-				  
-
 void brw_draw_prims( GLcontext *ctx,
 		     const struct gl_client_array *arrays[],
 		     const struct _mesa_prim *prim,
 		     GLuint nr_prims,
 		     const struct _mesa_index_buffer *ib,
+		     GLboolean index_bounds_valid,
 		     GLuint min_index,
 		     GLuint max_index )
 {
    GLboolean retval;
 
-   /* Decide if we want to rebase.  If so we end up recursing once
-    * only into this function.
-    */
-   if (brw_need_rebase( ctx, arrays, ib, min_index )) {
-      vbo_rebase_prims( ctx, arrays, 
-			prim, nr_prims, 
-			ib, min_index, max_index, 
-			brw_draw_prims );
-      
-      return;
+   if (!vbo_all_varyings_in_vbos(arrays)) {
+      if (!index_bounds_valid)
+	 vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
+
+      /* Decide if we want to rebase.  If so we end up recursing once
+       * only into this function.
+       */
+      if (min_index != 0) {
+	 vbo_rebase_prims(ctx, arrays,
+			  prim, nr_prims,
+			  ib, min_index, max_index,
+			  brw_draw_prims );
+	 return;
+      }
    }
 
    /* Make a first attempt at drawing:
diff --git a/src/mesa/drivers/dri/i965/brw_draw.h b/src/mesa/drivers/dri/i965/brw_draw.h
index 9aebbdb1b8..2a14db217f 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.h
+++ b/src/mesa/drivers/dri/i965/brw_draw.h
@@ -39,6 +39,7 @@ void brw_draw_prims( GLcontext *ctx,
 		     const struct _mesa_prim *prims,
 		     GLuint nr_prims,
 		     const struct _mesa_index_buffer *ib,
+		     GLboolean index_bounds_valid,
 		     GLuint min_index,
 		     GLuint max_index );
 
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 73d6dea01e..765ae5a2fe 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -25,9 +25,9 @@
  * 
  **************************************************************************/
 
-#include <stdlib.h>
 
 #include "main/glheader.h"
+#include "main/bufferobj.h"
 #include "main/context.h"
 #include "main/state.h"
 #include "main/api_validate.h"
@@ -156,7 +156,13 @@ static GLuint byte_types_scale[5] = {
 };
 
 
-static GLuint get_surface_type( GLenum type, GLuint size, GLboolean normalized )
+/**
+ * Given vertex array type/size/format/normalized info, return
+ * the appropriate hardware surface type.
+ * Format will be GL_RGBA or possibly GL_BGRA for GLubyte[4] color arrays.
+ */
+static GLuint get_surface_type( GLenum type, GLuint size,
+                                GLenum format, GLboolean normalized )
 {
    if (INTEL_DEBUG & DEBUG_VERTS)
       _mesa_printf("type %s size %d normalized %d\n", 
@@ -171,11 +177,20 @@ static GLuint get_surface_type( GLenum type, GLuint size, GLboolean normalized )
       case GL_BYTE: return byte_types_norm[size];
       case GL_UNSIGNED_INT: return uint_types_norm[size];
       case GL_UNSIGNED_SHORT: return ushort_types_norm[size];
-      case GL_UNSIGNED_BYTE: return ubyte_types_norm[size];
+      case GL_UNSIGNED_BYTE:
+         if (format == GL_BGRA) {
+            /* See GL_EXT_vertex_array_bgra */
+            assert(size == 4);
+            return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+         }
+         else {
+            return ubyte_types_norm[size];
+         }
       default: assert(0); return 0;
       }      
    }
    else {
+      assert(format == GL_RGBA); /* sanity check */
       switch (type) {
       case GL_DOUBLE: return double_types[size];
       case GL_FLOAT: return float_types[size];
@@ -262,6 +277,7 @@ copy_array_to_vbo_array( struct brw_context *brw,
 			 struct brw_vertex_element *element,
 			 GLuint dst_stride)
 {
+   struct intel_context *intel = &brw->intel;
    GLuint size = element->count * dst_stride;
 
    get_space(brw, size, &element->bo, &element->offset);
@@ -274,29 +290,52 @@ copy_array_to_vbo_array( struct brw_context *brw,
    }
 
    if (dst_stride == element->glarray->StrideB) {
-      dri_bo_subdata(element->bo,
-		     element->offset,
-		     size,
-		     element->glarray->Ptr);
+      if (intel->intelScreen->kernel_exec_fencing) {
+	 drm_intel_gem_bo_map_gtt(element->bo);
+	 memcpy((char *)element->bo->virtual + element->offset,
+		element->glarray->Ptr, size);
+	 drm_intel_gem_bo_unmap_gtt(element->bo);
+      } else {
+	 dri_bo_subdata(element->bo,
+			element->offset,
+			size,
+			element->glarray->Ptr);
+      }
    } else {
-      void *data;
       char *dest;
-      const char *src = element->glarray->Ptr;
+      const unsigned char *src = element->glarray->Ptr;
       int i;
 
-      data = _mesa_malloc(dst_stride * element->count);
-      dest = data;
-      for (i = 0; i < element->count; i++) {
-	 memcpy(dest, src, dst_stride);
-	 src += element->glarray->StrideB;
-	 dest += dst_stride;
-      }
+      if (intel->intelScreen->kernel_exec_fencing) {
+	 drm_intel_gem_bo_map_gtt(element->bo);
+	 dest = element->bo->virtual;
+	 dest += element->offset;
+
+	 for (i = 0; i < element->count; i++) {
+	    memcpy(dest, src, dst_stride);
+	    src += element->glarray->StrideB;
+	    dest += dst_stride;
+	 }
 
-      dri_bo_subdata(element->bo,
-		     element->offset,
-		     size,
-		     data);
-      _mesa_free(data);
+	 drm_intel_gem_bo_unmap_gtt(element->bo);
+      } else {
+	 void *data;
+
+	 data = _mesa_malloc(dst_stride * element->count);
+	 dest = data;
+	 for (i = 0; i < element->count; i++) {
+	    memcpy(dest, src, dst_stride);
+	    src += element->glarray->StrideB;
+	    dest += dst_stride;
+	 }
+
+	 dri_bo_subdata(element->bo,
+			element->offset,
+			size,
+			data);
+
+	 _mesa_free(data);
+      }
    }
 }
 
@@ -304,16 +343,13 @@ static void brw_prepare_vertices(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = intel_context(ctx);
-   GLuint tmp = brw->vs.prog_data->inputs_read; 
+   GLbitfield vs_inputs = brw->vs.prog_data->inputs_read; 
    GLuint i;
    const unsigned char *ptr = NULL;
    GLuint interleave = 0;
    unsigned int min_index = brw->vb.min_index;
    unsigned int max_index = brw->vb.max_index;
 
-   struct brw_vertex_element *enabled[VERT_ATTRIB_MAX];
-   GLuint nr_enabled = 0;
-
    struct brw_vertex_element *upload[VERT_ATTRIB_MAX];
    GLuint nr_uploads = 0;
 
@@ -323,12 +359,13 @@ static void brw_prepare_vertices(struct brw_context *brw)
       _mesa_printf("%s %d..%d\n", __FUNCTION__, min_index, max_index);
 
    /* Accumulate the list of enabled arrays. */
-   while (tmp) {
-      GLuint i = _mesa_ffsll(tmp)-1;
+   brw->vb.nr_enabled = 0;
+   while (vs_inputs) {
+      GLuint i = _mesa_ffsll(vs_inputs) - 1;
       struct brw_vertex_element *input = &brw->vb.inputs[i];
 
-      tmp &= ~(1<<i);
-      enabled[nr_enabled++] = input;
+      vs_inputs &= ~(1 << i);
+      brw->vb.enabled[brw->vb.nr_enabled++] = input;
    }
 
    /* XXX: In the rare cases where this happens we fallback all
@@ -337,18 +374,17 @@ static void brw_prepare_vertices(struct brw_context *brw)
     * cases with > 17 vertex attributes enabled, so it probably
     * isn't an issue at this point.
     */
-   if (nr_enabled >= BRW_VEP_MAX) {
+   if (brw->vb.nr_enabled >= BRW_VEP_MAX) {
       intel->Fallback = 1;
       return;
    }
 
-   for (i = 0; i < nr_enabled; i++) {
-      struct brw_vertex_element *input = enabled[i];
+   for (i = 0; i < brw->vb.nr_enabled; i++) {
+      struct brw_vertex_element *input = brw->vb.enabled[i];
 
       input->element_size = get_size(input->glarray->Type) * input->glarray->Size;
-      input->count = input->glarray->StrideB ? max_index + 1 - min_index : 1;
 
-      if (input->glarray->BufferObj->Name != 0) {
+      if (_mesa_is_bufferobj(input->glarray->BufferObj)) {
 	 struct intel_buffer_object *intel_buffer =
 	    intel_buffer_object(input->glarray->BufferObj);
 
@@ -359,7 +395,23 @@ static void brw_prepare_vertices(struct brw_context *brw)
 	 dri_bo_reference(input->bo);
 	 input->offset = (unsigned long)input->glarray->Ptr;
 	 input->stride = input->glarray->StrideB;
+	 input->count = input->glarray->_MaxElement;
+
+	 /* This is a common place to reach if the user mistakenly supplies
+	  * a pointer in place of a VBO offset.  If we just let it go through,
+	  * we may end up dereferencing a pointer beyond the bounds of the
+	  * GTT.  We would hope that the VBO's max_index would save us, but
+	  * Mesa appears to hand us min/max values not clipped to the
+	  * array object's _MaxElement, and _MaxElement frequently appears
+	  * to be wrong anyway.
+	  *
+	  * The VBO spec allows application termination in this case, and it's
+	  * probably a service to the poor programmer to do so rather than
+	  * trying to just not render.
+	  */
+	 assert(input->offset < input->bo->size);
       } else {
+	 input->count = input->glarray->StrideB ? max_index + 1 - min_index : 1;
 	 if (input->bo != NULL) {
 	    /* Already-uploaded vertex data is present from a previous
 	     * prepare_vertices, but we had to re-validate state due to
@@ -371,7 +423,7 @@ static void brw_prepare_vertices(struct brw_context *brw)
 	 /* Queue the buffer object up to be uploaded in the next pass,
 	  * when we've decided if we're doing interleaved or not.
 	  */
-	 if (i == 0) {
+	 if (input->attrib == VERT_ATTRIB_POS) {
 	    /* Position array not properly enabled:
 	     */
             if (input->glarray->StrideB == 0) {
@@ -427,8 +479,8 @@ static void brw_prepare_vertices(struct brw_context *brw)
 
    brw_prepare_query_begin(brw);
 
-   for (i = 0; i < nr_enabled; i++) {
-      struct brw_vertex_element *input = enabled[i];
+   for (i = 0; i < brw->vb.nr_enabled; i++) {
+      struct brw_vertex_element *input = brw->vb.enabled[i];
 
       brw_add_validated_bo(brw, input->bo);
    }
@@ -438,34 +490,44 @@ static void brw_emit_vertices(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = intel_context(ctx);
-   GLuint tmp = brw->vs.prog_data->inputs_read;
-   struct brw_vertex_element *enabled[VERT_ATTRIB_MAX];
    GLuint i;
-   GLuint nr_enabled = 0;
 
-  /* Accumulate the list of enabled arrays. */
-   while (tmp) {
-      i = _mesa_ffsll(tmp)-1;
-      struct brw_vertex_element *input = &brw->vb.inputs[i];
+   brw_emit_query_begin(brw);
 
-      tmp &= ~(1<<i);
-      enabled[nr_enabled++] = input;
+   /* If the VS doesn't read any inputs (calculating vertex position from
+    * a state variable for some reason, for example), emit a single pad
+    * VERTEX_ELEMENT struct and bail.
+    *
+    * The stale VB state stays in place, but they don't do anything unless
+    * a VE loads from them.
+    */
+   if (brw->vb.nr_enabled == 0) {
+      BEGIN_BATCH(3, IGNORE_CLIPRECTS);
+      OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | 1);
+      OUT_BATCH((0 << BRW_VE0_INDEX_SHIFT) |
+		BRW_VE0_VALID |
+		(BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) |
+		(0 << BRW_VE0_SRC_OFFSET_SHIFT));
+      OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) |
+		(BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) |
+		(BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) |
+		(BRW_VE1_COMPONENT_STORE_1_FLT << BRW_VE1_COMPONENT_3_SHIFT));
+      ADVANCE_BATCH();
+      return;
    }
 
-   brw_emit_query_begin(brw);
-
    /* Now emit VB and VEP state packets.
     *
     * This still defines a hardware VB for each input, even if they
     * are interleaved or from the same VBO.  TBD if this makes a
     * performance difference.
     */
-   BEGIN_BATCH(1 + nr_enabled * 4, IGNORE_CLIPRECTS);
+   BEGIN_BATCH(1 + brw->vb.nr_enabled * 4, IGNORE_CLIPRECTS);
    OUT_BATCH((CMD_VERTEX_BUFFER << 16) |
-	     ((1 + nr_enabled * 4) - 2));
+	     ((1 + brw->vb.nr_enabled * 4) - 2));
 
-   for (i = 0; i < nr_enabled; i++) {
-      struct brw_vertex_element *input = enabled[i];
+   for (i = 0; i < brw->vb.nr_enabled; i++) {
+      struct brw_vertex_element *input = brw->vb.enabled[i];
 
       OUT_BATCH((i << BRW_VB0_INDEX_SHIFT) |
 		BRW_VB0_ACCESS_VERTEXDATA |
@@ -473,17 +535,30 @@ static void brw_emit_vertices(struct brw_context *brw)
       OUT_RELOC(input->bo,
 		I915_GEM_DOMAIN_VERTEX, 0,
 		input->offset);
-      OUT_BATCH(brw->vb.max_index);
+      if (BRW_IS_IGDNG(brw)) {
+          if (input->stride) {
+              OUT_RELOC(input->bo,
+                        I915_GEM_DOMAIN_VERTEX, 0,
+                        input->offset + input->stride * input->count);
+          } else {
+              assert(input->count == 1);
+              OUT_RELOC(input->bo,
+                        I915_GEM_DOMAIN_VERTEX, 0,
+                        input->offset + input->element_size);
+          }
+      } else
+          OUT_BATCH(input->stride ? input->count : 0);
       OUT_BATCH(0); /* Instance data step rate */
    }
    ADVANCE_BATCH();
 
-   BEGIN_BATCH(1 + nr_enabled * 2, IGNORE_CLIPRECTS);
-   OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | ((1 + nr_enabled * 2) - 2));
-   for (i = 0; i < nr_enabled; i++) {
-      struct brw_vertex_element *input = enabled[i];
+   BEGIN_BATCH(1 + brw->vb.nr_enabled * 2, IGNORE_CLIPRECTS);
+   OUT_BATCH((CMD_VERTEX_ELEMENT << 16) | ((1 + brw->vb.nr_enabled * 2) - 2));
+   for (i = 0; i < brw->vb.nr_enabled; i++) {
+      struct brw_vertex_element *input = brw->vb.enabled[i];
       uint32_t format = get_surface_type(input->glarray->Type,
 					 input->glarray->Size,
+					 input->glarray->Format,
 					 input->glarray->Normalized);
       uint32_t comp0 = BRW_VE1_COMPONENT_STORE_SRC;
       uint32_t comp1 = BRW_VE1_COMPONENT_STORE_SRC;
@@ -502,11 +577,18 @@ static void brw_emit_vertices(struct brw_context *brw)
 		BRW_VE0_VALID |
 		(format << BRW_VE0_FORMAT_SHIFT) |
 		(0 << BRW_VE0_SRC_OFFSET_SHIFT));
-      OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
-		(comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
-		(comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
-		(comp3 << BRW_VE1_COMPONENT_3_SHIFT) |
-		((i * 4) << BRW_VE1_DST_OFFSET_SHIFT));
+
+      if (BRW_IS_IGDNG(brw))
+          OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
+                    (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
+                    (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
+                    (comp3 << BRW_VE1_COMPONENT_3_SHIFT));
+      else
+          OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
+                    (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
+                    (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
+                    (comp3 << BRW_VE1_COMPONENT_3_SHIFT) |
+                    ((i * 4) << BRW_VE1_DST_OFFSET_SHIFT));
    }
    ADVANCE_BATCH();
 }
@@ -530,26 +612,36 @@ static void brw_prepare_indices(struct brw_context *brw)
    dri_bo *bo = NULL;
    struct gl_buffer_object *bufferobj;
    GLuint offset;
+   GLuint ib_type_size;
 
    if (index_buffer == NULL)
       return;
 
-   ib_size = get_size(index_buffer->type) * index_buffer->count;
+   ib_type_size = get_size(index_buffer->type);
+   ib_size = ib_type_size * index_buffer->count;
    bufferobj = index_buffer->obj;;
 
    /* Turn into a proper VBO:
     */
-   if (!bufferobj->Name) {
-     
+   if (!_mesa_is_bufferobj(bufferobj)) {
+      brw->ib.start_vertex_offset = 0;
+
       /* Get new bufferobj, offset:
        */
       get_space(brw, ib_size, &bo, &offset);
 
       /* Straight upload
        */
-      dri_bo_subdata(bo, offset, ib_size, index_buffer->ptr);
+      if (intel->intelScreen->kernel_exec_fencing) {
+	 drm_intel_gem_bo_map_gtt(bo);
+	 memcpy((char *)bo->virtual + offset, index_buffer->ptr, ib_size);
+	 drm_intel_gem_bo_unmap_gtt(bo);
+      } else {
+	 dri_bo_subdata(bo, offset, ib_size, index_buffer->ptr);
+      }
    } else {
-      offset = (GLuint)index_buffer->ptr;
+      offset = (GLuint) (unsigned long) index_buffer->ptr;
+      brw->ib.start_vertex_offset = 0;
 
       /* If the index buffer isn't aligned to its element size, we have to
        * rebase it into a temporary.
@@ -570,39 +662,62 @@ static void brw_prepare_indices(struct brw_context *brw)
 	  bo = intel_bufferobj_buffer(intel, intel_buffer_object(bufferobj),
 				      INTEL_READ);
 	  dri_bo_reference(bo);
+
+	  /* Use CMD_3D_PRIM's start_vertex_offset to avoid re-uploading
+	   * the index buffer state when we're just moving the start index
+	   * of our drawing.
+	   */
+	  brw->ib.start_vertex_offset = offset / ib_type_size;
+	  offset = 0;
+	  ib_size = bo->size;
        }
    }
 
-   dri_bo_unreference(brw->ib.bo);
-   brw->ib.bo = bo;
-   brw->ib.offset = offset;
+   if (brw->ib.bo != bo ||
+       brw->ib.offset != offset ||
+       brw->ib.size != ib_size)
+   {
+      drm_intel_bo_unreference(brw->ib.bo);
+      brw->ib.bo = bo;
+      brw->ib.offset = offset;
+      brw->ib.size = ib_size;
+
+      brw->state.dirty.brw |= BRW_NEW_INDEX_BUFFER;
+   } else {
+      drm_intel_bo_unreference(bo);
+   }
 
    brw_add_validated_bo(brw, brw->ib.bo);
 }
 
-static void brw_emit_indices(struct brw_context *brw)
+const struct brw_tracked_state brw_indices = {
+   .dirty = {
+      .mesa = 0,
+      .brw = BRW_NEW_INDICES,
+      .cache = 0,
+   },
+   .prepare = brw_prepare_indices,
+};
+
+static void brw_emit_index_buffer(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
-   GLuint ib_size;
 
    if (index_buffer == NULL)
       return;
 
-   ib_size = get_size(index_buffer->type) * index_buffer->count;
-
    /* Emit the indexbuffer packet:
     */
    {
       struct brw_indexbuffer ib;
 
       memset(&ib, 0, sizeof(ib));
-   
+
       ib.header.bits.opcode = CMD_INDEX_BUFFER;
       ib.header.bits.length = sizeof(ib)/4 - 2;
       ib.header.bits.index_format = get_index_type(index_buffer->type);
       ib.header.bits.cut_index_enable = 0;
-   
 
       BEGIN_BATCH(4, IGNORE_CLIPRECTS);
       OUT_BATCH( ib.header.dword );
@@ -611,18 +726,17 @@ static void brw_emit_indices(struct brw_context *brw)
 		brw->ib.offset);
       OUT_RELOC(brw->ib.bo,
 		I915_GEM_DOMAIN_VERTEX, 0,
-		brw->ib.offset + ib_size);
+		brw->ib.offset + brw->ib.size);
       OUT_BATCH( 0 );
       ADVANCE_BATCH();
    }
 }
 
-const struct brw_tracked_state brw_indices = {
+const struct brw_tracked_state brw_index_buffer = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_BATCH | BRW_NEW_INDICES,
+      .brw = BRW_NEW_BATCH | BRW_NEW_INDEX_BUFFER,
       .cache = 0,
    },
-   .prepare = brw_prepare_indices,
-   .emit = brw_emit_indices,
+   .emit = brw_emit_index_buffer,
 };
diff --git a/src/mesa/drivers/dri/i965/brw_eu.c b/src/mesa/drivers/dri/i965/brw_eu.c
index b3ae4eef33..1df561386e 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.c
+++ b/src/mesa/drivers/dri/i965/brw_eu.c
@@ -62,7 +62,7 @@ void brw_set_predicate_control( struct brw_compile *p, GLuint pc )
 
 void brw_set_conditionalmod( struct brw_compile *p, GLuint conditional )
 {
-   p->current->header.destreg__conditonalmod = conditional;
+   p->current->header.destreg__conditionalmod = conditional;
 }
 
 void brw_set_access_mode( struct brw_compile *p, GLuint access_mode )
@@ -129,3 +129,126 @@ const GLuint *brw_get_program( struct brw_compile *p,
    return (const GLuint *)p->store;
 }
 
+
+
+/**
+ * Subroutine calls require special attention.
+ * Mesa instructions may be expanded into multiple hardware instructions
+ * so the prog_instruction::BranchTarget field can't be used as an index
+ * into the hardware instructions.
+ *
+ * The BranchTarget field isn't needed, however.  Mesa's GLSL compiler
+ * emits CAL and BGNSUB instructions with labels that can be used to map
+ * subroutine calls to actual subroutine code blocks.
+ *
+ * The structures and functions here implement patching of CAL instructions
+ * so they jump to the right subroutine code...
+ */
+
+
+/**
+ * For each OPCODE_BGNSUB we create one of these.
+ */
+struct brw_glsl_label
+{
+   const char *name; /**< the label string */
+   GLuint position;  /**< the position of the brw instruction for this label */
+   struct brw_glsl_label *next;  /**< next in linked list */
+};
+
+
+/**
+ * For each OPCODE_CAL we create one of these.
+ */
+struct brw_glsl_call
+{
+   GLuint call_inst_pos;  /**< location of the CAL instruction */
+   const char *sub_name;  /**< name of subroutine to call */
+   struct brw_glsl_call *next;  /**< next in linked list */
+};
+
+
+/**
+ * Called for each OPCODE_BGNSUB.
+ */
+void
+brw_save_label(struct brw_compile *c, const char *name, GLuint position)
+{
+   struct brw_glsl_label *label = CALLOC_STRUCT(brw_glsl_label);
+   label->name = name;
+   label->position = position;
+   label->next = c->first_label;
+   c->first_label = label;
+}
+
+
+/**
+ * Called for each OPCODE_CAL.
+ */
+void
+brw_save_call(struct brw_compile *c, const char *name, GLuint call_pos)
+{
+   struct brw_glsl_call *call = CALLOC_STRUCT(brw_glsl_call);
+   call->call_inst_pos = call_pos;
+   call->sub_name = name;
+   call->next = c->first_call;
+   c->first_call = call;
+}
+
+
+/**
+ * Lookup a label, return label's position/offset.
+ */
+static GLuint
+brw_lookup_label(struct brw_compile *c, const char *name)
+{
+   const struct brw_glsl_label *label;
+   for (label = c->first_label; label; label = label->next) {
+      if (strcmp(name, label->name) == 0) {
+         return label->position;
+      }
+   }
+   abort();  /* should never happen */
+   return ~0;
+}
+
+
+/**
+ * When we're done generating code, this function is called to resolve
+ * subroutine calls.
+ */
+void
+brw_resolve_cals(struct brw_compile *c)
+{
+    const struct brw_glsl_call *call;
+
+    for (call = c->first_call; call; call = call->next) {
+        const GLuint sub_loc = brw_lookup_label(c, call->sub_name);
+	struct brw_instruction *brw_call_inst = &c->store[call->call_inst_pos];
+	struct brw_instruction *brw_sub_inst = &c->store[sub_loc];
+	GLint offset = brw_sub_inst - brw_call_inst;
+
+	/* patch brw_call_inst to point to brw_sub_inst */
+	brw_set_src1(brw_call_inst, brw_imm_d(offset * 16));
+    }
+
+    /* free linked list of calls */
+    {
+        struct brw_glsl_call *call, *next;
+        for (call = c->first_call; call; call = next) {
+	    next = call->next;
+	    _mesa_free(call);
+	}
+	c->first_call = NULL;
+    }
+
+    /* free linked list of labels */
+    {
+        struct brw_glsl_label *label, *next;
+	for (label = c->first_label; label; label = next) {
+	    next = label->next;
+	    _mesa_free(label);
+	}
+	c->first_label = NULL;
+    }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 9e2b39af9b..30603bdd0e 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -91,8 +91,13 @@ struct brw_indirect {
 };
 
 
+struct brw_glsl_label;
+struct brw_glsl_call;
+
+
+
 #define BRW_EU_MAX_INSN_STACK 5
-#define BRW_EU_MAX_INSN 1200
+#define BRW_EU_MAX_INSN 10000
 
 struct brw_compile {
    struct brw_instruction store[BRW_EU_MAX_INSN];
@@ -106,9 +111,22 @@ struct brw_compile {
    GLuint flag_value;
    GLboolean single_program_flow;
    struct brw_context *brw;
+
+   struct brw_glsl_label *first_label;  /**< linked list of labels */
+   struct brw_glsl_call *first_call;    /**< linked list of CALs */
 };
 
 
+void
+brw_save_label(struct brw_compile *c, const char *name, GLuint position);
+
+void
+brw_save_call(struct brw_compile *c, const char *name, GLuint call_pos);
+
+void
+brw_resolve_cals(struct brw_compile *c);
+
+
 
 static INLINE int type_sz( GLuint type )
 {
@@ -152,6 +170,13 @@ static INLINE struct brw_reg brw_reg( GLuint file,
                                       GLuint writemask )
 {
    struct brw_reg reg;
+   /* Range-check the register number against the register *file* it
+    * belongs to.  The test must use 'file'; 'type' is the data type.
+    */
+   assert(file != BRW_GENERAL_REGISTER_FILE || nr < BRW_MAX_GRF);
+   assert(file != BRW_MESSAGE_REGISTER_FILE || nr < BRW_MAX_MRF);
+   assert(file != BRW_ARCHITECTURE_REGISTER_FILE || nr <= BRW_ARF_IP);
+
    reg.type = type;
    reg.file = file;
    reg.nr = nr;
@@ -513,6 +538,7 @@ static INLINE struct brw_reg brw_mask_reg( GLuint subnr )
 
 static INLINE struct brw_reg brw_message_reg( GLuint nr )
 {
+   assert(nr < BRW_MAX_MRF);
    return brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE,
 		       nr,
 		       0);
@@ -705,6 +731,13 @@ static INLINE struct brw_indirect brw_indirect( GLuint addr_subnr, GLint offset
    return ptr;
 }
 
+/** Do two brw_regs refer to the same register?  Only file and nr are compared. */
+static INLINE GLboolean
+brw_same_reg(struct brw_reg r1, struct brw_reg r2)
+{
+   return r1.file == r2.file && r1.nr == r2.nr;
+}
+
 static INLINE struct brw_instruction *current_insn( struct brw_compile *p)
 {
    return &p->store[p->nr_insn];
@@ -783,6 +816,19 @@ void brw_urb_WRITE(struct brw_compile *p,
 		   GLuint offset,
 		   GLuint swizzle);
 
+void brw_ff_sync(struct brw_compile *p,
+		   struct brw_reg dest,
+		   GLuint msg_reg_nr,
+		   struct brw_reg src0,
+		   GLboolean allocate,
+		   GLboolean used,
+		   GLuint msg_length,
+		   GLuint response_length,
+		   GLboolean eot,
+		   GLboolean writes_complete,
+		   GLuint offset,
+		   GLuint swizzle);
+
 void brw_fb_WRITE(struct brw_compile *p,
 		   struct brw_reg dest,
 		   GLuint msg_reg_nr,
@@ -802,7 +848,9 @@ void brw_SAMPLE(struct brw_compile *p,
 		GLuint msg_type,
 		GLuint response_length,
 		GLuint msg_length,
-		GLboolean eot);
+		GLboolean eot,
+		GLuint header_present,
+		GLuint simd_mode);
 
 void brw_math_16( struct brw_compile *p,
 		  struct brw_reg dest,
@@ -823,12 +871,24 @@ void brw_math( struct brw_compile *p,
 
 void brw_dp_READ_16( struct brw_compile *p,
 		     struct brw_reg dest,
-		     GLuint msg_reg_nr,
 		     GLuint scratch_offset );
 
+void brw_dp_READ_4( struct brw_compile *p,
+                    struct brw_reg dest,
+                    GLboolean relAddr,
+                    GLuint location,
+                    GLuint bind_table_index );
+
+void brw_dp_READ_4_vs( struct brw_compile *p,
+                       struct brw_reg dest,
+                       GLuint oword,
+                       GLboolean relAddr,
+                       struct brw_reg addrReg,
+                       GLuint location,
+                       GLuint bind_table_index );
+
 void brw_dp_WRITE_16( struct brw_compile *p,
 		      struct brw_reg src,
-		      GLuint msg_reg_nr,
 		      GLuint scratch_offset );
 
 /* If/else/endif.  Works by manipulating the execution flags on each
diff --git a/src/mesa/drivers/dri/i965/brw_eu_debug.c b/src/mesa/drivers/dri/i965/brw_eu_debug.c
index 91dbbd5af6..29f3f6d02f 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_debug.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_debug.c
@@ -65,6 +65,7 @@ void brw_print_reg( struct brw_reg hwreg )
        hwreg.width == BRW_WIDTH_8 &&
        hwreg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
        hwreg.type == BRW_REGISTER_TYPE_F) {
+      /* vector register */
       _mesa_printf("vec%d", hwreg.nr);
    }
    else if (hwreg.file == BRW_GENERAL_REGISTER_FILE &&
@@ -72,8 +73,12 @@ void brw_print_reg( struct brw_reg hwreg )
 	    hwreg.width == BRW_WIDTH_1 &&
 	    hwreg.hstride == BRW_HORIZONTAL_STRIDE_0 &&
 	    hwreg.type == BRW_REGISTER_TYPE_F) {      
+      /* "scalar" register */
       _mesa_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4);
    }
+   else if (hwreg.file == BRW_IMMEDIATE_VALUE) {
+      _mesa_printf("imm %f", hwreg.dw1.f);
+   }
    else {
       _mesa_printf("%s%d.%d<%d;%d,%d>:%s", 
 		   file[hwreg.file],
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 4e099b5945..241cdc33f8 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -55,6 +55,9 @@ static void guess_execution_size( struct brw_instruction *insn,
 static void brw_set_dest( struct brw_instruction *insn,
 			  struct brw_reg dest )
 {
+   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)   /* test the register file, not the data type */
+      assert(dest.nr < 128);
+
    insn->bits1.da1.dest_reg_file = dest.file;
    insn->bits1.da1.dest_reg_type = dest.type;
    insn->bits1.da1.dest_address_mode = dest.address_mode;
@@ -96,10 +99,13 @@ static void brw_set_dest( struct brw_instruction *insn,
 }
 
 static void brw_set_src0( struct brw_instruction *insn,
-		      struct brw_reg reg )
+                          struct brw_reg reg )
 {
    assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
 
+   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)   /* test the register file, not the data type */
+      assert(reg.nr < 128);
+
    insn->bits1.da1.src0_reg_file = reg.file;
    insn->bits1.da1.src0_reg_type = reg.type;
    insn->bits2.da1.src0_abs = reg.abs;
@@ -169,10 +175,12 @@ static void brw_set_src0( struct brw_instruction *insn,
 
 
 void brw_set_src1( struct brw_instruction *insn,
-			  struct brw_reg reg )
+                   struct brw_reg reg )
 {
    assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
 
+   assert(reg.nr < 128);
+
    insn->bits1.da1.src1_reg_file = reg.file;
    insn->bits1.da1.src1_reg_type = reg.type;
    insn->bits3.da1.src1_abs = reg.abs;
@@ -233,7 +241,8 @@ void brw_set_src1( struct brw_instruction *insn,
 
 
 
-static void brw_set_math_message( struct brw_instruction *insn,
+static void brw_set_math_message( struct brw_context *brw,
+				  struct brw_instruction *insn,
 				  GLuint msg_length,
 				  GLuint response_length,
 				  GLuint function,
@@ -244,18 +253,35 @@ static void brw_set_math_message( struct brw_instruction *insn,
 {
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->bits3.math.function = function;
-   insn->bits3.math.int_type = integer_type;
-   insn->bits3.math.precision = low_precision;
-   insn->bits3.math.saturate = saturate;
-   insn->bits3.math.data_type = dataType;
-   insn->bits3.math.response_length = response_length;
-   insn->bits3.math.msg_length = msg_length;
-   insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
-   insn->bits3.math.end_of_thread = 0;
+   if (BRW_IS_IGDNG(brw)) {
+       insn->bits3.math_igdng.function = function;
+       insn->bits3.math_igdng.int_type = integer_type;
+       insn->bits3.math_igdng.precision = low_precision;
+       insn->bits3.math_igdng.saturate = saturate;
+       insn->bits3.math_igdng.data_type = dataType;
+       insn->bits3.math_igdng.snapshot = 0;
+       insn->bits3.math_igdng.header_present = 0;
+       insn->bits3.math_igdng.response_length = response_length;
+       insn->bits3.math_igdng.msg_length = msg_length;
+       insn->bits3.math_igdng.end_of_thread = 0;
+       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_MATH;
+       insn->bits2.send_igdng.end_of_thread = 0;
+   } else {
+       insn->bits3.math.function = function;
+       insn->bits3.math.int_type = integer_type;
+       insn->bits3.math.precision = low_precision;
+       insn->bits3.math.saturate = saturate;
+       insn->bits3.math.data_type = dataType;
+       insn->bits3.math.response_length = response_length;
+       insn->bits3.math.msg_length = msg_length;
+       insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
+       insn->bits3.math.end_of_thread = 0;
+   }
 }
 
-static void brw_set_urb_message( struct brw_instruction *insn,
+
+static void brw_set_ff_sync_message( struct brw_context *brw,
+				 struct brw_instruction *insn,
 				 GLboolean allocate,
 				 GLboolean used,
 				 GLuint msg_length,
@@ -265,21 +291,64 @@ static void brw_set_urb_message( struct brw_instruction *insn,
 				 GLuint offset,
 				 GLuint swizzle_control )
 {
-   brw_set_src1(insn, brw_imm_d(0));
+	brw_set_src1(insn, brw_imm_d(0));
+
+	insn->bits3.urb_igdng.opcode = 1;
+	insn->bits3.urb_igdng.offset = offset;
+	insn->bits3.urb_igdng.swizzle_control = swizzle_control;
+	insn->bits3.urb_igdng.allocate = allocate;
+	insn->bits3.urb_igdng.used = used;
+	insn->bits3.urb_igdng.complete = complete;
+	insn->bits3.urb_igdng.header_present = 1;
+	insn->bits3.urb_igdng.response_length = response_length;
+	insn->bits3.urb_igdng.msg_length = msg_length;
+	insn->bits3.urb_igdng.end_of_thread = end_of_thread;
+	insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB;
+	insn->bits2.send_igdng.end_of_thread = end_of_thread;
+}
 
-   insn->bits3.urb.opcode = 0;	/* ? */
-   insn->bits3.urb.offset = offset;
-   insn->bits3.urb.swizzle_control = swizzle_control;
-   insn->bits3.urb.allocate = allocate;
-   insn->bits3.urb.used = used;	/* ? */
-   insn->bits3.urb.complete = complete;
-   insn->bits3.urb.response_length = response_length;
-   insn->bits3.urb.msg_length = msg_length;
-   insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
-   insn->bits3.urb.end_of_thread = end_of_thread;
+static void brw_set_urb_message( struct brw_context *brw,
+				 struct brw_instruction *insn,
+				 GLboolean allocate,
+				 GLboolean used,
+				 GLuint msg_length,
+				 GLuint response_length,
+				 GLboolean end_of_thread,
+				 GLboolean complete,
+				 GLuint offset,
+				 GLuint swizzle_control )
+{
+    brw_set_src1(insn, brw_imm_d(0));
+
+    if (BRW_IS_IGDNG(brw)) {
+        insn->bits3.urb_igdng.opcode = 0;	/* ? */
+        insn->bits3.urb_igdng.offset = offset;
+        insn->bits3.urb_igdng.swizzle_control = swizzle_control;
+        insn->bits3.urb_igdng.allocate = allocate;
+        insn->bits3.urb_igdng.used = used;	/* ? */
+        insn->bits3.urb_igdng.complete = complete;
+        insn->bits3.urb_igdng.header_present = 1;
+        insn->bits3.urb_igdng.response_length = response_length;
+        insn->bits3.urb_igdng.msg_length = msg_length;
+        insn->bits3.urb_igdng.end_of_thread = end_of_thread;
+        insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB;
+        insn->bits2.send_igdng.end_of_thread = end_of_thread;
+    } else {
+        insn->bits3.urb.opcode = 0;	/* ? */
+        insn->bits3.urb.offset = offset;
+        insn->bits3.urb.swizzle_control = swizzle_control;
+        insn->bits3.urb.allocate = allocate;
+        insn->bits3.urb.used = used;	/* ? */
+        insn->bits3.urb.complete = complete;
+        insn->bits3.urb.response_length = response_length;
+        insn->bits3.urb.msg_length = msg_length;
+        insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
+        insn->bits3.urb.end_of_thread = end_of_thread;
+    }
 }
 
-static void brw_set_dp_write_message( struct brw_instruction *insn,
+static void brw_set_dp_write_message( struct brw_context *brw,
+				      struct brw_instruction *insn,
 				      GLuint binding_table_index,
 				      GLuint msg_control,
 				      GLuint msg_type,
@@ -290,18 +359,33 @@ static void brw_set_dp_write_message( struct brw_instruction *insn,
 {
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->bits3.dp_write.binding_table_index = binding_table_index;
-   insn->bits3.dp_write.msg_control = msg_control;
-   insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
-   insn->bits3.dp_write.msg_type = msg_type;
-   insn->bits3.dp_write.send_commit_msg = 0;
-   insn->bits3.dp_write.response_length = response_length;
-   insn->bits3.dp_write.msg_length = msg_length;
-   insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
-   insn->bits3.urb.end_of_thread = end_of_thread;
+   if (BRW_IS_IGDNG(brw)) {
+       insn->bits3.dp_write_igdng.binding_table_index = binding_table_index;
+       insn->bits3.dp_write_igdng.msg_control = msg_control;
+       insn->bits3.dp_write_igdng.pixel_scoreboard_clear = pixel_scoreboard_clear;
+       insn->bits3.dp_write_igdng.msg_type = msg_type;
+       insn->bits3.dp_write_igdng.send_commit_msg = 0;
+       insn->bits3.dp_write_igdng.header_present = 1;
+       insn->bits3.dp_write_igdng.response_length = response_length;
+       insn->bits3.dp_write_igdng.msg_length = msg_length;
+       insn->bits3.dp_write_igdng.end_of_thread = end_of_thread;
+       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
+       insn->bits2.send_igdng.end_of_thread = end_of_thread;
+   } else {
+       insn->bits3.dp_write.binding_table_index = binding_table_index;
+       insn->bits3.dp_write.msg_control = msg_control;
+       insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
+       insn->bits3.dp_write.msg_type = msg_type;
+       insn->bits3.dp_write.send_commit_msg = 0;
+       insn->bits3.dp_write.response_length = response_length;
+       insn->bits3.dp_write.msg_length = msg_length;
+       insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
+       insn->bits3.dp_write.end_of_thread = end_of_thread;
+   }
 }
 
-static void brw_set_dp_read_message( struct brw_instruction *insn,
+static void brw_set_dp_read_message( struct brw_context *brw,
+				      struct brw_instruction *insn,
 				      GLuint binding_table_index,
 				      GLuint msg_control,
 				      GLuint msg_type,
@@ -312,28 +396,57 @@ static void brw_set_dp_read_message( struct brw_instruction *insn,
 {
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->bits3.dp_read.binding_table_index = binding_table_index;
-   insn->bits3.dp_read.msg_control = msg_control;
-   insn->bits3.dp_read.msg_type = msg_type;
-   insn->bits3.dp_read.target_cache = target_cache;
-   insn->bits3.dp_read.response_length = response_length;
-   insn->bits3.dp_read.msg_length = msg_length;
-   insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ;
-   insn->bits3.dp_read.end_of_thread = end_of_thread;
+   if (BRW_IS_IGDNG(brw)) {
+       insn->bits3.dp_read_igdng.binding_table_index = binding_table_index;
+       insn->bits3.dp_read_igdng.msg_control = msg_control;
+       insn->bits3.dp_read_igdng.msg_type = msg_type;
+       insn->bits3.dp_read_igdng.target_cache = target_cache;
+       insn->bits3.dp_read_igdng.header_present = 1;
+       insn->bits3.dp_read_igdng.response_length = response_length;
+       insn->bits3.dp_read_igdng.msg_length = msg_length;
+       insn->bits3.dp_read_igdng.pad1 = 0;
+       insn->bits3.dp_read_igdng.end_of_thread = end_of_thread;
+       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
+       insn->bits2.send_igdng.end_of_thread = end_of_thread;
+   } else {
+       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
+       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
+       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
+       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
+       insn->bits3.dp_read.response_length = response_length;  /*16:19*/
+       insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
+       insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
+       insn->bits3.dp_read.pad1 = 0;  /*28:30*/
+       insn->bits3.dp_read.end_of_thread = end_of_thread;  /*31*/
+   }
 }
 
 static void brw_set_sampler_message(struct brw_context *brw,
-                 struct brw_instruction *insn,
-				     GLuint binding_table_index,
-				     GLuint sampler,
-				     GLuint msg_type,
-				     GLuint response_length,
-				     GLuint msg_length,
-				     GLboolean eot)
+                                    struct brw_instruction *insn,
+                                    GLuint binding_table_index,
+                                    GLuint sampler,
+                                    GLuint msg_type,
+                                    GLuint response_length,
+                                    GLuint msg_length,
+                                    GLboolean eot,
+                                    GLuint header_present,
+                                    GLuint simd_mode)
 {
+   assert(eot == 0);
    brw_set_src1(insn, brw_imm_d(0));
 
-   if (BRW_IS_G4X(brw)) {
+   if (BRW_IS_IGDNG(brw)) {
+      insn->bits3.sampler_igdng.binding_table_index = binding_table_index;
+      insn->bits3.sampler_igdng.sampler = sampler;
+      insn->bits3.sampler_igdng.msg_type = msg_type;
+      insn->bits3.sampler_igdng.simd_mode = simd_mode;
+      insn->bits3.sampler_igdng.header_present = header_present;
+      insn->bits3.sampler_igdng.response_length = response_length;
+      insn->bits3.sampler_igdng.msg_length = msg_length;
+      insn->bits3.sampler_igdng.end_of_thread = eot;
+      insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_SAMPLER;
+      insn->bits2.send_igdng.end_of_thread = eot;
+   } else if (BRW_IS_G4X(brw)) {
       insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
       insn->bits3.sampler_g4x.sampler = sampler;
       insn->bits3.sampler_g4x.msg_type = msg_type;
@@ -368,8 +481,8 @@ static struct brw_instruction *next_insn( struct brw_compile *p,
    /* Reset this one-shot flag: 
     */
 
-   if (p->current->header.destreg__conditonalmod) {
-      p->current->header.destreg__conditonalmod = 0;   
+   if (p->current->header.destreg__conditionalmod) {
+      p->current->header.destreg__conditionalmod = 0;
       p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
    }
 
@@ -407,7 +520,7 @@ static struct brw_instruction *brw_alu2(struct brw_compile *p,
  * Convenience routines.
  */
 #define ALU1(OP)					\
-struct brw_instruction *brw_##OP(struct brw_compile *p,			\
+struct brw_instruction *brw_##OP(struct brw_compile *p,	\
 	      struct brw_reg dest,			\
 	      struct brw_reg src0)   			\
 {							\
@@ -415,7 +528,7 @@ struct brw_instruction *brw_##OP(struct brw_compile *p,			\
 }
 
 #define ALU2(OP)					\
-struct brw_instruction *brw_##OP(struct brw_compile *p,			\
+struct brw_instruction *brw_##OP(struct brw_compile *p,	\
 	      struct brw_reg dest,			\
 	      struct brw_reg src0,			\
 	      struct brw_reg src1)   			\
@@ -469,12 +582,16 @@ void brw_NOP(struct brw_compile *p)
  */
 
 struct brw_instruction *brw_JMPI(struct brw_compile *p, 
-	      struct brw_reg dest,
-	      struct brw_reg src0,
-	      struct brw_reg src1)
+                                 struct brw_reg dest,
+                                 struct brw_reg src0,
+                                 struct brw_reg src1)
 {
    struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
 
+   insn->header.execution_size = 1;
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.mask_control = BRW_MASK_DISABLE;
+
    p->current->header.predicate_control = BRW_PREDICATE_NONE;
 
    return insn;
@@ -531,6 +648,10 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
 				 struct brw_instruction *if_insn)
 {
    struct brw_instruction *insn;
+   GLuint br = 1;
+
+   if (BRW_IS_IGDNG(p->brw))
+      br = 2;
 
    if (p->single_program_flow) {
       insn = next_insn(p, BRW_OPCODE_ADD);
@@ -557,8 +678,8 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
    } else {
       assert(if_insn->header.opcode == BRW_OPCODE_IF);
 
-      if_insn->bits3.if_else.jump_count = insn - if_insn;
-      if_insn->bits3.if_else.pop_count = 1;
+      if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
+      if_insn->bits3.if_else.pop_count = 0;
       if_insn->bits3.if_else.pad0 = 0;
    }
 
@@ -568,6 +689,11 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
 void brw_ENDIF(struct brw_compile *p, 
 	       struct brw_instruction *patch_insn)
 {
+   GLuint br = 1;
+
+   if (BRW_IS_IGDNG(p->brw))
+      br = 2; 
+ 
    if (p->single_program_flow) {
       /* In single program flow mode, there's no need to execute an ENDIF,
        * since we don't need to do any stack operations, and if we're executing
@@ -599,11 +725,11 @@ void brw_ENDIF(struct brw_compile *p,
 	 /* Automagically turn it into an IFF:
 	  */
 	 patch_insn->header.opcode = BRW_OPCODE_IFF;
-	 patch_insn->bits3.if_else.jump_count = insn - patch_insn + 1;
+	 patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
 	 patch_insn->bits3.if_else.pop_count = 0;
 	 patch_insn->bits3.if_else.pad0 = 0;
       } else if (patch_insn->header.opcode == BRW_OPCODE_ELSE) {
-	 patch_insn->bits3.if_else.jump_count = insn - patch_insn + 1;
+	 patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
 	 patch_insn->bits3.if_else.pop_count = 1;
 	 patch_insn->bits3.if_else.pad0 = 0;
       } else {
@@ -674,9 +800,13 @@ struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
 
 
 struct brw_instruction *brw_WHILE(struct brw_compile *p, 
-	       struct brw_instruction *do_insn)
+                                  struct brw_instruction *do_insn)
 {
    struct brw_instruction *insn;
+   GLuint br = 1;
+
+   if (BRW_IS_IGDNG(p->brw))
+      br = 2;
 
    if (p->single_program_flow)
       insn = next_insn(p, BRW_OPCODE_ADD);
@@ -697,7 +827,7 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
       insn->header.execution_size = do_insn->header.execution_size;
 
       assert(do_insn->header.opcode == BRW_OPCODE_DO);
-      insn->bits3.if_else.jump_count = do_insn - insn + 1;
+      insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
       insn->bits3.if_else.pop_count = 0;
       insn->bits3.if_else.pad0 = 0;
    }
@@ -716,11 +846,15 @@ void brw_land_fwd_jump(struct brw_compile *p,
 		       struct brw_instruction *jmp_insn)
 {
    struct brw_instruction *landing = &p->store[p->nr_insn];
+   GLuint jmpi = 1;
+
+   if (BRW_IS_IGDNG(p->brw))
+       jmpi = 2;
 
    assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
-   assert(jmp_insn->bits1.da1.src1_reg_file = BRW_IMMEDIATE_VALUE);
+   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE); /* check only; '=' used to overwrite the field */
 
-   jmp_insn->bits3.ud = (landing - jmp_insn) - 1; 
+   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
 }
 
 
@@ -737,7 +871,7 @@ void brw_CMP(struct brw_compile *p,
 {
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
 
-   insn->header.destreg__conditonalmod = conditional;
+   insn->header.destreg__conditionalmod = conditional;
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, src1);
@@ -762,7 +896,7 @@ void brw_CMP(struct brw_compile *p,
  * Helpers for the various SEND message types:
  */
 
-/* Invert 8 values
+/** Extended math function, float[8].
  */
 void brw_math( struct brw_compile *p,
 	       struct brw_reg dest,
@@ -781,11 +915,12 @@ void brw_math( struct brw_compile *p,
     * instructions.
     */
    insn->header.predicate_control = 0; 
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
 
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src);
-   brw_set_math_message(insn, 
+   brw_set_math_message(p->brw,
+			insn, 
 			msg_length, response_length, 
 			function,
 			BRW_MATH_INTEGER_UNSIGNED,
@@ -794,7 +929,9 @@ void brw_math( struct brw_compile *p,
 			data_type);
 }
 
-/* Use 2 send instructions to invert 16 elements
+/**
+ * Extended math function, float[16].
+ * Use 2 send instructions.
  */
 void brw_math_16( struct brw_compile *p,
 		  struct brw_reg dest,
@@ -815,11 +952,12 @@ void brw_math_16( struct brw_compile *p,
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 
    insn = next_insn(p, BRW_OPCODE_SEND);
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
 
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src);
-   brw_set_math_message(insn, 
+   brw_set_math_message(p->brw,
+			insn, 
 			msg_length, response_length, 
 			function,
 			BRW_MATH_INTEGER_UNSIGNED,
@@ -831,11 +969,12 @@ void brw_math_16( struct brw_compile *p,
     */
    insn = next_insn(p, BRW_OPCODE_SEND);
    insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
-   insn->header.destreg__conditonalmod = msg_reg_nr+1;
+   insn->header.destreg__conditionalmod = msg_reg_nr+1;
 
    brw_set_dest(insn, offset(dest,1));
    brw_set_src0(insn, src);
-   brw_set_math_message(insn, 
+   brw_set_math_message(p->brw, 
+			insn, 
 			msg_length, response_length, 
 			function,
 			BRW_MATH_INTEGER_UNSIGNED,
@@ -847,22 +986,26 @@ void brw_math_16( struct brw_compile *p,
 }
 
 
-
-
+/**
+ * Write block of 16 dwords/floats to the data port Render Cache scratch buffer.
+ * Scratch offset should be a multiple of 64.
+ * Used for register spilling.
+ */
 void brw_dp_WRITE_16( struct brw_compile *p,
 		      struct brw_reg src,
-		      GLuint msg_reg_nr,
 		      GLuint scratch_offset )
 {
+   GLuint msg_reg_nr = 1;
    {
       brw_push_insn_state(p);
       brw_set_mask_control(p, BRW_MASK_DISABLE);
       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 
+      /* set message header global offset field (reg 0, element 2) */
       brw_MOV(p,
 	      retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
 	      brw_imm_d(scratch_offset));
-			   
+
       brw_pop_insn_state(p);
    }
 
@@ -873,13 +1016,14 @@ void brw_dp_WRITE_16( struct brw_compile *p,
    
       insn->header.predicate_control = 0; /* XXX */
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
   
       brw_set_dest(insn, dest);
       brw_set_src0(insn, src);
 
-      brw_set_dp_write_message(insn,
-			       255, /* bti */
+      brw_set_dp_write_message(p->brw,
+			       insn,
+			       255, /* binding table index (255=stateless) */
 			       BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */
 			       BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
 			       msg_length,
@@ -887,24 +1031,29 @@ void brw_dp_WRITE_16( struct brw_compile *p,
 			       0, /* response_length */
 			       0); /* eot */
    }
-
 }
 
 
+/**
+ * Read block of 16 dwords/floats from the data port Render Cache scratch buffer.
+ * Scratch offset should be a multiple of 64.
+ * Used for register spilling.
+ */
 void brw_dp_READ_16( struct brw_compile *p,
 		      struct brw_reg dest,
-		      GLuint msg_reg_nr,
 		      GLuint scratch_offset )
 {
+   GLuint msg_reg_nr = 1;
    {
       brw_push_insn_state(p);
       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
       brw_set_mask_control(p, BRW_MASK_DISABLE);
 
+      /* set message header global offset field (reg 0, element 2) */
       brw_MOV(p,
 	      retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D),
 	      brw_imm_d(scratch_offset));
-			   
+
       brw_pop_insn_state(p);
    }
 
@@ -913,16 +1062,17 @@ void brw_dp_READ_16( struct brw_compile *p,
    
       insn->header.predicate_control = 0; /* XXX */
       insn->header.compression_control = BRW_COMPRESSION_NONE; 
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
   
       brw_set_dest(insn, dest);	/* UW? */
       brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
 
-      brw_set_dp_read_message(insn,
-			      255, /* bti */
-			      3,  /* msg_control */
+      brw_set_dp_read_message(p->brw,
+			      insn,
+			      255, /* binding table index (255=stateless) */
+			      3,  /* msg_control (3 means 4 Owords) */
 			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
-			      1, /* target cache */
+			      1, /* target cache (render/scratch) */
 			      1, /* msg_length */
 			      2, /* response_length */
 			      0); /* eot */
@@ -930,24 +1080,156 @@ void brw_dp_READ_16( struct brw_compile *p,
 }
 
 
+/**
+ * Read a float[4] vector from the data port Data Cache (const buffer).
+ * Location (in buffer) should be a multiple of 16.
+ * Used for fetching shader constants.
+ * Note: relAddr is accepted but currently ignored (indirect fetch is TODO).
+ */
+void brw_dp_READ_4( struct brw_compile *p,
+                    struct brw_reg dest,
+                    GLboolean relAddr,
+                    GLuint location,
+                    GLuint bind_table_index )
+{
+   /* XXX: relAddr not implemented; the fetch always uses 'location' */
+   GLuint msg_reg_nr = 1;
+   {
+      struct brw_reg b;
+      brw_push_insn_state(p);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+
+      /* Set up MRF[1] with location/offset into const buffer */
+      b = brw_message_reg(msg_reg_nr);
+      b = retype(b, BRW_REGISTER_TYPE_UD);
+      /* XXX I think we're setting all the dwords of MRF[1] to 'location',
+       * when the docs say only dword[2] should be set.  Hmmm.  But it works.
+       */
+      brw_MOV(p, b, brw_imm_ud(location));
+      brw_pop_insn_state(p);
+   }
+
+   {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   
+      insn->header.predicate_control = BRW_PREDICATE_NONE;
+      insn->header.compression_control = BRW_COMPRESSION_NONE; 
+      insn->header.destreg__conditionalmod = msg_reg_nr;
+      insn->header.mask_control = BRW_MASK_DISABLE;
+  
+      /* cast dest to a uword[8] vector */
+      dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
+
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, brw_null_reg());
+
+      brw_set_dp_read_message(p->brw,
+			      insn,
+			      bind_table_index,
+			      0,  /* msg_control (0 means 1 Oword) */
+			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+			      0, /* source cache = data cache */
+			      1, /* msg_length */
+			      1, /* response_length (1 Oword) */
+			      0); /* eot */
+   }
+}
+
+
+/**
+ * Read float[4] constant(s) from VS constant buffer.
+ * For relative addressing, two float[4] constants will be read into 'dest'.
+ * Otherwise, one float[4] constant will be read into the lower half of 'dest'.
+ */
+void brw_dp_READ_4_vs(struct brw_compile *p,
+                      struct brw_reg dest,
+                      GLuint oword,
+                      GLboolean relAddr,
+                      struct brw_reg addrReg,
+                      GLuint location,
+                      GLuint bind_table_index)
+{
+   GLuint msg_reg_nr = 1;
+
+   assert(oword < 2);
+   /* 'oword' (0 = lower Oword, 1 = upper Oword) is handed to the read
+    * message below as its msg_control value; the assert above rejects
+    * anything else.
+    */
+
+   /* Set up MRF[1] with location/offset into const buffer */
+   {
+      struct brw_reg b;
+
+      brw_push_insn_state(p);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+      /*brw_set_access_mode(p, BRW_ALIGN_16);*/
+
+      /* XXX I think we're setting all the dwords of MRF[1] to 'location',
+       * when the docs say only dword[2] should be set.  Hmmm.  But it works.
+       */
+      b = brw_message_reg(msg_reg_nr);
+      b = retype(b, BRW_REGISTER_TYPE_UD);
+      /*b = get_element_ud(b, 2);*/
+      if (relAddr) {
+         brw_ADD(p, b, addrReg, brw_imm_ud(location)); /* offset = addrReg + location */
+      }
+      else {
+         brw_MOV(p, b, brw_imm_ud(location)); /* offset = location */
+      }
+
+      brw_pop_insn_state(p);
+   }
+
+   {
+      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   
+      insn->header.predicate_control = BRW_PREDICATE_NONE;
+      insn->header.compression_control = BRW_COMPRESSION_NONE; 
+      insn->header.destreg__conditionalmod = msg_reg_nr;
+      insn->header.mask_control = BRW_MASK_DISABLE;
+      /*insn->header.access_mode = BRW_ALIGN_16;*/
+  
+      brw_set_dest(insn, dest);
+      brw_set_src0(insn, brw_null_reg());
+
+      brw_set_dp_read_message(p->brw,
+			      insn,
+			      bind_table_index,
+			      oword,  /* 0 = lower Oword, 1 = upper Oword */
+			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+			      0, /* source cache = data cache */
+			      1, /* msg_length */
+			      1, /* response_length (1 Oword) */
+			      0); /* eot */
+   }
+}
+
+
+
 void brw_fb_WRITE(struct brw_compile *p,
-		   struct brw_reg dest,
-		   GLuint msg_reg_nr,
-		   struct brw_reg src0,
-		   GLuint binding_table_index,
-		   GLuint msg_length,
-		   GLuint response_length,
-		   GLboolean eot)
+                  struct brw_reg dest,
+                  GLuint msg_reg_nr,
+                  struct brw_reg src0,
+                  GLuint binding_table_index,
+                  GLuint msg_length,
+                  GLuint response_length,
+                  GLboolean eot)
 {
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
    
    insn->header.predicate_control = 0; /* XXX */
    insn->header.compression_control = BRW_COMPRESSION_NONE; 
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
   
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src0);
-   brw_set_dp_write_message(insn,
+   brw_set_dp_write_message(p->brw,
+			    insn,
 			    binding_table_index,
 			    BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE, /* msg_control */
 			    BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE, /* msg_type */
@@ -958,7 +1240,11 @@ void brw_fb_WRITE(struct brw_compile *p,
 }
 
 
-
+/**
+ * Texture sample instruction.
+ * Note: the msg_type plus msg_length values determine exactly what kind
+ * of sampling operation is performed.  See volume 4, page 161 of docs.
+ */
 void brw_SAMPLE(struct brw_compile *p,
 		struct brw_reg dest,
 		GLuint msg_reg_nr,
@@ -969,12 +1255,14 @@ void brw_SAMPLE(struct brw_compile *p,
 		GLuint msg_type,
 		GLuint response_length,
 		GLuint msg_length,
-		GLboolean eot)
+		GLboolean eot,
+		GLuint header_present,
+		GLuint simd_mode)
 {
    GLboolean need_stall = 0;
    
-   if(writemask == 0) {
-/*       _mesa_printf("%s: zero writemask??\n", __FUNCTION__); */
+   if (writemask == 0) {
+      /*_mesa_printf("%s: zero writemask??\n", __FUNCTION__); */
       return;
    }
    
@@ -1006,7 +1294,7 @@ void brw_SAMPLE(struct brw_compile *p,
 
       if (newmask != writemask) {
 	 need_stall = 1;
-/* 	 _mesa_printf("need stall %x %x\n", newmask , writemask); */
+         /* _mesa_printf("need stall %x %x\n", newmask , writemask); */
       }
       else {
 	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
@@ -1034,7 +1322,7 @@ void brw_SAMPLE(struct brw_compile *p,
    
       insn->header.predicate_control = 0; /* XXX */
       insn->header.compression_control = BRW_COMPRESSION_NONE;
-      insn->header.destreg__conditonalmod = msg_reg_nr;
+      insn->header.destreg__conditionalmod = msg_reg_nr;
 
       brw_set_dest(insn, dest);
       brw_set_src0(insn, src0);
@@ -1044,17 +1332,18 @@ void brw_SAMPLE(struct brw_compile *p,
 			      msg_type,
 			      response_length, 
 			      msg_length,
-			      eot);
+			      eot,
+			      header_present,
+			      simd_mode);
    }
 
-   if (need_stall)
-   {
+   if (need_stall) {
       struct brw_reg reg = vec8(offset(dest, response_length-1));
 
       /*  mov (8) r9.0<1>:f    r9.0<8;8,1>:f    { Align1 }
        */
       brw_push_insn_state(p);
-      brw_set_compression_control(p, GL_FALSE);
+      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
       brw_MOV(p, reg, reg);	      
       brw_pop_insn_state(p);
    }
@@ -1080,15 +1369,16 @@ void brw_urb_WRITE(struct brw_compile *p,
 {
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
 
-   assert(msg_length < 16);
+   assert(msg_length < BRW_MAX_MRF);
 
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->header.destreg__conditonalmod = msg_reg_nr;
+   insn->header.destreg__conditionalmod = msg_reg_nr;
 
-   brw_set_urb_message(insn,
+   brw_set_urb_message(p->brw,
+		       insn,
 		       allocate,
 		       used,
 		       msg_length,
@@ -1099,3 +1389,37 @@ void brw_urb_WRITE(struct brw_compile *p,
 		       swizzle);
 }
 
+/**
+ * Emit a SEND instruction carrying an FF_SYNC message.
+ * Built the same way brw_urb_WRITE() builds its SEND; msg_reg_nr goes into
+ * the instruction header and the remaining arguments are forwarded
+ * unchanged to brw_set_ff_sync_message().
+ */
+void brw_ff_sync(struct brw_compile *p,
+		 struct brw_reg dest,
+		 GLuint msg_reg_nr,
+		 struct brw_reg src0,
+		 GLboolean allocate,
+		 GLboolean used,
+		 GLuint msg_length,
+		 GLuint response_length,
+		 GLboolean eot,
+		 GLboolean writes_complete,
+		 GLuint offset,
+		 GLuint swizzle)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+   /* Same message-length bound that brw_urb_WRITE() asserts. */
+   assert(msg_length < BRW_MAX_MRF);
+
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src0);
+   brw_set_src1(insn, brw_imm_d(0));
+
+   insn->header.destreg__conditionalmod = msg_reg_nr;
+
+   brw_set_ff_sync_message(p->brw, insn, allocate, used, msg_length,
+			   response_length, eot, writes_complete,
+			   offset, swizzle);
+}
diff --git a/src/mesa/drivers/dri/i965/brw_fallback.c b/src/mesa/drivers/dri/i965/brw_fallback.c
index 4ea660a51a..d27c6c24ca 100644
--- a/src/mesa/drivers/dri/i965/brw_fallback.c
+++ b/src/mesa/drivers/dri/i965/brw_fallback.c
@@ -37,6 +37,9 @@
 #include "tnl/tnl.h"
 #include "brw_context.h"
 #include "brw_fallback.h"
+#include "intel_chipset.h"
+#include "intel_fbo.h"
+#include "intel_regions.h"
 
 #include "glapi/glapi.h"
 
@@ -44,23 +47,16 @@
 
 static GLboolean do_check_fallback(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
    GLcontext *ctx = &brw->intel.ctx;
    GLuint i;
 
-   /* BRW_NEW_METAOPS
-    */
-   if (brw->metaops.active)
-      return GL_FALSE;
-
    if (brw->intel.no_rast) {
       DBG("FALLBACK: rasterization disabled\n");
       return GL_TRUE;
    }
 
    /* _NEW_RENDERMODE
-    *
-    * XXX: need to save/restore RenderMode in metaops state, or
-    * somehow move to a new attribs pointer:
     */
    if (ctx->RenderMode != GL_RENDER) {
       DBG("FALLBACK: render mode\n");
@@ -70,7 +66,7 @@ static GLboolean do_check_fallback(struct brw_context *brw)
    /* _NEW_TEXTURE:
     */
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
-      struct gl_texture_unit *texUnit = &brw->attribs.Texture->Unit[i];
+      struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
       if (texUnit->_ReallyEnabled) {
 	 struct intel_texture_object *intelObj = intel_texture_object(texUnit->_Current);
 	 struct gl_texture_image *texImage = intelObj->base.Image[0][intelObj->firstLevel];
@@ -83,12 +79,39 @@ static GLboolean do_check_fallback(struct brw_context *brw)
    
    /* _NEW_STENCIL 
     */
-   if (brw->attribs.Stencil->Enabled && 
-       !brw->intel.hw_stencil) {
+   if (ctx->Stencil._Enabled &&
+       (ctx->DrawBuffer->Name == 0 && !brw->intel.hw_stencil)) {
       DBG("FALLBACK: stencil\n");
       return GL_TRUE;
    }
 
+   /* _NEW_BUFFERS */
+   if (IS_965(intel->intelScreen->deviceID) &&
+       !IS_G4X(intel->intelScreen->deviceID)) {
+      for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
+	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+	 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+
+	 /* The original gen4 hardware couldn't set up WM surfaces pointing
+	  * at an offset within a tile, which can happen when rendering to
+	  * anything but the base level of a texture or the +X face/0 depth.
+	  * This was fixed with the 4 Series hardware.
+	  *
+	  * For these original chips, you would have to make the depth and
+	  * color destination surfaces include information on the texture
+	  * type, LOD, face, and various limits to use them as a destination.
+	  * I would have done this, but there's also a nasty requirement that
+	  * the depth and the color surfaces all be of the same LOD, which
+	  * may be a worse requirement than this alignment.  (Also, we may
+	  * want to just demote the texture to untiled, instead).
+	  */
+	 if (irb->region && irb->region->tiling != I915_TILING_NONE &&
+	     (irb->region->draw_offset & 4095)) {
+	    DBG("FALLBACK: non-tile-aligned destination for tiled FBO\n");
+	    return GL_TRUE;
+	 }
+      }
+   }
 
    return GL_FALSE;
 }
@@ -101,7 +124,7 @@ static void check_fallback(struct brw_context *brw)
 const struct brw_tracked_state brw_check_fallback = {
    .dirty = {
       .mesa = _NEW_BUFFERS | _NEW_RENDERMODE | _NEW_TEXTURE | _NEW_STENCIL,
-      .brw  = BRW_NEW_METAOPS,
+      .brw  = 0,
       .cache = 0
    },
    .prepare = check_fallback
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index a8b74a0afe..48c2b9a41c 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -54,12 +54,17 @@ static void compile_gs_prog( struct brw_context *brw,
    memset(&c, 0, sizeof(c));
    
    c.key = *key;
-
+   c.need_ff_sync = BRW_IS_IGDNG(brw);
    /* Need to locate the two positions present in vertex + header.
     * These are currently hardcoded:
     */
    c.nr_attrs = brw_count_bits(c.key.attrs);
-   c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+
+   if (BRW_IS_IGDNG(brw))
+       c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
+   else
+       c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+
    c.nr_bytes = c.nr_regs * REG_SIZE;
 
    
diff --git a/src/mesa/drivers/dri/i965/brw_gs.h b/src/mesa/drivers/dri/i965/brw_gs.h
index 18a4537c32..bbb991ea2e 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.h
+++ b/src/mesa/drivers/dri/i965/brw_gs.h
@@ -62,6 +62,7 @@ struct brw_gs_compile {
    GLuint nr_attrs;
    GLuint nr_regs;
    GLuint nr_bytes;
+   GLboolean need_ff_sync;
 };
 
 #define ATTR_SIZE  (4*4)
diff --git a/src/mesa/drivers/dri/i965/brw_gs_emit.c b/src/mesa/drivers/dri/i965/brw_gs_emit.c
index 22e0d25c2e..a9b2aa2eac 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_emit.c
@@ -101,6 +101,23 @@ static void brw_gs_emit_vue(struct brw_gs_compile *c,
 		 BRW_URB_SWIZZLE_NONE);
 }
 
+/**
+ * Store num_prim in element 1 of R0, then send an FF_SYNC message that
+ * uses R0 as both source and destination.
+ */
+static void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim)
+{
+   struct brw_compile *p = &c->func;
+   brw_MOV(p, get_element_ud(c->reg.R0, 1), brw_imm_ud(num_prim));
+   brw_ff_sync(p,
+               c->reg.R0,              /* dest */
+               0,                      /* msg_reg_nr */
+               c->reg.R0,              /* src0 */
+               1, 1,                   /* allocate, used */
+               1, 1,                   /* msg_length, response_length */
+               0, 1, 0,                /* eot, writes_complete, offset */
+               BRW_URB_SWIZZLE_NONE);
+}
 
 
 void brw_gs_quads( struct brw_gs_compile *c )
@@ -110,6 +127,8 @@ void brw_gs_quads( struct brw_gs_compile *c )
    /* Use polygons for correct edgeflag behaviour. Note that vertex 3
     * is the PV for quads, but vertex 0 for polygons:
     */
+   if (c->need_ff_sync)
+	   brw_gs_ff_sync(c, 1);    
    brw_gs_emit_vue(c, c->reg.vertex[3], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
    brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));
    brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2)); 
@@ -120,6 +139,8 @@ void brw_gs_quad_strip( struct brw_gs_compile *c )
 {
    brw_gs_alloc_regs(c, 4);
    
+   if (c->need_ff_sync)
+	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[2], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
    brw_gs_emit_vue(c, c->reg.vertex[3], 0, (_3DPRIM_POLYGON << 2));
    brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2)); 
@@ -129,6 +150,9 @@ void brw_gs_quad_strip( struct brw_gs_compile *c )
 void brw_gs_tris( struct brw_gs_compile *c )
 {
    brw_gs_alloc_regs(c, 3);
+
+   if (c->need_ff_sync)
+	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_TRILIST << 2) | R02_PRIM_START));
    brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_TRILIST << 2));
    brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_TRILIST << 2) | R02_PRIM_END));
@@ -137,6 +161,9 @@ void brw_gs_tris( struct brw_gs_compile *c )
 void brw_gs_lines( struct brw_gs_compile *c )
 {
    brw_gs_alloc_regs(c, 2);
+
+   if (c->need_ff_sync)
+	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_START));
    brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_END));
 }
@@ -144,6 +171,9 @@ void brw_gs_lines( struct brw_gs_compile *c )
 void brw_gs_points( struct brw_gs_compile *c )
 {
    brw_gs_alloc_regs(c, 1);
+
+   if (c->need_ff_sync)
+	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[0], 1, ((_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END));
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_gs_state.c b/src/mesa/drivers/dri/i965/brw_gs_state.c
index 27023cf034..ed9d2ffe60 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_state.c
@@ -93,7 +93,13 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
    gs.thread4.nr_urb_entries = key->nr_urb_entries;
    gs.thread4.urb_entry_allocation_size = key->urb_size - 1;
 
-   gs.thread4.max_threads = 0; /* Hardware requirement */
+   if (key->nr_urb_entries >= 8)
+      gs.thread4.max_threads = 1;
+   else
+      gs.thread4.max_threads = 0;
+
+   if (BRW_IS_IGDNG(brw))
+      gs.thread4.rendering_enable = 1;
 
    if (INTEL_DEBUG & DEBUG_STATS)
       gs.thread4.stats_enable = 1;
diff --git a/src/mesa/drivers/dri/i965/brw_metaops.c b/src/mesa/drivers/dri/i965/brw_metaops.c
deleted file mode 100644
index 41bfa2e256..0000000000
--- a/src/mesa/drivers/dri/i965/brw_metaops.c
+++ /dev/null
@@ -1,583 +0,0 @@
-/*
- Copyright (C) Intel Corp.  2006.  All Rights Reserved.
- Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
- develop this 3D driver.
- 
- Permission is hereby granted, free of charge, to any person obtaining
- a copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sublicense, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- 
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial
- portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- 
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  *   frame buffer texture by Gary Wong <gtw@gnu.org>
-  */
- 
-
-
-#include "main/glheader.h"
-#include "main/context.h"
-#include "main/macros.h"
-
-#include "shader/arbprogparse.h"
-
-#include "intel_screen.h"
-#include "intel_batchbuffer.h"
-#include "intel_regions.h"
-#include "intel_buffers.h"
-
-#include "brw_context.h"
-#include "brw_defines.h"
-#include "brw_draw.h"
-#include "brw_fallback.h"
-
-#define INIT(brw, STRUCT, ATTRIB) 		\
-do {						\
-   brw->attribs.ATTRIB = &ctx->ATTRIB;		\
-} while (0)
-
-#define DUP(brw, STRUCT, ATTRIB) 		\
-do {						\
-   brw->metaops.attribs.ATTRIB = MALLOC_STRUCT(STRUCT);	\
-   memcpy(brw->metaops.attribs.ATTRIB, 			\
-	  brw->attribs.ATTRIB,			\
-	  sizeof(struct STRUCT));		\
-} while (0)
-
-
-#define INSTALL(brw, ATTRIB, STATE)		\
-do {						\
-   brw->attribs.ATTRIB = brw->metaops.attribs.ATTRIB;	\
-   brw->state.dirty.mesa |= STATE;		\
-} while (0)
-
-#define RESTORE(brw, ATTRIB, STATE)			\
-do {							\
-   brw->attribs.ATTRIB = &brw->intel.ctx.ATTRIB;	\
-   brw->state.dirty.mesa |= STATE;			\
-} while (0)
-
-static void init_attribs( struct brw_context *brw )
-{
-   DUP(brw, gl_colorbuffer_attrib, Color);
-   DUP(brw, gl_depthbuffer_attrib, Depth);
-   DUP(brw, gl_fog_attrib, Fog);
-   DUP(brw, gl_hint_attrib, Hint);
-   DUP(brw, gl_light_attrib, Light);
-   DUP(brw, gl_line_attrib, Line);
-   DUP(brw, gl_point_attrib, Point);
-   DUP(brw, gl_polygon_attrib, Polygon);
-   DUP(brw, gl_scissor_attrib, Scissor);
-   DUP(brw, gl_stencil_attrib, Stencil);
-   DUP(brw, gl_texture_attrib, Texture);
-   DUP(brw, gl_transform_attrib, Transform);
-   DUP(brw, gl_viewport_attrib, Viewport);
-   DUP(brw, gl_vertex_program_state, VertexProgram);
-   DUP(brw, gl_fragment_program_state, FragmentProgram);
-}
-
-static void install_attribs( struct brw_context *brw )
-{
-   INSTALL(brw, Color, _NEW_COLOR);
-   INSTALL(brw, Depth, _NEW_DEPTH);
-   INSTALL(brw, Fog, _NEW_FOG);
-   INSTALL(brw, Hint, _NEW_HINT);
-   INSTALL(brw, Light, _NEW_LIGHT);
-   INSTALL(brw, Line, _NEW_LINE);
-   INSTALL(brw, Point, _NEW_POINT);
-   INSTALL(brw, Polygon, _NEW_POLYGON);
-   INSTALL(brw, Scissor, _NEW_SCISSOR);
-   INSTALL(brw, Stencil, _NEW_STENCIL);
-   INSTALL(brw, Texture, _NEW_TEXTURE);
-   INSTALL(brw, Transform, _NEW_TRANSFORM);
-   INSTALL(brw, Viewport, _NEW_VIEWPORT);
-   INSTALL(brw, VertexProgram, _NEW_PROGRAM);
-   INSTALL(brw, FragmentProgram, _NEW_PROGRAM);
-}
-
-static void restore_attribs( struct brw_context *brw )
-{
-   RESTORE(brw, Color, _NEW_COLOR);
-   RESTORE(brw, Depth, _NEW_DEPTH);
-   RESTORE(brw, Fog, _NEW_FOG);
-   RESTORE(brw, Hint, _NEW_HINT);
-   RESTORE(brw, Light, _NEW_LIGHT);
-   RESTORE(brw, Line, _NEW_LINE);
-   RESTORE(brw, Point, _NEW_POINT);
-   RESTORE(brw, Polygon, _NEW_POLYGON);
-   RESTORE(brw, Scissor, _NEW_SCISSOR);
-   RESTORE(brw, Stencil, _NEW_STENCIL);
-   RESTORE(brw, Texture, _NEW_TEXTURE);
-   RESTORE(brw, Transform, _NEW_TRANSFORM);
-   RESTORE(brw, Viewport, _NEW_VIEWPORT);
-   RESTORE(brw, VertexProgram, _NEW_PROGRAM);
-   RESTORE(brw, FragmentProgram, _NEW_PROGRAM);
-}
-
-
-static const char *vp_prog =
-      "!!ARBvp1.0\n"
-      "MOV  result.color, vertex.color;\n"
-      "MOV  result.position, vertex.position;\n"
-      "END\n";
-
-static const char *fp_prog =
-      "!!ARBfp1.0\n"
-      "MOV result.color, fragment.color;\n"
-      "END\n";
-
-static const char *fp_tex_prog =
-      "!!ARBfp1.0\n"
-      "TEMP a;\n"
-      "ADD a, fragment.position, program.local[0];\n"
-      "MUL a, a, program.local[1];\n"
-      "TEX result.color, a, texture[0], 2D;\n"
-      "MOV result.depth.z, fragment.position;\n"
-      "END\n";
-
-/* Derived values of importance:
- *
- *   FragmentProgram->_Current
- *   VertexProgram->_Enabled
- *   brw->vertex_program
- *   DrawBuffer->_ColorDrawBufferIndexes[0]
- * 
- *
- * More if drawpixels-through-texture is added.  
- */
-static void init_metaops_state( struct brw_context *brw )
-{
-   GLcontext *ctx = &brw->intel.ctx;
-
-   brw->metaops.vbo = ctx->Driver.NewBufferObject(ctx, 1, GL_ARRAY_BUFFER_ARB);
-
-   ctx->Driver.BufferData(ctx,
-			  GL_ARRAY_BUFFER_ARB,
-			  4096,
-			  NULL,
-			  GL_DYNAMIC_DRAW_ARB,
-			  brw->metaops.vbo);
-
-   brw->metaops.fp = (struct gl_fragment_program *)
-      ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 1 );
-
-   brw->metaops.fp_tex = (struct gl_fragment_program *)
-      ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 1 );
-
-   brw->metaops.vp = (struct gl_vertex_program *)
-      ctx->Driver.NewProgram(ctx, GL_VERTEX_PROGRAM_ARB, 1 );
-
-   _mesa_parse_arb_fragment_program(ctx, GL_FRAGMENT_PROGRAM_ARB, 
-				    fp_prog, strlen(fp_prog),
-				    brw->metaops.fp);
-
-   _mesa_parse_arb_fragment_program(ctx, GL_FRAGMENT_PROGRAM_ARB, 
-				    fp_tex_prog, strlen(fp_tex_prog),
-				    brw->metaops.fp_tex);
-
-   _mesa_parse_arb_vertex_program(ctx, GL_VERTEX_PROGRAM_ARB, 
-				  vp_prog, strlen(vp_prog),
-				  brw->metaops.vp);
-
-   brw->metaops.attribs.VertexProgram->_Current = brw->metaops.vp;
-   brw->metaops.attribs.VertexProgram->_Enabled = GL_TRUE;
-
-   brw->metaops.attribs.FragmentProgram->_Current = brw->metaops.fp;
-}
-
-static void meta_flat_shade( struct intel_context *intel )
-{
-   struct brw_context *brw = brw_context(&intel->ctx);
-
-   brw->metaops.attribs.Light->ShadeModel = GL_FLAT;
-   brw->state.dirty.mesa |= _NEW_LIGHT;
-}
-
-
-static void meta_no_stencil_write( struct intel_context *intel )
-{
-   struct brw_context *brw = brw_context(&intel->ctx);
-
-   brw->metaops.attribs.Stencil->Enabled = GL_FALSE;
-   brw->metaops.attribs.Stencil->WriteMask[0] = GL_FALSE; 
-   brw->state.dirty.mesa |= _NEW_STENCIL;
-}
-
-static void meta_no_depth_write( struct intel_context *intel )
-{
-   struct brw_context *brw = brw_context(&intel->ctx);
-
-   brw->metaops.attribs.Depth->Test = GL_FALSE;
-   brw->metaops.attribs.Depth->Mask = GL_FALSE;
-   brw->state.dirty.mesa |= _NEW_DEPTH;
-}
-
-
-static void meta_depth_replace( struct intel_context *intel )
-{
-   struct brw_context *brw = brw_context(&intel->ctx);
-
-   /* ctx->Driver.Enable( ctx, GL_DEPTH_TEST, GL_TRUE )
-    * ctx->Driver.DepthMask( ctx, GL_TRUE )
-    */
-   brw->metaops.attribs.Depth->Test = GL_TRUE;
-   brw->metaops.attribs.Depth->Mask = GL_TRUE;
-   brw->state.dirty.mesa |= _NEW_DEPTH;
-
-   /* ctx->Driver.DepthFunc( ctx, GL_ALWAYS )
-    */
-   brw->metaops.attribs.Depth->Func = GL_ALWAYS;
-
-   brw->state.dirty.mesa |= _NEW_DEPTH;
-}
-
-
-static void meta_stencil_replace( struct intel_context *intel,
-				 GLuint s_mask,
-				 GLuint s_clear)
-{
-   struct brw_context *brw = brw_context(&intel->ctx);
-
-   brw->metaops.attribs.Stencil->Enabled = GL_TRUE;
-   brw->metaops.attribs.Stencil->WriteMask[0] = s_mask;
-   brw->metaops.attribs.Stencil->ValueMask[0] = 0xff;
-   brw->metaops.attribs.Stencil->Ref[0] = s_clear;
-   brw->metaops.attribs.Stencil->Function[0] = GL_ALWAYS;
-   brw->metaops.attribs.Stencil->FailFunc[0] = GL_REPLACE;
-   brw->metaops.attribs.Stencil->ZPassFunc[0] = GL_REPLACE;
-   brw->metaops.attribs.Stencil->ZFailFunc[0] = GL_REPLACE;
-   brw->state.dirty.mesa |= _NEW_STENCIL;
-}
-
-
-static void meta_color_mask( struct intel_context *intel, GLboolean state )
-{
-   struct brw_context *brw = brw_context(&intel->ctx);
-
-   if (state)
-      COPY_4V(brw->metaops.attribs.Color->ColorMask, 
-	      brw->intel.ctx.Color.ColorMask); 
-   else
-      ASSIGN_4V(brw->metaops.attribs.Color->ColorMask, 0, 0, 0, 0);
-
-   brw->state.dirty.mesa |= _NEW_COLOR;
-}
-
-static void meta_no_texture( struct intel_context *intel )
-{
-   struct brw_context *brw = brw_context(&intel->ctx);
-   
-   brw->metaops.attribs.FragmentProgram->_Current = brw->metaops.fp;
-   
-   brw->metaops.attribs.Texture->CurrentUnit = 0;
-   brw->metaops.attribs.Texture->_EnabledUnits = 0;
-   brw->metaops.attribs.Texture->_EnabledCoordUnits = 0;
-   brw->metaops.attribs.Texture->Unit[ 0 ].Enabled = 0;
-   brw->metaops.attribs.Texture->Unit[ 0 ]._ReallyEnabled = 0;
-
-   brw->state.dirty.mesa |= _NEW_TEXTURE | _NEW_PROGRAM;
-}
-
-static void meta_texture_blend_replace(struct intel_context *intel)
-{
-   struct brw_context *brw = brw_context(&intel->ctx);
-
-   brw->metaops.attribs.Texture->CurrentUnit = 0;
-   brw->metaops.attribs.Texture->_EnabledUnits = 1;
-   brw->metaops.attribs.Texture->_EnabledCoordUnits = 1;
-   brw->metaops.attribs.Texture->Unit[ 0 ].Enabled = TEXTURE_2D_BIT;
-   brw->metaops.attribs.Texture->Unit[ 0 ]._ReallyEnabled = TEXTURE_2D_BIT;
-   brw->metaops.attribs.Texture->Unit[ 0 ].Current2D =
-      intel->frame_buffer_texobj;
-   brw->metaops.attribs.Texture->Unit[ 0 ]._Current =
-      intel->frame_buffer_texobj;
-
-   brw->state.dirty.mesa |= _NEW_TEXTURE | _NEW_PROGRAM;
-}
-
-static void meta_import_pixel_state(struct intel_context *intel)
-{
-   struct brw_context *brw = brw_context(&intel->ctx);
-   
-   RESTORE(brw, Color, _NEW_COLOR);
-   RESTORE(brw, Depth, _NEW_DEPTH);
-   RESTORE(brw, Fog, _NEW_FOG);
-   RESTORE(brw, Scissor, _NEW_SCISSOR);
-   RESTORE(brw, Stencil, _NEW_STENCIL);
-   RESTORE(brw, Texture, _NEW_TEXTURE);
-   RESTORE(brw, FragmentProgram, _NEW_PROGRAM);
-}
-
-static void meta_frame_buffer_texture( struct intel_context *intel,
-				       GLint xoff, GLint yoff )
-{
-   struct brw_context *brw = brw_context(&intel->ctx);
-   struct intel_region *region = intel_drawbuf_region( intel );
-   
-   INSTALL(brw, FragmentProgram, _NEW_PROGRAM);
-
-   brw->metaops.attribs.FragmentProgram->_Current = brw->metaops.fp_tex;
-   /* This is unfortunate, but seems to be necessary, since later on we
-      will end up calling _mesa_load_state_parameters to lookup the
-      local params (below), and that will want to look in ctx.FragmentProgram
-      instead of brw->attribs.FragmentProgram. */
-   intel->ctx.FragmentProgram.Current = brw->metaops.fp_tex;
-
-   brw->metaops.fp_tex->Base.LocalParams[ 0 ][ 0 ] = xoff;
-   brw->metaops.fp_tex->Base.LocalParams[ 0 ][ 1 ] = yoff;
-   brw->metaops.fp_tex->Base.LocalParams[ 0 ][ 2 ] = 0.0;
-   brw->metaops.fp_tex->Base.LocalParams[ 0 ][ 3 ] = 0.0;
-   brw->metaops.fp_tex->Base.LocalParams[ 1 ][ 0 ] =
-      1.0 / region->pitch;
-   brw->metaops.fp_tex->Base.LocalParams[ 1 ][ 1 ] =
-      -1.0 / region->height;
-   brw->metaops.fp_tex->Base.LocalParams[ 1 ][ 2 ] = 0.0;
-   brw->metaops.fp_tex->Base.LocalParams[ 1 ][ 3 ] = 1.0;
-   
-   brw->state.dirty.mesa |= _NEW_PROGRAM;
-}
-
-
-static void meta_draw_region( struct intel_context *intel,
-			     struct intel_region *draw_region,
-			     struct intel_region *depth_region )
-{
-   struct brw_context *brw = brw_context(&intel->ctx);
-
-   if (!brw->metaops.saved_draw_region) {
-      brw->metaops.saved_draw_region = brw->state.draw_regions[0];
-      brw->metaops.saved_nr_draw_regions = brw->state.nr_draw_regions;
-      brw->metaops.saved_depth_region = brw->state.depth_region;
-   }
-
-   brw->state.draw_regions[0] = draw_region;
-   brw->state.nr_draw_regions = 1;
-   brw->state.depth_region = depth_region;
-
-   if (intel->frame_buffer_texobj != NULL)
-      brw_FrameBufferTexDestroy(brw);
-
-   if (draw_region)
-       brw_FrameBufferTexInit(brw, draw_region);
-
-   brw->state.dirty.mesa |= _NEW_BUFFERS;
-}
-
-
-static void meta_draw_quad(struct intel_context *intel, 
-			   GLfloat x0, GLfloat x1,
-			   GLfloat y0, GLfloat y1, 
-			   GLfloat z,
-			   GLuint color,
-			   GLfloat s0, GLfloat s1,
-			   GLfloat t0, GLfloat t1)
-{
-   GLcontext *ctx = &intel->ctx;
-   struct brw_context *brw = brw_context(&intel->ctx);
-   struct gl_client_array pos_array;
-   struct gl_client_array color_array;
-   struct gl_client_array *attribs[VERT_ATTRIB_MAX];
-   struct _mesa_prim prim[1];
-   GLfloat pos[4][3];
-
-   ctx->Driver.BufferData(ctx,
-			  GL_ARRAY_BUFFER_ARB,
-			  sizeof(pos) + sizeof(color),
-			  NULL,
-			  GL_DYNAMIC_DRAW_ARB,
-			  brw->metaops.vbo);
-
-   pos[0][0] = x0;
-   pos[0][1] = y0;
-   pos[0][2] = z;
-
-   pos[1][0] = x1;
-   pos[1][1] = y0;
-   pos[1][2] = z;
-
-   pos[2][0] = x1;
-   pos[2][1] = y1;
-   pos[2][2] = z;
-
-   pos[3][0] = x0;
-   pos[3][1] = y1;
-   pos[3][2] = z;
-
-   ctx->Driver.BufferSubData(ctx,
-			     GL_ARRAY_BUFFER_ARB,
-			     0,
-			     sizeof(pos),
-			     pos,
-			     brw->metaops.vbo);
-
-   /* Convert incoming ARGB to required RGBA */
-   /* Note this color is stored as GL_UNSIGNED_BYTE */
-   color = (color & 0xff00ff00) | (((color >> 16) | (color << 16)) & 0xff00ff);
-
-   ctx->Driver.BufferSubData(ctx,
-			     GL_ARRAY_BUFFER_ARB,
-			     sizeof(pos),
-			     sizeof(color),
-			     &color,
-			     brw->metaops.vbo);
-
-   /* Ignoring texture coords. 
-    */
-
-   memset(attribs, 0, VERT_ATTRIB_MAX * sizeof(*attribs));
-
-   attribs[VERT_ATTRIB_POS] = &pos_array;
-   attribs[VERT_ATTRIB_POS]->Ptr = 0;
-   attribs[VERT_ATTRIB_POS]->Type = GL_FLOAT;
-   attribs[VERT_ATTRIB_POS]->Enabled = 1;
-   attribs[VERT_ATTRIB_POS]->Size = 3;
-   attribs[VERT_ATTRIB_POS]->StrideB = 3 * sizeof(GLfloat);
-   attribs[VERT_ATTRIB_POS]->Stride = 3 * sizeof(GLfloat);
-   attribs[VERT_ATTRIB_POS]->_MaxElement = 4;
-   attribs[VERT_ATTRIB_POS]->Normalized = 0;
-   attribs[VERT_ATTRIB_POS]->BufferObj = brw->metaops.vbo;
-
-   attribs[VERT_ATTRIB_COLOR0] = &color_array;
-   attribs[VERT_ATTRIB_COLOR0]->Ptr = (const GLubyte *)sizeof(pos);
-   attribs[VERT_ATTRIB_COLOR0]->Type = GL_UNSIGNED_BYTE;
-   attribs[VERT_ATTRIB_COLOR0]->Enabled = 1;
-   attribs[VERT_ATTRIB_COLOR0]->Size = 4;
-   attribs[VERT_ATTRIB_COLOR0]->StrideB = 0;
-   attribs[VERT_ATTRIB_COLOR0]->Stride = 0;
-   attribs[VERT_ATTRIB_COLOR0]->_MaxElement = 1;
-   attribs[VERT_ATTRIB_COLOR0]->Normalized = 1;
-   attribs[VERT_ATTRIB_COLOR0]->BufferObj = brw->metaops.vbo;
-   
-   /* Just ignoring texture coordinates for now. 
-    */
-
-   memset(prim, 0, sizeof(*prim));
-
-   prim[0].mode = GL_TRIANGLE_FAN;
-   prim[0].begin = 1;
-   prim[0].end = 1;
-   prim[0].weak = 0;
-   prim[0].pad = 0;
-   prim[0].start = 0;
-   prim[0].count = 4;
-
-   brw_draw_prims(&brw->intel.ctx, 
-		  (const struct gl_client_array **)attribs,
-		  prim, 1,
-		  NULL,
-		  0,
-		  3 );
-}
-
-
-static void install_meta_state( struct intel_context *intel )
-{
-   GLcontext *ctx = &intel->ctx;
-   struct brw_context *brw = brw_context(ctx);
-   GLuint i;
-
-   if (!brw->metaops.vbo) {
-      init_metaops_state(brw);
-   }
-
-   install_attribs(brw);
-   
-   meta_no_texture(&brw->intel);
-   meta_flat_shade(&brw->intel);
-   for (i = 0; i < ctx->Const.MaxDrawBuffers; i++) {
-      brw->metaops.restore_draw_buffers[i]
-         = ctx->DrawBuffer->_ColorDrawBufferIndexes[i];
-   }
-   brw->metaops.restore_num_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers;
-
-   brw->metaops.restore_fp = ctx->FragmentProgram.Current;
-
-   /* This works without adjusting refcounts.  Fix later? 
-    */
-   brw->metaops.saved_draw_region = brw->state.draw_regions[0];
-   brw->metaops.saved_nr_draw_regions = brw->state.nr_draw_regions;
-   brw->metaops.saved_depth_region = brw->state.depth_region;
-   brw->metaops.active = 1;
-   
-   brw->state.dirty.brw |= BRW_NEW_METAOPS;
-}
-
-static void leave_meta_state( struct intel_context *intel )
-{
-   GLcontext *ctx = &intel->ctx;
-   struct brw_context *brw = brw_context(ctx);
-   GLuint i;
-
-   restore_attribs(brw);
-
-   for (i = 0; i < ctx->Const.MaxDrawBuffers; i++) {
-      ctx->DrawBuffer->_ColorDrawBufferIndexes[i]
-         = brw->metaops.restore_draw_buffers[i];
-   }
-   ctx->DrawBuffer->_NumColorDrawBuffers = brw->metaops.restore_num_draw_buffers;
-
-   ctx->FragmentProgram.Current = brw->metaops.restore_fp;
-
-   brw->state.draw_regions[0] = brw->metaops.saved_draw_region;
-   brw->state.nr_draw_regions = brw->metaops.saved_nr_draw_regions;
-   brw->state.depth_region = brw->metaops.saved_depth_region;
-   brw->metaops.saved_draw_region = NULL;
-   brw->metaops.saved_depth_region = NULL;
-   brw->metaops.active = 0;
-
-   brw->state.dirty.mesa |= _NEW_BUFFERS;
-   brw->state.dirty.brw |= BRW_NEW_METAOPS;
-}
-
-
-
-void brw_init_metaops( struct brw_context *brw )
-{
-   init_attribs(brw);
-
-
-   brw->intel.vtbl.install_meta_state = install_meta_state;
-   brw->intel.vtbl.leave_meta_state = leave_meta_state;
-   brw->intel.vtbl.meta_no_depth_write = meta_no_depth_write;
-   brw->intel.vtbl.meta_no_stencil_write = meta_no_stencil_write;
-   brw->intel.vtbl.meta_stencil_replace = meta_stencil_replace;
-   brw->intel.vtbl.meta_depth_replace = meta_depth_replace;
-   brw->intel.vtbl.meta_color_mask = meta_color_mask;
-   brw->intel.vtbl.meta_no_texture = meta_no_texture;
-   brw->intel.vtbl.meta_import_pixel_state = meta_import_pixel_state;
-   brw->intel.vtbl.meta_frame_buffer_texture = meta_frame_buffer_texture;
-   brw->intel.vtbl.meta_draw_region = meta_draw_region;
-   brw->intel.vtbl.meta_draw_quad = meta_draw_quad;
-   brw->intel.vtbl.meta_texture_blend_replace = meta_texture_blend_replace;
-/*    brw->intel.vtbl.meta_tex_rect_source = meta_tex_rect_source; */
-/*    brw->intel.vtbl.meta_draw_format = set_draw_format; */
-}
-
-void brw_destroy_metaops( struct brw_context *brw )
-{
-   GLcontext *ctx = &brw->intel.ctx;
-
-   if (brw->metaops.vbo)
-      ctx->Driver.DeleteBuffer( ctx, brw->metaops.vbo );
-
-/*    ctx->Driver.DeleteProgram( ctx, brw->metaops.fp ); */
-/*    ctx->Driver.DeleteProgram( ctx, brw->metaops.fp_tex ); */
-/*    ctx->Driver.DeleteProgram( ctx, brw->metaops.vp ); */
-}
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 627705fa9b..ea71857548 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -48,15 +48,16 @@
 
 static void upload_blend_constant_color(struct brw_context *brw)
 {
+   GLcontext *ctx = &brw->intel.ctx;
    struct brw_blend_constant_color bcc;
 
    memset(&bcc, 0, sizeof(bcc));      
    bcc.header.opcode = CMD_BLEND_CONSTANT_COLOR;
    bcc.header.length = sizeof(bcc)/4-2;
-   bcc.blend_constant_color[0] = brw->attribs.Color->BlendColor[0];
-   bcc.blend_constant_color[1] = brw->attribs.Color->BlendColor[1];
-   bcc.blend_constant_color[2] = brw->attribs.Color->BlendColor[2];
-   bcc.blend_constant_color[3] = brw->attribs.Color->BlendColor[3];
+   bcc.blend_constant_color[0] = ctx->Color.BlendColor[0];
+   bcc.blend_constant_color[1] = ctx->Color.BlendColor[1];
+   bcc.blend_constant_color[2] = ctx->Color.BlendColor[2];
+   bcc.blend_constant_color[3] = ctx->Color.BlendColor[3];
 
    BRW_CACHED_BATCH_STRUCT(brw, &bcc);
 }
@@ -100,6 +101,7 @@ const struct brw_tracked_state brw_drawing_rect = {
 
 static void prepare_binding_table_pointers(struct brw_context *brw)
 {
+   brw_add_validated_bo(brw, brw->vs.bind_bo);
    brw_add_validated_bo(brw, brw->wm.bind_bo);
 }
 
@@ -116,13 +118,14 @@ static void upload_binding_table_pointers(struct brw_context *brw)
 
    BEGIN_BATCH(6, IGNORE_CLIPRECTS);
    OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
-   OUT_BATCH(0); /* vs */
+   if (brw->vs.bind_bo != NULL)
+      OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */
+   else
+      OUT_BATCH(0);
    OUT_BATCH(0); /* gs */
    OUT_BATCH(0); /* clip */
    OUT_BATCH(0); /* sf */
-   OUT_RELOC(brw->wm.bind_bo,
-	     I915_GEM_DOMAIN_SAMPLER, 0,
-	     0);
+   OUT_RELOC(brw->wm.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* wm/ps */
    ADVANCE_BATCH();
 }
 
@@ -154,10 +157,7 @@ static void upload_pipelined_state_pointers(struct brw_context *brw )
       OUT_RELOC(brw->gs.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
    else
       OUT_BATCH(0);
-   if (!brw->metaops.active)
-      OUT_RELOC(brw->clip.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
-   else
-      OUT_BATCH(0);
+   OUT_RELOC(brw->clip.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1);
    OUT_RELOC(brw->sf.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
    OUT_RELOC(brw->wm.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
    OUT_RELOC(brw->cc.state_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
@@ -172,6 +172,7 @@ static void prepare_psp_urb_cbs(struct brw_context *brw)
    brw_add_validated_bo(brw, brw->vs.state_bo);
    brw_add_validated_bo(brw, brw->gs.state_bo);
    brw_add_validated_bo(brw, brw->clip.state_bo);
+   brw_add_validated_bo(brw, brw->sf.state_bo);
    brw_add_validated_bo(brw, brw->wm.state_bo);
    brw_add_validated_bo(brw, brw->cc.state_bo);
 }
@@ -180,13 +181,13 @@ static void upload_psp_urb_cbs(struct brw_context *brw )
 {
    upload_pipelined_state_pointers(brw);
    brw_upload_urb_fence(brw);
-   brw_upload_constant_buffer_state(brw);
+   brw_upload_cs_urb_state(brw);
 }
 
 const struct brw_tracked_state brw_psp_urb_cbs = {
    .dirty = {
       .mesa = 0,
-      .brw = BRW_NEW_URB_FENCE | BRW_NEW_METAOPS | BRW_NEW_BATCH,
+      .brw = BRW_NEW_URB_FENCE | BRW_NEW_BATCH,
       .cache = (CACHE_NEW_VS_UNIT | 
 		CACHE_NEW_GS_UNIT | 
 		CACHE_NEW_GS_PROG | 
@@ -211,7 +212,7 @@ static void emit_depthbuffer(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    struct intel_region *region = brw->state.depth_region;
-   unsigned int len = BRW_IS_G4X(brw) ? 6 : 5;
+   unsigned int len = (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? 6 : 5;
 
    if (region == NULL) {
       BEGIN_BATCH(len, IGNORE_CLIPRECTS);
@@ -222,7 +223,7 @@ static void emit_depthbuffer(struct brw_context *brw)
       OUT_BATCH(0);
       OUT_BATCH(0);
 
-      if (BRW_IS_G4X(brw))
+      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
          OUT_BATCH(0);
 
       ADVANCE_BATCH();
@@ -244,6 +245,8 @@ static void emit_depthbuffer(struct brw_context *brw)
 	 return;
       }
 
+      assert(region->tiling != I915_TILING_X);
+
       BEGIN_BATCH(len, IGNORE_CLIPRECTS);
       OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
       OUT_BATCH(((region->pitch * region->cpp) - 1) |
@@ -259,7 +262,7 @@ static void emit_depthbuffer(struct brw_context *brw)
 		((region->height - 1) << 19));
       OUT_BATCH(0);
 
-      if (BRW_IS_G4X(brw))
+      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
          OUT_BATCH(0);
 
       ADVANCE_BATCH();
@@ -284,6 +287,7 @@ const struct brw_tracked_state brw_depthbuffer = {
 
 static void upload_polygon_stipple(struct brw_context *brw)
 {
+   GLcontext *ctx = &brw->intel.ctx;
    struct brw_polygon_stipple bps;
    GLuint i;
 
@@ -291,8 +295,21 @@ static void upload_polygon_stipple(struct brw_context *brw)
    bps.header.opcode = CMD_POLY_STIPPLE_PATTERN;
    bps.header.length = sizeof(bps)/4-2;
 
-   for (i = 0; i < 32; i++)
-      bps.stipple[i] = brw->attribs.PolygonStipple[31 - i]; /* invert */
+   /* Polygon stipple is provided in OpenGL order, i.e. bottom
+    * row first.  If we're rendering to a window (i.e. the
+    * default frame buffer object, 0), then we need to invert
+    * it to match our pixel layout.  But if we're rendering
+    * to a FBO (i.e. any named frame buffer object), we *don't*
+    * need to invert - we already match the layout.
+    */
+   if (ctx->DrawBuffer->Name == 0) {
+      for (i = 0; i < 32; i++)
+         bps.stipple[i] = ctx->PolygonStipple[31 - i]; /* invert */
+   }
+   else {
+      for (i = 0; i < 32; i++)
+         bps.stipple[i] = ctx->PolygonStipple[i]; /* don't invert */
+   }
 
    BRW_CACHED_BATCH_STRUCT(brw, &bps);
 }
@@ -320,8 +337,22 @@ static void upload_polygon_stipple_offset(struct brw_context *brw)
    bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET;
    bpso.header.length = sizeof(bpso)/4-2;
 
-   bpso.bits0.x_offset = (32 - (dPriv->x & 31)) & 31;
-   bpso.bits0.y_offset = (32 - ((dPriv->y + dPriv->h) & 31)) & 31;
+   /* If we're drawing to a system window (ctx->DrawBuffer->Name == 0),
+    * we have to invert the Y axis in order to match the OpenGL
+    * pixel coordinate system, and our offset must be matched
+    * to the window position.  If we're drawing to a FBO
+    * (ctx->DrawBuffer->Name != 0), then our native pixel coordinate
+    * system works just fine, and there's no window system to
+    * worry about.
+    */
+   if (brw->intel.ctx.DrawBuffer->Name == 0) {
+      bpso.bits0.x_offset = (32 - (dPriv->x & 31)) & 31;
+      bpso.bits0.y_offset = (32 - ((dPriv->y + dPriv->h) & 31)) & 31;
+   }
+   else {
+      bpso.bits0.y_offset = 0;
+      bpso.bits0.x_offset = 0;
+   }
 
    BRW_CACHED_BATCH_STRUCT(brw, &bpso);
 }
@@ -344,7 +375,7 @@ static void upload_aa_line_parameters(struct brw_context *brw)
 {
    struct brw_aa_line_parameters balp;
    
-   if (!BRW_IS_G4X(brw))
+   if (BRW_IS_965(brw))
       return;
 
    /* use legacy aa line coverage computation */
@@ -370,6 +401,7 @@ const struct brw_tracked_state brw_aa_line_parameters = {
 
 static void upload_line_stipple(struct brw_context *brw)
 {
+   GLcontext *ctx = &brw->intel.ctx;
    struct brw_line_stipple bls;
    GLfloat tmp;
    GLint tmpi;
@@ -378,10 +410,10 @@ static void upload_line_stipple(struct brw_context *brw)
    bls.header.opcode = CMD_LINE_STIPPLE_PATTERN;
    bls.header.length = sizeof(bls)/4 - 2;
 
-   bls.bits0.pattern = brw->attribs.Line->StipplePattern;
-   bls.bits1.repeat_count = brw->attribs.Line->StippleFactor;
+   bls.bits0.pattern = ctx->Line.StipplePattern;
+   bls.bits1.repeat_count = ctx->Line.StippleFactor;
 
-   tmp = 1.0 / (GLfloat) brw->attribs.Line->StippleFactor;
+   tmp = 1.0 / (GLfloat) ctx->Line.StippleFactor;
    tmpi = tmp * (1<<13);
 
 
@@ -480,14 +512,27 @@ static void upload_state_base_address( struct brw_context *brw )
    /* Output the structure (brw_state_base_address) directly to the
     * batchbuffer, so we can emit relocations inline.
     */
-   BEGIN_BATCH(6, IGNORE_CLIPRECTS);
-   OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
-   OUT_BATCH(1); /* General state base address */
-   OUT_BATCH(1); /* Surface state base address */
-   OUT_BATCH(1); /* Indirect object base address */
-   OUT_BATCH(1); /* General state upper bound */
-   OUT_BATCH(1); /* Indirect object upper bound */
-   ADVANCE_BATCH();
+   if (BRW_IS_IGDNG(brw)) {
+       BEGIN_BATCH(8, IGNORE_CLIPRECTS);
+       OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2));
+       OUT_BATCH(1); /* General state base address */
+       OUT_BATCH(1); /* Surface state base address */
+       OUT_BATCH(1); /* Indirect object base address */
+       OUT_BATCH(1); /* Instruction base address */
+       OUT_BATCH(1); /* General state upper bound */
+       OUT_BATCH(1); /* Indirect object upper bound */
+       OUT_BATCH(1); /* Instruction access upper bound */
+       ADVANCE_BATCH();
+   } else {
+       BEGIN_BATCH(6, IGNORE_CLIPRECTS);
+       OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
+       OUT_BATCH(1); /* General state base address */
+       OUT_BATCH(1); /* Surface state base address */
+       OUT_BATCH(1); /* Indirect object base address */
+       OUT_BATCH(1); /* General state upper bound */
+       OUT_BATCH(1); /* Indirect object upper bound */
+       ADVANCE_BATCH();
+   }
 }
 
 const struct brw_tracked_state brw_state_base_address = {
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 0c86911044..bac69187c1 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -38,6 +38,7 @@
 
 #include "brw_context.h"
 #include "brw_util.h"
+#include "brw_wm.h"
 
 static void brwBindProgram( GLcontext *ctx,
 			    GLenum target, 
@@ -94,7 +95,12 @@ static struct gl_program *brwNewProgram( GLcontext *ctx,
 static void brwDeleteProgram( GLcontext *ctx,
 			      struct gl_program *prog )
 {
-   
+   if (prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
+      struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
+      struct brw_fragment_program *brw_fprog = brw_fragment_program(fprog);
+      dri_bo_unreference(brw_fprog->const_buffer);
+   }
+
    _mesa_delete_program( ctx, prog );
 }
 
@@ -110,30 +116,36 @@ static void brwProgramStringNotify( GLcontext *ctx,
 				    GLenum target,
 				    struct gl_program *prog )
 {
+   struct brw_context *brw = brw_context(ctx);
+
    if (target == GL_FRAGMENT_PROGRAM_ARB) {
       struct gl_fragment_program *fprog = (struct gl_fragment_program *) prog;
-      struct brw_context *brw = brw_context(ctx);
-      struct brw_fragment_program *p = (struct brw_fragment_program *)prog;
-      struct brw_fragment_program *fp = (struct brw_fragment_program *)brw->fragment_program;
+      struct brw_fragment_program *newFP = brw_fragment_program(fprog);
+      const struct brw_fragment_program *curFP =
+         brw_fragment_program_const(brw->fragment_program);
+
       if (fprog->FogOption) {
          _mesa_append_fog_code(ctx, fprog);
          fprog->FogOption = GL_NONE;
       }
 
-      if (p == fp)
+      if (newFP == curFP)
 	 brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
-      p->id = brw->program_id++;      
+      newFP->id = brw->program_id++;      
+      newFP->isGLSL = brw_wm_is_glsl(fprog);
    }
    else if (target == GL_VERTEX_PROGRAM_ARB) {
-      struct brw_context *brw = brw_context(ctx);
-      struct brw_vertex_program *p = (struct brw_vertex_program *)prog;
-      struct brw_vertex_program *vp = (struct brw_vertex_program *)brw->vertex_program;
-      if (p == vp)
+      struct gl_vertex_program *vprog = (struct gl_vertex_program *) prog;
+      struct brw_vertex_program *newVP = brw_vertex_program(vprog);
+      const struct brw_vertex_program *curVP =
+         brw_vertex_program_const(brw->vertex_program);
+
+      if (newVP == curVP)
 	 brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
-      if (p->program.IsPositionInvariant) {
-	 _mesa_insert_mvp_code(ctx, &p->program);
+      if (newVP->program.IsPositionInvariant) {
+	 _mesa_insert_mvp_code(ctx, &newVP->program);
       }
-      p->id = brw->program_id++;      
+      newVP->id = brw->program_id++;      
 
       /* Also tell tnl about it:
        */
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index cb9169e2ee..a195bc32b0 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -146,17 +146,12 @@ static void brw_wait_query(GLcontext *ctx, struct gl_query_object *q)
 
 static void brw_check_query(GLcontext *ctx, struct gl_query_object *q)
 {
-   /* XXX: Need to expose dri_bo_is_idle from bufmgr. */
-#if 0
    struct brw_query_object *query = (struct brw_query_object *)q;
 
-   if (dri_bo_is_idle(query->bo)) {
+   if (query->bo == NULL || !drm_intel_bo_busy(query->bo)) {
       brw_queryobj_get_results(query);
       query->Base.Ready = GL_TRUE;
    }
-#else
-   brw_wait_query(ctx, q);
-#endif
 }
 
 /** Called to set up the query BO and account for its aperture space */
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
index 1a11d54621..e1c2c7777b 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -46,6 +46,7 @@
 static void compile_sf_prog( struct brw_context *brw,
 			     struct brw_sf_prog_key *key )
 {
+   GLcontext *ctx = &brw->intel.ctx;
    struct brw_sf_compile c;
    const GLuint *program;
    GLuint program_size;
@@ -74,7 +75,7 @@ static void compile_sf_prog( struct brw_context *brw,
 	 c.idx_to_attr[idx] = i;
 	 if (i >= VERT_RESULT_TEX0 && i <= VERT_RESULT_TEX7) {
             c.point_attrs[i].CoordReplace = 
-               brw->attribs.Point->CoordReplace[i - VERT_RESULT_TEX0];
+               ctx->Point.CoordReplace[i - VERT_RESULT_TEX0];
 	 }
          else {
             c.point_attrs[i].CoordReplace = GL_FALSE;
@@ -128,6 +129,7 @@ static void compile_sf_prog( struct brw_context *brw,
  */
 static void upload_sf_prog(struct brw_context *brw)
 {
+   GLcontext *ctx = &brw->intel.ctx;
    struct brw_sf_prog_key key;
 
    memset(&key, 0, sizeof(key));
@@ -158,15 +160,24 @@ static void upload_sf_prog(struct brw_context *brw)
       break;
    }
 
-   key.do_point_sprite = brw->attribs.Point->PointSprite;
-   key.SpriteOrigin = brw->attribs.Point->SpriteOrigin;
+   key.do_point_sprite = ctx->Point.PointSprite;
+   key.SpriteOrigin = ctx->Point.SpriteOrigin;
    /* _NEW_LIGHT */
-   key.do_flat_shading = (brw->attribs.Light->ShadeModel == GL_FLAT);
-   key.do_twoside_color = (brw->attribs.Light->Enabled && brw->attribs.Light->Model.TwoSide);
+   key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT);
+   key.do_twoside_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide);
+
+   /* _NEW_HINT */
+   key.linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST);
 
    /* _NEW_POLYGON */
-   if (key.do_twoside_color)
-      key.frontface_ccw = (brw->attribs.Polygon->FrontFace == GL_CCW);
+   if (key.do_twoside_color) {
+      /* If we're rendering to a FBO, we have to invert the polygon
+       * face orientation, just as we flip the front winding in
+       * sf_unit_create_from_key().  ctx->DrawBuffer->Name will be
+       * nonzero if we're rendering to such an FBO.
+       */
+      key.frontface_ccw = (ctx->Polygon.FrontFace == GL_CCW) ^ (ctx->DrawBuffer->Name != 0);
+   }
 
    dri_bo_unreference(brw->sf.prog_bo);
    brw->sf.prog_bo = brw_search_cache(&brw->cache, BRW_SF_PROG,
@@ -180,7 +191,7 @@ static void upload_sf_prog(struct brw_context *brw)
 
 const struct brw_tracked_state brw_sf_prog = {
    .dirty = {
-      .mesa  = (_NEW_LIGHT|_NEW_POLYGON|_NEW_POINT),
+      .mesa  = (_NEW_HINT | _NEW_LIGHT | _NEW_POLYGON | _NEW_POINT),
       .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
       .cache = CACHE_NEW_VS_PROG
    },
diff --git a/src/mesa/drivers/dri/i965/brw_sf.h b/src/mesa/drivers/dri/i965/brw_sf.h
index 1c0fb70fe0..6426b6df9f 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.h
+++ b/src/mesa/drivers/dri/i965/brw_sf.h
@@ -51,7 +51,8 @@ struct brw_sf_prog_key {
    GLuint do_flat_shading:1;
    GLuint frontface_ccw:1;
    GLuint do_point_sprite:1;
-   GLuint pad:10;
+   GLuint linear_color:1;  /**< linear interp vs. perspective interp */
+   GLuint pad:25;
    GLenum SpriteOrigin;
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_sf_emit.c b/src/mesa/drivers/dri/i965/brw_sf_emit.c
index ffdb0ae6df..ca8f97f9f9 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_emit.c
@@ -59,37 +59,6 @@ static GLboolean have_attr(struct brw_sf_compile *c,
    return (c->key.attrs & (1<<attr)) ? 1 : 0;
 }
 
-/**
- * Sets VERT_RESULT_FOGC.Y  for gl_FrontFacing
- *
- * This is currently executed if the fragment program uses VERT_RESULT_FOGC
- * at all, but this could be eliminated with a scan of the FP contents.
- */
-static void
-do_front_facing( struct brw_sf_compile *c )
-{
-   struct brw_compile *p = &c->func; 
-   int i;
-
-   if (!have_attr(c, VERT_RESULT_FOGC))
-      return;
-
-   brw_push_insn_state(p);
-   brw_CMP(p, brw_null_reg(), 
-        c->key.frontface_ccw ? BRW_CONDITIONAL_G : BRW_CONDITIONAL_L,
-        c->det, brw_imm_f(0));
-   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-   for (i = 0; i < 3; i++) {
-       struct brw_reg fogc = get_vert_attr(c, c->vert[i],FRAG_ATTRIB_FOGC);
-       brw_MOV(p, get_element(fogc, 1), brw_imm_f(0));
-       brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-       brw_MOV(p, get_element(fogc, 1), brw_imm_f(1));
-       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-   }
-   brw_pop_insn_state(p);
-}
-
-			 
 /*********************************************************************** 
  * Twoside lighting
  */
@@ -182,6 +151,8 @@ static void do_flatshade_triangle( struct brw_sf_compile *c )
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
    GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
+   GLuint jmpi = 1;
+
    if (!nr)
       return;
 
@@ -190,18 +161,21 @@ static void do_flatshade_triangle( struct brw_sf_compile *c )
    if (c->key.primitive == SF_UNFILLED_TRIS)
       return;
 
+   if (BRW_IS_IGDNG(p->brw))
+       jmpi = 2;
+
    brw_push_insn_state(p);
    
-   brw_MUL(p, c->pv, c->pv, brw_imm_ud(nr*2+1));
+   brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr*2+1)));
    brw_JMPI(p, ip, ip, c->pv);
 
    copy_colors(c, c->vert[1], c->vert[0]);
    copy_colors(c, c->vert[2], c->vert[0]);
-   brw_JMPI(p, ip, ip, brw_imm_ud(nr*4+1));
+   brw_JMPI(p, ip, ip, brw_imm_d(jmpi*(nr*4+1)));
 
    copy_colors(c, c->vert[0], c->vert[1]);
    copy_colors(c, c->vert[2], c->vert[1]);
-   brw_JMPI(p, ip, ip, brw_imm_ud(nr*2));
+   brw_JMPI(p, ip, ip, brw_imm_d(jmpi*nr*2));
 
    copy_colors(c, c->vert[0], c->vert[2]);
    copy_colors(c, c->vert[1], c->vert[2]);
@@ -215,7 +189,8 @@ static void do_flatshade_line( struct brw_sf_compile *c )
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
    GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
-   
+   GLuint jmpi = 1;
+
    if (!nr)
       return;
 
@@ -224,13 +199,16 @@ static void do_flatshade_line( struct brw_sf_compile *c )
    if (c->key.primitive == SF_UNFILLED_TRIS)
       return;
 
+   if (BRW_IS_IGDNG(p->brw))
+       jmpi = 2;
+
    brw_push_insn_state(p);
    
-   brw_MUL(p, c->pv, c->pv, brw_imm_ud(nr+1));
+   brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr+1)));
    brw_JMPI(p, ip, ip, c->pv);
    copy_colors(c, c->vert[1], c->vert[0]);
 
-   brw_JMPI(p, ip, ip, brw_imm_ud(nr));
+   brw_JMPI(p, ip, ip, brw_imm_d(jmpi*nr)); /* signed offset, as in the other JMPIs */
    copy_colors(c, c->vert[0], c->vert[1]);
 
    brw_pop_insn_state(p);
@@ -249,7 +227,7 @@ static void alloc_regs( struct brw_sf_compile *c )
 
    /* Values computed by fixed function unit:
     */
-   c->pv  = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD);
+   c->pv  = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_D);
    c->det = brw_vec1_grf(1, 2);
    c->dx0 = brw_vec1_grf(1, 3);
    c->dx2 = brw_vec1_grf(1, 4);
@@ -326,9 +304,6 @@ static void invert_det( struct brw_sf_compile *c)
 
 }
 
-#define NON_PERPECTIVE_ATTRS  (FRAG_BIT_WPOS | \
-                               FRAG_BIT_COL0 | \
-			       FRAG_BIT_COL1)
 
 static GLboolean calculate_masks( struct brw_sf_compile *c,
 				  GLuint reg,
@@ -337,9 +312,16 @@ static GLboolean calculate_masks( struct brw_sf_compile *c,
 				  GLushort *pc_linear)
 {
    GLboolean is_last_attr = (reg == c->nr_setup_regs - 1);
-   GLuint persp_mask = c->key.attrs & ~NON_PERPECTIVE_ATTRS;
+   GLuint persp_mask;
    GLuint linear_mask;
 
+   if (c->key.do_flat_shading || c->key.linear_color)
+      persp_mask = c->key.attrs & ~(FRAG_BIT_WPOS |
+                                    FRAG_BIT_COL0 |
+                                    FRAG_BIT_COL1);
+   else
+      persp_mask = c->key.attrs & ~(FRAG_BIT_WPOS);
+
    if (c->key.do_flat_shading)
       linear_mask = c->key.attrs & ~(FRAG_BIT_COL0|FRAG_BIT_COL1);
    else
@@ -384,7 +366,6 @@ void brw_emit_tri_setup( struct brw_sf_compile *c, GLboolean allocate)
 
    invert_det(c);
    copy_z_inv_w(c);
-   do_front_facing(c);
 
    if (c->key.do_twoside_color) 
       do_twoside_color(c);
@@ -706,7 +687,7 @@ void brw_emit_anyprim_setup( struct brw_sf_compile *c )
 					       (1<<_3DPRIM_POLYGON) |
 					       (1<<_3DPRIM_RECTLIST) |
 					       (1<<_3DPRIM_TRIFAN_NOSTIPPLE)));
-   jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
+   jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
    {
       saveflag = p->flag_value;
       brw_push_insn_state(p); 
@@ -727,7 +708,7 @@ void brw_emit_anyprim_setup( struct brw_sf_compile *c )
 					       (1<<_3DPRIM_LINESTRIP_CONT) |
 					       (1<<_3DPRIM_LINESTRIP_BF) |
 					       (1<<_3DPRIM_LINESTRIP_CONT_BF)));
-   jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
+   jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
    {
       saveflag = p->flag_value;
       brw_push_insn_state(p); 
@@ -740,7 +721,7 @@ void brw_emit_anyprim_setup( struct brw_sf_compile *c )
 
    brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
    brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE));
-   jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
+   jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
    {
       saveflag = p->flag_value;
       brw_push_insn_state(p); 
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index 242b7047a1..bc0f076073 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -43,10 +43,12 @@ static void upload_sf_vp(struct brw_context *brw)
    const GLfloat depth_scale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
    struct brw_sf_viewport sfv;
    GLfloat y_scale, y_bias;
+   const GLboolean render_to_fbo = (ctx->DrawBuffer->Name != 0);
+   const GLfloat *v = ctx->Viewport._WindowMap.m;
 
    memset(&sfv, 0, sizeof(sfv));
 
-   if (intel_rendering_to_texture(ctx)) {
+   if (render_to_fbo) {
       y_scale = 1.0;
       y_bias = 0;
    }
@@ -55,27 +57,18 @@ static void upload_sf_vp(struct brw_context *brw)
       y_bias = ctx->DrawBuffer->Height;
    }
 
-   /* _NEW_VIEWPORT, BRW_NEW_METAOPS */
+   /* _NEW_VIEWPORT */
 
-   if (!brw->metaops.active) {
-      const GLfloat *v = ctx->Viewport._WindowMap.m;
+   sfv.viewport.m00 = v[MAT_SX];
+   sfv.viewport.m11 = v[MAT_SY] * y_scale;
+   sfv.viewport.m22 = v[MAT_SZ] * depth_scale;
+   sfv.viewport.m30 = v[MAT_TX];
+   sfv.viewport.m31 = v[MAT_TY] * y_scale + y_bias;
+   sfv.viewport.m32 = v[MAT_TZ] * depth_scale;
 
-      sfv.viewport.m00 = v[MAT_SX];
-      sfv.viewport.m11 = v[MAT_SY] * y_scale;
-      sfv.viewport.m22 = v[MAT_SZ] * depth_scale;
-      sfv.viewport.m30 = v[MAT_TX];
-      sfv.viewport.m31 = v[MAT_TY] * y_scale + y_bias;
-      sfv.viewport.m32 = v[MAT_TZ] * depth_scale;
-   } else {
-      sfv.viewport.m00 =   1;
-      sfv.viewport.m11 = - 1;
-      sfv.viewport.m22 =   1;
-      sfv.viewport.m30 =   0;
-      sfv.viewport.m31 =   ctx->DrawBuffer->Height;
-      sfv.viewport.m32 =   0;
-   }
-
-   /* _NEW_SCISSOR */
+   /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT
+    * for DrawBuffer->_[XY]{min,max}
+    */
 
    /* The scissor only needs to handle the intersection of drawable and
     * scissor rect.  Clipping to the boundaries of static shared buffers
@@ -84,10 +77,20 @@ static void upload_sf_vp(struct brw_context *brw)
     * Note that the hardware's coordinates are inclusive, while Mesa's min is
     * inclusive but max is exclusive.
     */
-   sfv.scissor.xmin = ctx->DrawBuffer->_Xmin;
-   sfv.scissor.xmax = ctx->DrawBuffer->_Xmax - 1;
-   sfv.scissor.ymin = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymax;
-   sfv.scissor.ymax = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymin - 1;
+   if (render_to_fbo) {
+      /* texmemory: Y=0=bottom */
+      sfv.scissor.xmin = ctx->DrawBuffer->_Xmin;
+      sfv.scissor.xmax = ctx->DrawBuffer->_Xmax - 1;
+      sfv.scissor.ymin = ctx->DrawBuffer->_Ymin;
+      sfv.scissor.ymax = ctx->DrawBuffer->_Ymax - 1;
+   }
+   else {
+      /* memory: Y=0=top */
+      sfv.scissor.xmin = ctx->DrawBuffer->_Xmin;
+      sfv.scissor.xmax = ctx->DrawBuffer->_Xmax - 1;
+      sfv.scissor.ymin = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymax;
+      sfv.scissor.ymax = ctx->DrawBuffer->Height - ctx->DrawBuffer->_Ymin - 1;
+   }
 
    dri_bo_unreference(brw->sf.vp_bo);
    brw->sf.vp_bo = brw_cache_data( &brw->cache, BRW_SF_VP, &sfv, NULL, 0 );
@@ -96,8 +99,9 @@ static void upload_sf_vp(struct brw_context *brw)
 const struct brw_tracked_state brw_sf_vp = {
    .dirty = {
       .mesa  = (_NEW_VIEWPORT | 
-		_NEW_SCISSOR),
-      .brw   = BRW_NEW_METAOPS,
+		_NEW_SCISSOR |
+		_NEW_BUFFERS),
+      .brw   = 0,
       .cache = 0
    },
    .prepare = upload_sf_vp
@@ -109,16 +113,20 @@ struct brw_sf_unit_key {
 
    unsigned int nr_urb_entries, urb_size, sfsize;
 
-   GLenum front_face, cull_face;
-   GLboolean scissor, line_smooth, point_sprite, point_attenuated;
+   GLenum front_face, cull_face, provoking_vertex;
+   unsigned scissor:1;
+   unsigned line_smooth:1;
+   unsigned point_sprite:1;
+   unsigned point_attenuated:1;
+   unsigned render_to_fbo:1;
    float line_width;
    float point_size;
-   GLboolean render_to_texture;
 };
 
 static void
 sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
 {
+   GLcontext *ctx = &brw->intel.ctx;
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_SF_PROG */
@@ -130,22 +138,25 @@ sf_unit_populate_key(struct brw_context *brw, struct brw_sf_unit_key *key)
    key->urb_size = brw->urb.vsize;
    key->sfsize = brw->urb.sfsize;
 
-   key->scissor = brw->attribs.Scissor->Enabled;
-   key->front_face = brw->attribs.Polygon->FrontFace;
+   key->scissor = ctx->Scissor.Enabled;
+   key->front_face = ctx->Polygon.FrontFace;
 
-   if (brw->attribs.Polygon->CullFlag)
-      key->cull_face = brw->attribs.Polygon->CullFaceMode;
+   if (ctx->Polygon.CullFlag)
+      key->cull_face = ctx->Polygon.CullFaceMode;
    else
       key->cull_face = GL_NONE;
 
-   key->line_width = brw->attribs.Line->Width;
-   key->line_smooth = brw->attribs.Line->SmoothFlag;
+   key->line_width = ctx->Line.Width;
+   key->line_smooth = ctx->Line.SmoothFlag;
+
+   key->point_sprite = ctx->Point.PointSprite;
+   key->point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
+   key->point_attenuated = ctx->Point._Attenuated;
 
-   key->point_sprite = brw->attribs.Point->PointSprite;
-   key->point_size = brw->attribs.Point->Size;
-   key->point_attenuated = brw->attribs.Point->_Attenuated;
+   /* _NEW_LIGHT */
+   key->provoking_vertex = ctx->Light.ProvokingVertex;
 
-   key->render_to_texture = intel_rendering_to_texture(&brw->intel.ctx);
+   key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
 }
 
 static dri_bo *
@@ -154,7 +165,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 {
    struct brw_sf_unit_state sf;
    dri_bo *bo;
-
+   int chipset_max_threads;
    memset(&sf, 0, sizeof(sf));
 
    sf.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
@@ -163,13 +174,26 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    sf.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
 
    sf.thread3.dispatch_grf_start_reg = 3;
-   sf.thread3.urb_entry_read_offset = 1;
+
+   if (BRW_IS_IGDNG(brw))
+       sf.thread3.urb_entry_read_offset = 3;
+   else
+       sf.thread3.urb_entry_read_offset = 1;
+
    sf.thread3.urb_entry_read_length = key->urb_entry_read_length;
 
    sf.thread4.nr_urb_entries = key->nr_urb_entries;
    sf.thread4.urb_entry_allocation_size = key->sfsize - 1;
-   /* Each SF thread produces 1 PUE, and there can be up to 24 threads */
-   sf.thread4.max_threads = MIN2(24, key->nr_urb_entries) - 1;
+
+   /* Each SF thread produces 1 PUE, and there can be up to 24 (pre-IGDNG)
+    * or 48 (IGDNG) threads.
+    */
+   if (BRW_IS_IGDNG(brw))
+      chipset_max_threads = 48;
+   else
+      chipset_max_threads = 24;
+
+   sf.thread4.max_threads = MIN2(chipset_max_threads, key->nr_urb_entries) - 1;
 
    if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
       sf.thread4.max_threads = 0;
@@ -192,10 +216,10 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    else
       sf.sf5.front_winding = BRW_FRONTWINDING_CW;
 
-   /* The viewport is inverted for rendering to texture, and that inverts
+   /* The viewport is inverted for rendering to a FBO, and that inverts
     * polygon front/back orientation.
     */
-   sf.sf5.front_winding ^= key->render_to_texture;
+   sf.sf5.front_winding ^= key->render_to_fbo;
 
    switch (key->cull_face) {
    case GL_FRONT:
@@ -225,10 +249,37 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    else if (sf.sf6.line_width <= 0x2)
        sf.sf6.line_width = 0;
 
-   /* _NEW_POINT */
-   sf.sf6.point_rast_rule = BRW_RASTRULE_UPPER_RIGHT;	/* opengl conventions */
+   /* _NEW_BUFFERS */
+   key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
+   if (!key->render_to_fbo) {
+      /* Rendering to an OpenGL window */
+      sf.sf6.point_rast_rule = BRW_RASTRULE_UPPER_RIGHT;
+   }
+   else {
+      /* If rendering to an FBO, the pixel coordinate system is
+       * inverted with respect to the normal OpenGL coordinate
+       * system, so BRW_RASTRULE_LOWER_RIGHT is correct.
+       * But this value is listed as "Reserved, but not seen as useful"
+       * in Intel documentation (page 212, "Point Rasterization Rule",
+       * section 7.4 "SF Pipeline State Summary", of document
+       * "Intel® 965 Express Chipset Family and Intel® G35 Express
+       * Chipset Graphics Controller Programmer's Reference Manual,
+       * Volume 2: 3D/Media", Revision 1.0b as of January 2008,
+       * available at 
+       *     http://intellinuxgraphics.org/documentation.html
+       * at the time of this writing).
+       *
+       * It does work on at least some devices, if not all;
+       * if devices that don't support it can be identified,
+       * the likely failure case is that points are rasterized
+       * incorrectly, which is no worse than occurs without
+       * the value, so we're using it here.
+       */
+      sf.sf6.point_rast_rule = BRW_RASTRULE_LOWER_RIGHT;
+   }
    /* XXX clamp max depends on AA vs. non-AA */
 
+   /* _NEW_POINT */
    sf.sf7.sprite_point = key->point_sprite;
    sf.sf7.point_size = CLAMP(rint(key->point_size), 1, 255) * (1<<3);
    sf.sf7.use_point_size_state = !key->point_attenuated;
@@ -236,9 +287,15 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 
    /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
     */
-   sf.sf7.trifan_pv = 2;
-   sf.sf7.linestrip_pv = 1;
-   sf.sf7.tristrip_pv = 2;
+   if (key->provoking_vertex == GL_LAST_VERTEX_CONVENTION) {
+      sf.sf7.trifan_pv = 2;
+      sf.sf7.linestrip_pv = 1;
+      sf.sf7.tristrip_pv = 2;
+   } else {
+      sf.sf7.trifan_pv = 1;
+      sf.sf7.linestrip_pv = 0;
+      sf.sf7.tristrip_pv = 0;
+   }
    sf.sf7.line_last_pixel_enable = 0;
 
    /* Set bias for OpenGL rasterization rules:
@@ -252,6 +309,9 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 			 &sf, sizeof(sf),
 			 NULL, NULL);
 
+   /* STATE_PREFETCH command description describes this state as being
+    * something loaded through the GPE (L2 ISC), so it's INSTRUCTION domain.
+    */
    /* Emit SF program relocation */
    dri_bo_emit_reloc(bo,
 		     I915_GEM_DOMAIN_INSTRUCTION, 0,
@@ -292,11 +352,12 @@ static void upload_sf_unit( struct brw_context *brw )
 const struct brw_tracked_state brw_sf_unit = {
    .dirty = {
       .mesa  = (_NEW_POLYGON | 
+		_NEW_LIGHT |
 		_NEW_LINE | 
 		_NEW_POINT | 
-		_NEW_SCISSOR),
-      .brw   = (BRW_NEW_URB_FENCE |
-		BRW_NEW_METAOPS),
+		_NEW_SCISSOR |
+		_NEW_BUFFERS),
+      .brw   = BRW_NEW_URB_FENCE,
       .cache = (CACHE_NEW_SF_VP |
 		CACHE_NEW_SF_PROG)
    },
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index df839c5b30..78572356a3 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -52,7 +52,6 @@ const struct brw_tracked_state brw_cc_vp;
 const struct brw_tracked_state brw_check_fallback;
 const struct brw_tracked_state brw_clip_prog;
 const struct brw_tracked_state brw_clip_unit;
-const struct brw_tracked_state brw_constant_buffer_state;
 const struct brw_tracked_state brw_constant_buffer;
 const struct brw_tracked_state brw_curbe_offsets;
 const struct brw_tracked_state brw_invarient_state;
@@ -73,11 +72,13 @@ const struct brw_tracked_state brw_sf_vp;
 const struct brw_tracked_state brw_state_base_address;
 const struct brw_tracked_state brw_urb_fence;
 const struct brw_tracked_state brw_vertex_state;
+const struct brw_tracked_state brw_vs_surfaces;
 const struct brw_tracked_state brw_vs_prog;
 const struct brw_tracked_state brw_vs_unit;
 const struct brw_tracked_state brw_wm_input_sizes;
 const struct brw_tracked_state brw_wm_prog;
 const struct brw_tracked_state brw_wm_samplers;
+const struct brw_tracked_state brw_wm_constant_surface;
 const struct brw_tracked_state brw_wm_surfaces;
 const struct brw_tracked_state brw_wm_unit;
 
@@ -91,6 +92,21 @@ const struct brw_tracked_state brw_clear_batch_cache;
 const struct brw_tracked_state brw_drawing_rect;
 const struct brw_tracked_state brw_indices;
 const struct brw_tracked_state brw_vertices;
+const struct brw_tracked_state brw_index_buffer;
+
+/**
+ * Surface description key; the same key type serves WM and VS surfaces.
+ */
+struct brw_surface_key {
+   GLenum target, depthmode;
+   dri_bo *bo;
+   GLint format, internal_format;
+   GLint first_level, last_level;
+   GLint width, height, depth;
+   GLint pitch, cpp;
+   uint32_t tiling;
+   GLuint offset;
+};
 
 /***********************************************************************
  * brw_state.c
@@ -136,8 +152,8 @@ dri_bo *brw_search_cache( struct brw_cache *cache,
 			  void *aux_return);
 void brw_state_cache_check_size( struct brw_context *brw );
 
-void brw_init_cache( struct brw_context *brw );
-void brw_destroy_cache( struct brw_context *brw );
+void brw_init_caches( struct brw_context *brw );
+void brw_destroy_caches( struct brw_context *brw );
 
 /***********************************************************************
  * brw_state_batch.c
@@ -151,4 +167,9 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 void brw_destroy_batch_cache( struct brw_context *brw );
 void brw_clear_batch_cache_flush( struct brw_context *brw );
 
+/* brw_wm_surface_state.c */
+dri_bo *
+brw_create_constant_surface( struct brw_context *brw,
+                             struct brw_surface_key *key );
+
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index dc87859f3f..811940edc0 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -97,8 +97,6 @@ void brw_clear_batch_cache_flush( struct brw_context *brw )
 {
    clear_batch_cache(brw);
 
-/*    brw_do_flush(brw, BRW_FLUSH_STATE_CACHE|BRW_FLUSH_READ_CACHE); */
-   
    brw->state.dirty.mesa |= ~0;
    brw->state.dirty.brw |= ~0;
    brw->state.dirty.cache |= ~0;
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index d5b5166406..e40d7a0416 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -56,9 +56,9 @@
  * incorrect program is run for the other instance.
  */
 
+#include "main/imports.h"
 #include "brw_state.h"
 #include "intel_batchbuffer.h"
-#include "main/imports.h"
 
 /* XXX: Fixme - have to include these to get the sizes of the prog_key
  * structs:
@@ -69,8 +69,10 @@
 #include "brw_sf.h"
 #include "brw_gs.h"
 
-static GLuint hash_key( const void *key, GLuint key_size,
-			dri_bo **reloc_bufs, GLuint nr_reloc_bufs)
+
+static GLuint
+hash_key(const void *key, GLuint key_size,
+         dri_bo **reloc_bufs, GLuint nr_reloc_bufs)
 {
    GLuint *ikey = (GLuint *)key;
    GLuint hash = 0, i;
@@ -95,6 +97,7 @@ static GLuint hash_key( const void *key, GLuint key_size,
    return hash;
 }
 
+
 /**
  * Marks a new buffer as being chosen for the given cache id.
  */
@@ -111,6 +114,7 @@ update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
    cache->brw->state.dirty.cache |= 1 << cache_id;
 }
 
+
 static struct brw_cache_item *
 search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
 	     GLuint hash, const void *key, GLuint key_size,
@@ -143,7 +147,8 @@ search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
 }
 
 
-static void rehash( struct brw_cache *cache )
+static void
+rehash(struct brw_cache *cache)
 {
    struct brw_cache_item **items;
    struct brw_cache_item *c, *next;
@@ -164,15 +169,17 @@ static void rehash( struct brw_cache *cache )
    cache->size = size;
 }
 
+
 /**
  * Returns the buffer object matching cache_id and key, or NULL.
  */
-dri_bo *brw_search_cache( struct brw_cache *cache,
-			  enum brw_cache_id cache_id,
-			  const void *key,
-			  GLuint key_size,
-			  dri_bo **reloc_bufs, GLuint nr_reloc_bufs,
-			  void *aux_return )
+dri_bo *
+brw_search_cache(struct brw_cache *cache,
+                 enum brw_cache_id cache_id,
+                 const void *key,
+                 GLuint key_size,
+                 dri_bo **reloc_bufs, GLuint nr_reloc_bufs,
+                 void *aux_return)
 {
    struct brw_cache_item *item;
    GLuint hash = hash_key(key, key_size, reloc_bufs, nr_reloc_bufs);
@@ -192,6 +199,7 @@ dri_bo *brw_search_cache( struct brw_cache *cache,
    return item->bo;
 }
 
+
 dri_bo *
 brw_upload_cache( struct brw_cache *cache,
 		  enum brw_cache_id cache_id,
@@ -265,7 +273,9 @@ brw_upload_cache( struct brw_cache *cache,
    return bo;
 }
 
-/* This doesn't really work with aux data.  Use search/upload instead
+
+/**
+ * This doesn't really work with aux data.  Use search/upload instead
  */
 dri_bo *
 brw_cache_data_sz(struct brw_cache *cache,
@@ -296,6 +306,7 @@ brw_cache_data_sz(struct brw_cache *cache,
    return bo;
 }
 
+
 /**
  * Wrapper around brw_cache_data_sz using the cache_id's canonical key size.
  *
@@ -319,21 +330,22 @@ enum pool_type {
    DW_GENERAL_STATE
 };
 
+
 static void
-brw_init_cache_id( struct brw_context *brw,
-		const char *name,
-		enum brw_cache_id id,
-		GLuint key_size,
-		GLuint aux_size)
+brw_init_cache_id(struct brw_cache *cache,
+                  const char *name,
+                  enum brw_cache_id id,
+                  GLuint key_size,
+                  GLuint aux_size)
 {
-   struct brw_cache *cache = &brw->cache;
-
    cache->name[id] = strdup(name);
    cache->key_size[id] = key_size;
    cache->aux_size[id] = aux_size;
 }
 
-void brw_init_cache( struct brw_context *brw )
+
+static void
+brw_init_non_surface_cache(struct brw_context *brw)
 {
    struct brw_cache *cache = &brw->cache;
 
@@ -342,114 +354,136 @@ void brw_init_cache( struct brw_context *brw )
    cache->size = 7;
    cache->n_items = 0;
    cache->items = (struct brw_cache_item **)
-      _mesa_calloc(cache->size * 
-		   sizeof(struct brw_cache_item));
+      _mesa_calloc(cache->size * sizeof(struct brw_cache_item));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CC_VP",
 		     BRW_CC_VP,
 		     sizeof(struct brw_cc_viewport),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CC_UNIT",
 		     BRW_CC_UNIT,
 		     sizeof(struct brw_cc_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "WM_PROG",
 		     BRW_WM_PROG,
 		     sizeof(struct brw_wm_prog_key),
 		     sizeof(struct brw_wm_prog_data));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SAMPLER_DEFAULT_COLOR",
 		     BRW_SAMPLER_DEFAULT_COLOR,
 		     sizeof(struct brw_sampler_default_color),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SAMPLER",
 		     BRW_SAMPLER,
 		     0,		/* variable key/data size */
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "WM_UNIT",
 		     BRW_WM_UNIT,
 		     sizeof(struct brw_wm_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SF_PROG",
 		     BRW_SF_PROG,
 		     sizeof(struct brw_sf_prog_key),
 		     sizeof(struct brw_sf_prog_data));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SF_VP",
 		     BRW_SF_VP,
 		     sizeof(struct brw_sf_viewport),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SF_UNIT",
 		     BRW_SF_UNIT,
 		     sizeof(struct brw_sf_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "VS_UNIT",
 		     BRW_VS_UNIT,
 		     sizeof(struct brw_vs_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "VS_PROG",
 		     BRW_VS_PROG,
 		     sizeof(struct brw_vs_prog_key),
 		     sizeof(struct brw_vs_prog_data));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CLIP_UNIT",
 		     BRW_CLIP_UNIT,
 		     sizeof(struct brw_clip_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CLIP_PROG",
 		     BRW_CLIP_PROG,
 		     sizeof(struct brw_clip_prog_key),
 		     sizeof(struct brw_clip_prog_data));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "GS_UNIT",
 		     BRW_GS_UNIT,
 		     sizeof(struct brw_gs_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "GS_PROG",
 		     BRW_GS_PROG,
 		     sizeof(struct brw_gs_prog_key),
 		     sizeof(struct brw_gs_prog_data));
+}
+
+
+static void
+brw_init_surface_cache(struct brw_context *brw)
+{
+   struct brw_cache *cache = &brw->surface_cache;
+
+   cache->brw = brw;
 
-   brw_init_cache_id(brw,
+   cache->size = 7;
+   cache->n_items = 0;
+   cache->items = (struct brw_cache_item **)
+      _mesa_calloc(cache->size * sizeof(struct brw_cache_item));
+
+   brw_init_cache_id(cache,
 		     "SS_SURFACE",
 		     BRW_SS_SURFACE,
 		     sizeof(struct brw_surface_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SS_SURF_BIND",
 		     BRW_SS_SURF_BIND,
 		     0,
 		     0);
 }
 
+
+void
+brw_init_caches(struct brw_context *brw)
+{
+   brw_init_non_surface_cache(brw);
+   brw_init_surface_cache(brw);
+}
+
+
 static void
-brw_clear_cache( struct brw_context *brw )
+brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
 {
    struct brw_cache_item *c, *next;
    GLuint i;
@@ -457,8 +491,8 @@ brw_clear_cache( struct brw_context *brw )
    if (INTEL_DEBUG & DEBUG_STATE)
       _mesa_printf("%s\n", __FUNCTION__);
 
-   for (i = 0; i < brw->cache.size; i++) {
-      for (c = brw->cache.items[i]; c; c = next) {
+   for (i = 0; i < cache->size; i++) {
+      for (c = cache->items[i]; c; c = next) {
 	 int j;
 
 	 next = c->next;
@@ -468,10 +502,10 @@ brw_clear_cache( struct brw_context *brw )
 	 free((void *)c->key);
 	 free(c);
       }
-      brw->cache.items[i] = NULL;
+      cache->items[i] = NULL;
    }
 
-   brw->cache.n_items = 0;
+   cache->n_items = 0;
 
    if (brw->curbe.last_buf) {
       _mesa_free(brw->curbe.last_buf);
@@ -483,25 +517,46 @@ brw_clear_cache( struct brw_context *brw )
    brw->state.dirty.cache |= ~0;
 }
 
-void brw_state_cache_check_size( struct brw_context *brw )
+
+void
+brw_state_cache_check_size(struct brw_context *brw)
 {
+   if (INTEL_DEBUG & DEBUG_STATE)
+      _mesa_printf("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items);
+
    /* un-tuned guess.  We've got around 20 state objects for a total of around
     * 32k, so 1000 of them is around 1.5MB.
     */
    if (brw->cache.n_items > 1000)
-      brw_clear_cache(brw);
+      brw_clear_cache(brw, &brw->cache);
+
+   if (brw->surface_cache.n_items > 1000)
+      brw_clear_cache(brw, &brw->surface_cache);
 }
 
-void brw_destroy_cache( struct brw_context *brw )
+
+static void
+brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
 {
    GLuint i;
 
-   brw_clear_cache(brw);
+   if (INTEL_DEBUG & DEBUG_STATE)
+      _mesa_printf("%s\n", __FUNCTION__);
+
+   brw_clear_cache(brw, cache);
    for (i = 0; i < BRW_MAX_CACHE; i++) {
-      dri_bo_unreference(brw->cache.last_bo[i]);
-      free(brw->cache.name[i]);
+      dri_bo_unreference(cache->last_bo[i]);
+      free(cache->name[i]);
    }
-   free(brw->cache.items);
-   brw->cache.items = NULL;
-   brw->cache.size = 0;
+   free(cache->items);
+   cache->items = NULL;
+   cache->size = 0;
+}
+
+
+void
+brw_destroy_caches(struct brw_context *brw)
+{
+   brw_destroy_cache(brw, &brw->cache);
+   brw_destroy_cache(brw, &brw->surface_cache);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index b28c57c2bc..e94fa7d2b4 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -84,6 +84,19 @@ get_965_surfacetype(unsigned int surfacetype)
     }
 }
 
+static const char *
+get_965_surface_format(unsigned int surface_format)
+{   /* Map a hardware surface-format code to a printable name for state dumps. */
+    switch (surface_format) { /* values are BRW_SURFACEFORMAT_x encodings */
+    case 0x000: return "r32g32b32a32_float";
+    case 0x0c0: return "b8g8r8a8_unorm"; /* 0x0c1 is the _SRGB variant */
+    case 0x100: return "b5g6r5_unorm";
+    case 0x102: return "b5g5r5a1_unorm";
+    case 0x104: return "b4g4r4a4_unorm";
+    default: return "unknown"; /* any encoding not listed above */
+    }
+}
+
 static void dump_wm_surface_state(struct brw_context *brw)
 {
    int i;
@@ -95,7 +108,7 @@ static void dump_wm_surface_state(struct brw_context *brw)
       char name[20];
 
       if (surf_bo == NULL) {
-	 fprintf(stderr, "WM SS%d: NULL\n", i);
+	 fprintf(stderr, "  WM SS%d: NULL\n", i);
 	 continue;
       }
       dri_bo_map(surf_bo, GL_FALSE);
@@ -103,8 +116,9 @@ static void dump_wm_surface_state(struct brw_context *brw)
       surf = (struct brw_surface_state *)(surf_bo->virtual);
 
       sprintf(name, "WM SS%d", i);
-      state_out(name, surf, surfoff, 0, "%s\n",
-		get_965_surfacetype(surf->ss0.surface_type));
+      state_out(name, surf, surfoff, 0, "%s %s\n",
+		get_965_surfacetype(surf->ss0.surface_type),
+		get_965_surface_format(surf->ss0.surface_format));
       state_out(name, surf, surfoff, 1, "offset\n");
       state_out(name, surf, surfoff, 2, "%dx%d size, %d mips\n",
 		surf->ss2.width + 1, surf->ss2.height + 1, surf->ss2.mip_count);
@@ -112,6 +126,8 @@ static void dump_wm_surface_state(struct brw_context *brw)
 		surf->ss3.pitch + 1, surf->ss3.tiled_surface ? "" : "not ");
       state_out(name, surf, surfoff, 4, "mip base %d\n",
 		surf->ss4.min_lod);
+      state_out(name, surf, surfoff, 5, "x,y offset: %d,%d\n",
+		surf->ss5.x_offset, surf->ss5.y_offset);
 
       dri_bo_unmap(surf_bo);
    }
@@ -162,6 +178,14 @@ static void brw_debug_prog(const char *name, dri_bo *prog)
       fprintf(stderr, "%8s: 0x%08x: 0x%08x 0x%08x 0x%08x 0x%08x\n",
 	      name, (unsigned int)prog->offset + i * 4 * 4,
 	      data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3]);
+      /* Stop at the end of the program.  It'd be nice to keep track of the actual
+       * intended program size instead of guessing like this.
+       */
+      if (data[i * 4 + 0] == 0 &&
+	  data[i * 4 + 1] == 0 &&
+	  data[i * 4 + 2] == 0 &&
+	  data[i * 4 + 3] == 0)
+	 break;
    }
 
    dri_bo_unmap(prog);
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 4845859b3e..414620d0b3 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -59,11 +59,12 @@ const struct brw_tracked_state *atoms[] =
    &brw_curbe_offsets,
    &brw_recalculate_urb_fence,
 
-
    &brw_cc_vp,
    &brw_cc_unit,
 
-   &brw_wm_surfaces,		/* must do before samplers */
+   &brw_vs_surfaces,		/* must do before unit */
+   &brw_wm_constant_surface,	/* must do before wm surfaces/bind bo */
+   &brw_wm_surfaces,		/* must do before samplers and unit */
    &brw_wm_samplers,
 
    &brw_wm_unit,
@@ -88,54 +89,27 @@ const struct brw_tracked_state *atoms[] =
 
    &brw_line_stipple,
    &brw_aa_line_parameters,
-   /* Ordering of the commands below is documented as fixed.  
-    */
-#if 0
-   &brw_pipelined_state_pointers,
-   &brw_urb_fence,
-   &brw_constant_buffer_state,
-#else
+
    &brw_psp_urb_cbs,
-#endif
 
    &brw_drawing_rect,
    &brw_indices,
+   &brw_index_buffer,
    &brw_vertices,
 
-   NULL,			/* brw_constant_buffer */
+   &brw_constant_buffer
 };
 
 
 void brw_init_state( struct brw_context *brw )
 {
-   GLuint i;
-
-   brw_init_cache(brw);
-
-   brw->state.atoms = _mesa_malloc(sizeof(atoms));
-   brw->state.nr_atoms = sizeof(atoms)/sizeof(*atoms);
-   _mesa_memcpy(brw->state.atoms, atoms, sizeof(atoms));
-
-   /* Patch in a pointer to the dynamic state atom:
-    */
-   for (i = 0; i < brw->state.nr_atoms; i++)
-      if (brw->state.atoms[i] == NULL)
-	 brw->state.atoms[i] = &brw->curbe.tracked_state;
-
-   _mesa_memcpy(&brw->curbe.tracked_state, 
-		&brw_constant_buffer,
-		sizeof(brw_constant_buffer));
+   brw_init_caches(brw);
 }
 
 
 void brw_destroy_state( struct brw_context *brw )
 {
-   if (brw->state.atoms) {
-      _mesa_free(brw->state.atoms);
-      brw->state.atoms = NULL;
-   }
-
-   brw_destroy_cache(brw);
+   brw_destroy_caches(brw);
    brw_destroy_batch_cache(brw);
 }
 
@@ -218,6 +192,7 @@ static struct dirty_bit_map mesa_bits[] = {
    DEFINE_BIT(_NEW_MULTISAMPLE),
    DEFINE_BIT(_NEW_TRACK_MATRIX),
    DEFINE_BIT(_NEW_PROGRAM),
+   DEFINE_BIT(_NEW_PROGRAM_CONSTANTS),
    {0, 0, 0}
 };
 
@@ -231,11 +206,10 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_PRIMITIVE),
    DEFINE_BIT(BRW_NEW_CONTEXT),
    DEFINE_BIT(BRW_NEW_WM_INPUT_DIMENSIONS),
-   DEFINE_BIT(BRW_NEW_INPUT_VARYING),
    DEFINE_BIT(BRW_NEW_PSP),
-   DEFINE_BIT(BRW_NEW_METAOPS),
    DEFINE_BIT(BRW_NEW_FENCE),
    DEFINE_BIT(BRW_NEW_INDICES),
+   DEFINE_BIT(BRW_NEW_INDEX_BUFFER),
    DEFINE_BIT(BRW_NEW_VERTICES),
    DEFINE_BIT(BRW_NEW_BATCH),
    DEFINE_BIT(BRW_NEW_DEPTH_BUFFER),
@@ -298,6 +272,7 @@ brw_print_dirty_count(struct dirty_bit_map *bit_map, int32_t bits)
  */
 void brw_validate_state( struct brw_context *brw )
 {
+   GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = &brw->intel;
    struct brw_state_flags *state = &brw->state.dirty;
    GLuint i;
@@ -314,13 +289,13 @@ void brw_validate_state( struct brw_context *brw )
       state->brw |= ~0;
    }
 
-   if (brw->fragment_program != brw->attribs.FragmentProgram->_Current) {
-      brw->fragment_program = brw->attribs.FragmentProgram->_Current;
+   if (brw->fragment_program != ctx->FragmentProgram._Current) {
+      brw->fragment_program = ctx->FragmentProgram._Current;
       brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
    }
 
-   if (brw->vertex_program != brw->attribs.VertexProgram->_Current) {
-      brw->vertex_program = brw->attribs.VertexProgram->_Current;
+   if (brw->vertex_program != ctx->VertexProgram._Current) {
+      brw->vertex_program = ctx->VertexProgram._Current;
       brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
    }
 
@@ -336,7 +311,7 @@ void brw_validate_state( struct brw_context *brw )
 
    /* do prepare stage for all atoms */
    for (i = 0; i < Elements(atoms); i++) {
-      const struct brw_tracked_state *atom = brw->state.atoms[i];
+      const struct brw_tracked_state *atom = atoms[i];
 
       if (brw->intel.Fallback)
          break;
@@ -347,6 +322,19 @@ void brw_validate_state( struct brw_context *brw )
         }
       }
    }
+
+   /* Make sure that the textures which are referenced by the current
+    * brw fragment program are actually present/valid.
+    * If this fails, we can experience GPU lock-ups.
+    */
+   {
+      const struct brw_fragment_program *fp;
+      fp = brw_fragment_program_const(brw->fragment_program);
+      if (fp) {
+         assert((fp->tex_units_used & ctx->Texture._EnabledUnits)
+                == fp->tex_units_used);
+      }
+   }
 }
 
 
@@ -367,8 +355,8 @@ void brw_upload_state(struct brw_context *brw)
       _mesa_memset(&examined, 0, sizeof(examined));
       prev = *state;
 
-      for (i = 0; i < brw->state.nr_atoms; i++) {	 
-	 const struct brw_tracked_state *atom = brw->state.atoms[i];
+      for (i = 0; i < Elements(atoms); i++) {	 
+	 const struct brw_tracked_state *atom = atoms[i];
 	 struct brw_state_flags generated;
 
 	 assert(atom->dirty.mesa ||
@@ -397,7 +385,7 @@ void brw_upload_state(struct brw_context *brw)
    }
    else {
       for (i = 0; i < Elements(atoms); i++) {	 
-	 const struct brw_tracked_state *atom = brw->state.atoms[i];
+	 const struct brw_tracked_state *atom = atoms[i];
 
 	 if (brw->intel.Fallback)
 	    break;
diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
index 4e577d0f6a..66d4127271 100644
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -33,6 +33,14 @@
 #ifndef BRW_STRUCTS_H
 #define BRW_STRUCTS_H
 
+
+/** Number of general purpose registers (VS, WM, etc) */
+#define BRW_MAX_GRF 128
+
+/** Number of message register file registers */
+#define BRW_MAX_MRF 16
+
+
 /* Command packets:
  */
 struct header 
@@ -434,12 +442,12 @@ struct brw_urb_fence
    {
       GLuint sf_fence:10;  
       GLuint vf_fence:10;  
-      GLuint cs_fence:10;  
-      GLuint pad:2;
+      GLuint cs_fence:11;  
+      GLuint pad:1;
    } bits1;
 };
 
-struct brw_constant_buffer_state /* previously brw_command_streamer */
+struct brw_cs_urb_state
 {
    struct header header;
 
@@ -815,7 +823,9 @@ struct brw_gs_unit_state
 
    struct
    {
-      GLuint pad0:10;
+      GLuint pad0:8;
+      GLuint rendering_enable:1; /* for IGDNG */
+      GLuint pad4:1;
       GLuint stats_enable:1; 
       GLuint nr_urb_entries:7; 
       GLuint pad1:1;
@@ -923,6 +933,28 @@ struct brw_wm_unit_state
    
    GLfloat global_depth_offset_constant;  
    GLfloat global_depth_offset_scale;   
+   
+   /* for IGDNG only */
+   struct {
+      GLuint pad0:1;
+      GLuint grf_reg_count_1:3; 
+      GLuint pad1:2;
+      GLuint kernel_start_pointer_1:26;
+   } wm8;       
+
+   struct {
+      GLuint pad0:1;
+      GLuint grf_reg_count_2:3; 
+      GLuint pad1:2;
+      GLuint kernel_start_pointer_2:26;
+   } wm9;       
+
+   struct {
+      GLuint pad0:1;
+      GLuint grf_reg_count_3:3; 
+      GLuint pad1:2;
+      GLuint kernel_start_pointer_3:26;
+   } wm10;       
 };
 
 struct brw_sampler_default_color {
@@ -1031,10 +1063,10 @@ struct brw_surface_state
       GLuint writedisable_green:1; 
       GLuint writedisable_red:1; 
       GLuint writedisable_alpha:1; 
-      GLuint surface_format:9; 
+      GLuint surface_format:9;     /**< BRW_SURFACEFORMAT_x */
       GLuint data_return_format:1; 
       GLuint pad0:1;
-      GLuint surface_type:3; 
+      GLuint surface_type:3;       /**< BRW_SURFACE_1D/2D/3D/CUBE */
    } ss0;
    
    struct {
@@ -1075,7 +1107,7 @@ struct brw_surface_state
       GLuint y_offset:4;
       GLuint pad0:1;
       GLuint x_offset:7;
-   } ss5;   /* NEW in Integrated Graphics Device */
+   } ss5;   /* New in G4X */
 
 };
 
@@ -1168,7 +1200,7 @@ struct brw_instruction
       GLuint predicate_control:4;
       GLuint predicate_inverse:1;
       GLuint execution_size:3;
-      GLuint destreg__conditonalmod:4; /* destreg - send, conditionalmod - others */
+      GLuint destreg__conditionalmod:4; /* destreg - send, conditionalmod - others */
       GLuint pad0:2;
       GLuint debug_control:1;
       GLuint saturate:1;
@@ -1196,7 +1228,9 @@ struct brw_instruction
 	 GLuint dest_reg_type:3;
 	 GLuint src0_reg_file:2;
 	 GLuint src0_reg_type:3;
-	 GLuint pad:6;
+	 GLuint src1_reg_file:2;        /* 0x00000c00 */
+	 GLuint src1_reg_type:3;        /* 0x00007000 */
+	 GLuint pad:1;
 	 GLint dest_indirect_offset:10;	/* offset against the deref'd address reg */
 	 GLuint dest_subreg_nr:3; /* subnr for the address reg a0.x */
 	 GLuint dest_horiz_stride:2;
@@ -1211,7 +1245,7 @@ struct brw_instruction
 	 GLuint src0_reg_type:3;
 	 GLuint src1_reg_file:2;
 	 GLuint src1_reg_type:3;
-	 GLuint pad0:1;
+	 GLuint pad:1;
 	 GLuint dest_writemask:4;
 	 GLuint dest_subreg_nr:1;
 	 GLuint dest_reg_nr:8;
@@ -1298,6 +1332,14 @@ struct brw_instruction
 	 GLuint pad1:6;
       } ia16;
 
+       struct 
+       {
+           GLuint pad:26;
+           GLuint end_of_thread:1;
+           GLuint pad1:1;
+           GLuint sfid:4;
+       } send_igdng;  /* for IGDNG only */
+
    } bits2;
 
    union
@@ -1308,7 +1350,7 @@ struct brw_instruction
 	 GLuint src1_reg_nr:8;
 	 GLuint src1_abs:1;
 	 GLuint src1_negate:1;
-	 GLuint pad:1;
+	 GLuint src1_address_mode:1;
 	 GLuint src1_horiz_stride:2;
 	 GLuint src1_width:3;
 	 GLuint src1_vert_stride:4;
@@ -1323,7 +1365,7 @@ struct brw_instruction
 	 GLuint src1_reg_nr:8;
 	 GLuint src1_abs:1;
 	 GLuint src1_negate:1;
-	 GLuint pad0:1;
+	 GLuint src1_address_mode:1;
 	 GLuint src1_swz_z:2;
 	 GLuint src1_swz_w:2;
 	 GLuint pad1:1;
@@ -1337,7 +1379,7 @@ struct brw_instruction
 	 GLuint src1_subreg_nr:3;
 	 GLuint src1_abs:1;
 	 GLuint src1_negate:1;
-	 GLuint pad0:1;
+	 GLuint src1_address_mode:1;
 	 GLuint src1_horiz_stride:2;
 	 GLuint src1_width:3;
 	 GLuint src1_vert_stride:4;
@@ -1385,6 +1427,21 @@ struct brw_instruction
       } math;
 
       struct {
+	 GLuint function:4;
+	 GLuint int_type:1;
+	 GLuint precision:1;
+	 GLuint saturate:1;
+	 GLuint data_type:1;
+	 GLuint snapshot:1;
+	 GLuint pad0:10;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } math_igdng;
+
+      struct {
 	 GLuint binding_table_index:8;
 	 GLuint sampler:4;
 	 GLuint return_format:2; 
@@ -1407,9 +1464,38 @@ struct brw_instruction
          GLuint end_of_thread:1;
       } sampler_g4x;
 
+      struct {
+	 GLuint binding_table_index:8;
+	 GLuint sampler:4;
+	 GLuint msg_type:4;
+	 GLuint simd_mode:2;
+	 GLuint pad0:1;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } sampler_igdng;
+
       struct brw_urb_immediate urb;
 
       struct {
+	 GLuint opcode:4;
+	 GLuint offset:6;
+	 GLuint swizzle_control:2; 
+	 GLuint pad:1;
+	 GLuint allocate:1;
+	 GLuint used:1;
+	 GLuint complete:1;
+	 GLuint pad0:3;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } urb_igdng;
+
+      struct {
 	 GLuint binding_table_index:8;
 	 GLuint msg_control:4;  
 	 GLuint msg_type:2;  
@@ -1423,6 +1509,19 @@ struct brw_instruction
 
       struct {
 	 GLuint binding_table_index:8;
+	 GLuint msg_control:3;  
+	 GLuint msg_type:3;  
+	 GLuint target_cache:2;    
+	 GLuint pad0:3;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } dp_read_igdng;
+
+      struct {
+	 GLuint binding_table_index:8;
 	 GLuint msg_control:3;
 	 GLuint pixel_scoreboard_clear:1;
 	 GLuint msg_type:3;    
@@ -1435,6 +1534,20 @@ struct brw_instruction
       } dp_write;
 
       struct {
+	 GLuint binding_table_index:8;
+	 GLuint msg_control:3;
+	 GLuint pixel_scoreboard_clear:1;
+	 GLuint msg_type:3;    
+	 GLuint send_commit_msg:1;
+	 GLuint pad0:3;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } dp_write_igdng;
+
+      struct {
 	 GLuint pad:16;
 	 GLuint response_length:4;
 	 GLuint msg_length:4;
@@ -1443,8 +1556,18 @@ struct brw_instruction
 	 GLuint end_of_thread:1;
       } generic;
 
+      struct {
+	 GLuint pad:19;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } generic_igdng;
+
       GLint d;
       GLuint ud;
+      float f;
    } bits3;
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_tex.c b/src/mesa/drivers/dri/i965/brw_tex.c
index 0bb6f176a0..71bff166dd 100644
--- a/src/mesa/drivers/dri/i965/brw_tex.c
+++ b/src/mesa/drivers/dri/i965/brw_tex.c
@@ -32,21 +32,12 @@
 
 #include "main/glheader.h"
 #include "main/mtypes.h"
-#include "main/imports.h"
-#include "main/simple_list.h"
-#include "main/enums.h"
-#include "main/image.h"
 #include "main/teximage.h"
-#include "main/texstore.h"
-#include "main/texformat.h"
-
-#include "texmem.h"
 
 #include "intel_context.h"
 #include "intel_regions.h"
 #include "intel_tex.h"
 #include "brw_context.h"
-#include "brw_defines.h"
 
 
 void brw_FrameBufferTexInit( struct brw_context *brw,
@@ -86,11 +77,12 @@ void brw_FrameBufferTexDestroy( struct brw_context *brw )
  */
 void brw_validate_textures( struct brw_context *brw )
 {
+   GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = &brw->intel;
    int i;
 
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
-      struct gl_texture_unit *texUnit = &brw->attribs.Texture->Unit[i];
+      struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
 
       if (texUnit->_ReallyEnabled) {
 	 intel_finalize_mipmap_tree(intel, i);
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 51a617fcb4..5986cbffad 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -28,7 +28,6 @@
   * Authors:
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
-        
 
 /* Code to layout images in a mipmap tree for i965.
  */
@@ -37,17 +36,93 @@
 #include "intel_tex_layout.h"
 #include "intel_context.h"
 #include "main/macros.h"
+#include "intel_chipset.h"
 
 #define FILE_DEBUG_FLAG DEBUG_MIPTREE
 
-GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_tree *mt )
+GLboolean brw_miptree_layout(struct intel_context *intel,
+			     struct intel_mipmap_tree *mt,
+			     uint32_t tiling)
 {
-   /* XXX: these vary depending on image format: 
-    */
-/*    GLint align_w = 4; */
+   /* XXX: these vary depending on image format: */
+   /* GLint align_w = 4; */
 
    switch (mt->target) {
-   case GL_TEXTURE_CUBE_MAP: 
+   case GL_TEXTURE_CUBE_MAP:
+      if (IS_IGDNG(intel->intelScreen->deviceID)) {
+          GLuint align_h = 2, align_w = 4;
+          GLuint level;
+          GLuint x = 0;
+          GLuint y = 0;
+          GLuint width = mt->width0;
+          GLuint height = mt->height0;
+          GLuint qpitch = 0;
+          GLuint y_pitch = 0;
+
+          mt->pitch = mt->width0;
+          intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
+          y_pitch = ALIGN(height, align_h);
+
+          if (mt->compressed) {
+              mt->pitch = ALIGN(mt->width0, align_w);
+          }
+
+          if (mt->first_level != mt->last_level) {
+              GLuint mip1_width;
+
+              if (mt->compressed) {
+                  mip1_width = ALIGN(minify(mt->width0), align_w)
+                      + ALIGN(minify(minify(mt->width0)), align_w);
+              } else {
+                  mip1_width = ALIGN(minify(mt->width0), align_w)
+                      + minify(minify(mt->width0));
+              }
+
+              if (mip1_width > mt->pitch) {
+                  mt->pitch = mip1_width;
+              }
+          }
+
+          mt->pitch = intel_miptree_pitch_align(intel, mt, tiling, mt->pitch);
+
+          if (mt->compressed) {
+              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * mt->pitch * mt->cpp;
+              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * 6;
+          } else {
+              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * mt->pitch * mt->cpp;
+              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * 6;
+          }
+
+          for (level = mt->first_level; level <= mt->last_level; level++) {
+              GLuint img_height;
+              GLuint nr_images = 6;
+              GLuint q = 0;
+
+              intel_miptree_set_level_info(mt, level, nr_images, x, y, width, 
+                                           height, 1);
+
+              for (q = 0; q < nr_images; q++)
+                  intel_miptree_set_image_offset_ex(mt, level, q, x, y, q * qpitch);
+
+              if (mt->compressed)
+                  img_height = MAX2(1, height/4);
+              else
+                  img_height = ALIGN(height, align_h);
+
+              if (level == mt->first_level + 1) {
+                  x += ALIGN(width, align_w);
+              }
+              else {
+                  y += img_height;
+              }
+
+              width  = minify(width);
+              height = minify(height);
+          }
+
+          break;
+      }
+
    case GL_TEXTURE_3D: {
       GLuint width  = mt->width0;
       GLuint height = mt->height0;
@@ -59,25 +134,25 @@ GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_t
       GLuint align_w = 4;
 
       mt->total_height = 0;
-      
+      intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
+
       if (mt->compressed) {
-          align_w = intel_compressed_alignment(mt->internal_format);
           mt->pitch = ALIGN(width, align_w);
           pack_y_pitch = (height + 3) / 4;
       } else {
-          mt->pitch = intel_miptree_pitch_align (intel, mt, mt->width0);
-          pack_y_pitch = ALIGN(mt->height0, align_h);
+	 mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
+	 pack_y_pitch = ALIGN(mt->height0, align_h);
       }
 
-      pack_x_pitch = mt->pitch;
+      pack_x_pitch = width;
       pack_x_nr = 1;
 
-      for ( level = mt->first_level ; level <= mt->last_level ; level++ ) {
+      for (level = mt->first_level ; level <= mt->last_level ; level++) {
 	 GLuint nr_images = mt->target == GL_TEXTURE_3D ? depth : 6;
 	 GLint x = 0;
 	 GLint y = 0;
 	 GLint q, j;
-	    
+
 	 intel_miptree_set_level_info(mt, level, nr_images,
 				      0, mt->total_height,
 				      width, height, depth);
@@ -89,7 +164,7 @@ GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_t
 	    }
 
 	    x = 0;
-	    y += pack_y_pitch;	    
+	    y += pack_y_pitch;
 	 }
 
 
@@ -98,40 +173,50 @@ GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_t
 	 height = minify(height);
 	 depth  = minify(depth);
 
-    if (mt->compressed) {
-        pack_y_pitch = (height + 3) / 4;
-        
-        if (pack_x_pitch > ALIGN(width, align_w)) {
-            pack_x_pitch = ALIGN(width, align_w);
-            pack_x_nr <<= 1;
-        }
-    } else {
-        if (pack_x_pitch > 4) {
-            pack_x_pitch >>= 1;
-            pack_x_nr <<= 1;
-            assert(pack_x_pitch * pack_x_nr <= mt->pitch);
-        }
-
-        if (pack_y_pitch > 2) {
-            pack_y_pitch >>= 1;
-            pack_y_pitch = ALIGN(pack_y_pitch, align_h);
-        }
-    }
+	 if (mt->compressed) {
+	    pack_y_pitch = (height + 3) / 4;
+
+	    if (pack_x_pitch > ALIGN(width, align_w)) {
+	       pack_x_pitch = ALIGN(width, align_w);
+	       pack_x_nr <<= 1;
+	    }
+	 } else {
+	    if (pack_x_pitch > 4) {
+	       pack_x_pitch >>= 1;
+	       pack_x_nr <<= 1;
+	       assert(pack_x_pitch * pack_x_nr <= mt->pitch);
+	    }
+
+	    if (pack_y_pitch > 2) {
+	       pack_y_pitch >>= 1;
+	       pack_y_pitch = ALIGN(pack_y_pitch, align_h);
+	    }
+	 }
 
       }
+      /* The 965's sampler lays cachelines out according to how accesses
+       * in the texture surfaces run, so they may be "vertical" through
+       * memory.  As a result, the docs say in Surface Padding Requirements:
+       * Sampling Engine Surfaces that two extra rows of padding are required.
+       * We don't know of similar requirements for pre-965, but given that
+       * those docs are silent on padding requirements in general, let's play
+       * it safe.
+       */
+      if (mt->target == GL_TEXTURE_CUBE_MAP)
+	 mt->total_height += 2;
       break;
    }
 
    default:
-      i945_miptree_layout_2d(intel, mt);
+      i945_miptree_layout_2d(intel, mt, tiling);
       break;
    }
-   DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__, 
-		mt->pitch, 
+   DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__,
+		mt->pitch,
 		mt->total_height,
 		mt->cpp,
 		mt->pitch * mt->total_height * mt->cpp );
-		
+
    return GL_TRUE;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index 7673dd36eb..8c6f4355a6 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -143,7 +143,29 @@ static void recalculate_urb_fence( struct brw_context *brw )
       brw->urb.nr_clip_entries = limits[CLP].preferred_nr_entries;
       brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;	
       brw->urb.nr_cs_entries = limits[CS].preferred_nr_entries;	
-      
+
+      brw->urb.constrained = 0;
+
+      if (BRW_IS_IGDNG(brw)) {
+         brw->urb.nr_vs_entries = 128;
+         brw->urb.nr_sf_entries = 48;
+         if (check_urb_layout(brw)) {
+            goto done;
+         } else {
+            brw->urb.constrained = 1;
+            brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
+            brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;
+         }
+      } else if (BRW_IS_G4X(brw)) {
+	 brw->urb.nr_vs_entries = 64;
+	 if (check_urb_layout(brw)) {
+	    goto done;
+	 } else {
+	    brw->urb.constrained = 1;
+	    brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
+	 }
+      }
+
       if (!check_urb_layout(brw)) {
 	 brw->urb.nr_vs_entries = limits[VS].min_nr_entries;	
 	 brw->urb.nr_gs_entries = limits[GS].min_nr_entries;	
@@ -169,9 +191,8 @@ static void recalculate_urb_fence( struct brw_context *brw )
 	 if (INTEL_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS))
 	    _mesa_printf("URB CONSTRAINED\n");
       }
-      else 
-	 brw->urb.constrained = 0;
 
+done:
       if (INTEL_DEBUG & DEBUG_URB)
 	 _mesa_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
 		      brw->urb.vs_start,
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 1db7ceebcf..e3111c6680 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -85,6 +85,7 @@ static void do_vs_prog( struct brw_context *brw,
 
 static void brw_upload_vs_prog(struct brw_context *brw)
 {
+   GLcontext *ctx = &brw->intel.ctx;
    struct brw_vs_prog_key key;
    struct brw_vertex_program *vp = 
       (struct brw_vertex_program *)brw->vertex_program;
@@ -97,14 +98,9 @@ static void brw_upload_vs_prog(struct brw_context *brw)
     * the inputs it asks for, whether they are varying or not.
     */
    key.program_string_id = vp->id;
-   key.nr_userclip = brw_count_bits(brw->attribs.Transform->ClipPlanesEnabled);
-   key.copy_edgeflag = (brw->attribs.Polygon->FrontMode != GL_FILL ||
-			brw->attribs.Polygon->BackMode != GL_FILL);
-
-   /* BRW_NEW_METAOPS
-    */
-   if (brw->metaops.active)
-      key.know_w_is_one = 1;
+   key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
+   key.copy_edgeflag = (ctx->Polygon.FrontMode != GL_FILL ||
+			ctx->Polygon.BackMode != GL_FILL);
 
    /* Make an early check for the key.
     */
@@ -123,7 +119,7 @@ static void brw_upload_vs_prog(struct brw_context *brw)
 const struct brw_tracked_state brw_vs_prog = {
    .dirty = {
       .mesa  = _NEW_TRANSFORM | _NEW_POLYGON,
-      .brw   = BRW_NEW_VERTEX_PROGRAM | BRW_NEW_METAOPS,
+      .brw   = BRW_NEW_VERTEX_PROGRAM,
       .cache = 0
    },
    .prepare = brw_upload_vs_prog
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 22388ec99d..4a591365c9 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -43,7 +43,6 @@ struct brw_vs_prog_key {
    GLuint program_string_id;
    GLuint nr_userclip:4;
    GLuint copy_edgeflag:1;
-   GLuint know_w_is_one:1;
    GLuint pad:26;
 };
 
@@ -59,6 +58,7 @@ struct brw_vs_compile {
 
    GLuint first_output;
    GLuint nr_outputs;
+   GLuint first_overflow_output; /**< VERT_ATTRIB_x */
 
    GLuint first_tmp;
    GLuint last_tmp;
@@ -76,6 +76,11 @@ struct brw_vs_compile {
 
    struct brw_reg userplane[6];
 
+   /** we may need up to 3 constants per instruction (if use_const_buffer) */
+   struct {
+      GLint index;
+      struct brw_reg reg;
+   } current_const[3];
 };
 
 void brw_vs_emit( struct brw_vs_compile *c );
diff --git a/src/mesa/drivers/dri/i965/brw_vs_constval.c b/src/mesa/drivers/dri/i965/brw_vs_constval.c
index 6fbac02de6..249a800bf4 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_constval.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_constval.c
@@ -39,8 +39,8 @@
  */
 struct tracker {
    GLboolean twoside;
-   GLubyte active[PROGRAM_OUTPUT+1][128];
-   GLuint size_masks[4];
+   GLubyte active[PROGRAM_OUTPUT+1][MAX_PROGRAM_TEMPS];
+   GLbitfield size_masks[4];  /**< one bit per fragment program input attrib */
 };
 
 
@@ -53,8 +53,10 @@ static void set_active_component( struct tracker *t,
    case PROGRAM_TEMPORARY:
    case PROGRAM_INPUT:
    case PROGRAM_OUTPUT:
+      assert(file < PROGRAM_OUTPUT + 1);
+      assert(index < Elements(t->active[0]));
       t->active[file][index] |= active;
-
+      break;
    default:
       break;
    }
@@ -96,7 +98,7 @@ static GLubyte get_active( struct tracker *t,
 			   struct prog_src_register src )
 {
    GLuint i;
-   GLubyte active = src.NegateBase; /* NOTE! */
+   GLubyte active = src.Negate; /* NOTE! */
 
    if (src.RelAddr)
       return 0xf;
@@ -108,10 +110,15 @@ static GLubyte get_active( struct tracker *t,
    return active;
 }
 
+/**
+ * Return the size (1,2,3 or 4) of the output/result for VERT_RESULT_idx.
+ */
 static GLubyte get_output_size( struct tracker *t,
 				GLuint idx )
 {
-   GLubyte active = t->active[PROGRAM_OUTPUT][idx];
+   GLubyte active;
+   assert(idx < VERT_RESULT_MAX);
+   active = t->active[PROGRAM_OUTPUT][idx];
    if (active & (1<<3)) return 4;
    if (active & (1<<2)) return 3;
    if (active & (1<<1)) return 2;
@@ -123,7 +130,7 @@ static GLubyte get_output_size( struct tracker *t,
  */
 static void calc_sizes( struct tracker *t )
 {
-   GLuint i;
+   GLint vertRes;
 
    if (t->twoside) {
       t->active[PROGRAM_OUTPUT][VERT_RESULT_COL0] |= 
@@ -133,12 +140,27 @@ static void calc_sizes( struct tracker *t )
 	 t->active[PROGRAM_OUTPUT][VERT_RESULT_BFC1];
    }
 
-   for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
-      switch (get_output_size(t, i)) {
-      case 4: t->size_masks[4-1] |= 1<<i;
-      case 3: t->size_masks[3-1] |= 1<<i;
-      case 2: t->size_masks[2-1] |= 1<<i;
-      case 1: t->size_masks[1-1] |= 1<<i;
+   /* Examine vertex program output sizes to set the size_masks[] info
+    * which describes the fragment program input sizes.
+    */
+   for (vertRes = VERT_RESULT_TEX0; vertRes < VERT_RESULT_MAX; vertRes++) {
+      GLint fragAttrib;
+
+      /* map vertex program output index to fragment program input index */
+      if (vertRes <= VERT_RESULT_TEX7)
+         fragAttrib = FRAG_ATTRIB_TEX0 + vertRes - VERT_RESULT_TEX0;
+      else if (vertRes >= VERT_RESULT_VAR0)
+         fragAttrib = FRAG_ATTRIB_VAR0 + vertRes - VERT_RESULT_VAR0;
+      else
+         continue;
+      assert(fragAttrib >= FRAG_ATTRIB_TEX0);
+      assert(fragAttrib <= FRAG_ATTRIB_MAX);
+
+      switch (get_output_size(t, vertRes)) {
+      case 4: t->size_masks[4-1] |= 1 << fragAttrib;
+      case 3: t->size_masks[3-1] |= 1 << fragAttrib;
+      case 2: t->size_masks[2-1] |= 1 << fragAttrib;
+      case 1: t->size_masks[1-1] |= 1 << fragAttrib;
 	 break;
       }
    }
@@ -168,9 +190,10 @@ static GLuint get_input_size(struct brw_context *brw,
  */
 static void calc_wm_input_sizes( struct brw_context *brw )
 {
+   GLcontext *ctx = &brw->intel.ctx;
    /* BRW_NEW_VERTEX_PROGRAM */
-   struct brw_vertex_program *vp = 
-      (struct brw_vertex_program *)brw->vertex_program;
+   const struct brw_vertex_program *vp =
+      brw_vertex_program_const(brw->vertex_program);
    /* BRW_NEW_INPUT_DIMENSIONS */
    struct tracker t;
    GLuint insn;
@@ -179,7 +202,7 @@ static void calc_wm_input_sizes( struct brw_context *brw )
    memset(&t, 0, sizeof(t));
 
    /* _NEW_LIGHT */
-   if (brw->attribs.Light->Model.TwoSide)
+   if (ctx->Light.Model.TwoSide)
       t.twoside = 1;
 
    for (i = 0; i < VERT_ATTRIB_MAX; i++) 
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index 174331a765..1638ef8111 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -38,18 +38,55 @@
 #include "brw_vs.h"
 
 
+static struct brw_reg get_tmp( struct brw_vs_compile *c )
+{
+   struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
+
+   if (++c->last_tmp > c->prog_data.total_grf)
+      c->prog_data.total_grf = c->last_tmp;
+
+   return tmp;
+}
+
+static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
+{
+   if (tmp.nr == c->last_tmp-1)
+      c->last_tmp--;
+}
+			       
+static void release_tmps( struct brw_vs_compile *c )
+{
+   c->last_tmp = c->first_tmp;
+}
+
 
-/* Do things as simply as possible.  Allocate and populate all regs
+/**
+ * Preallocate GRF register before code emit.
+ * Do things as simply as possible.  Allocate and populate all regs
  * ahead of time.
  */
 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 {
    GLuint i, reg = 0, mrf;
-   GLuint nr_params;
+   int attributes_in_vue;
+
+   /* Determine whether to use a real constant buffer or use a block
+    * of GRF registers for constants.  The latter is faster but only
+    * works if everything fits in the GRF.
+    * XXX this heuristic/check may need some fine tuning...
+    */
+   if (c->vp->program.Base.Parameters->NumParameters +
+       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
+      c->vp->use_const_buffer = GL_TRUE;
+   else
+      c->vp->use_const_buffer = GL_FALSE;
+
+   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
 
    /* r0 -- reserved as usual
     */
-   c->r0 = brw_vec8_grf(reg, 0); reg++;
+   c->r0 = brw_vec8_grf(reg, 0);
+   reg++;
 
    /* User clip planes from curbe: 
     */
@@ -60,39 +97,59 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 
       /* Deal with curbe alignment:
        */
-      reg += ((6+c->key.nr_userclip+3)/4)*2;
+      reg += ((6 + c->key.nr_userclip + 3) / 4) * 2;
    }
 
    /* Vertex program parameters from curbe:
     */
-   nr_params = c->vp->program.Base.Parameters->NumParameters;
-   for (i = 0; i < nr_params; i++) {
-      c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
-   }     
-   reg += (nr_params+1)/2;
+   if (c->vp->use_const_buffer) {
+      /* get constants from a real constant buffer */
+      c->prog_data.curb_read_length = 0;
+      c->prog_data.nr_params = 4; /* XXX 0 causes a bug elsewhere... */
+   }
+   else {
+      /* use a section of the GRF for constants */
+      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
+      for (i = 0; i < nr_params; i++) {
+         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
+      }
+      reg += (nr_params + 1) / 2;
+      c->prog_data.curb_read_length = reg - 1;
 
-   c->prog_data.curb_read_length = reg - 1;
+      c->prog_data.nr_params = nr_params * 4;
+   }
 
    /* Allocate input regs:  
     */
    c->nr_inputs = 0;
    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
-      if (c->prog_data.inputs_read & (1<<i)) {
+      if (c->prog_data.inputs_read & (1 << i)) {
 	 c->nr_inputs++;
 	 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
 	 reg++;
       }
    }
+   /* If there are no inputs, we'll still be reading one attribute's worth
+    * because it's required -- see urb_read_length setting.
+    */
+   if (c->nr_inputs == 0)
+      reg++;
 
-   /* Allocate outputs: TODO: could organize the non-position outputs
-    * to go straight into message regs.
+   /* Allocate outputs.  The non-position outputs go straight into message regs.
     */
    c->nr_outputs = 0;
    c->first_output = reg;
-   mrf = 4;
+   c->first_overflow_output = 0;
+
+   if (BRW_IS_IGDNG(c->func.brw))
+       mrf = 8;
+   else
+       mrf = 4;
+
    for (i = 0; i < VERT_RESULT_MAX; i++) {
-      if (c->prog_data.outputs_written & (1<<i)) {
+      if (c->prog_data.outputs_written & (1 << i)) {
 	 c->nr_outputs++;
+         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
 	 if (i == VERT_RESULT_HPOS) {
 	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 	    reg++;
@@ -103,8 +160,17 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
 	 }
 	 else {
-	    c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
-	    mrf++;
+            if (mrf < 16) {
+               c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
+               mrf++;
+            }
+            else {
+               /* too many vertex results to fit in MRF, use GRF for overflow */
+               if (!c->first_overflow_output)
+                  c->first_overflow_output = i;
+               c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
+               reg++;
+            }
 	 }
       }
    }     
@@ -132,17 +198,24 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
       reg++;
    }
 
+   if (c->vp->use_const_buffer) {
+      for (i = 0; i < 3; i++) {
+         c->current_const[i].index = -1;
+         c->current_const[i].reg = brw_vec8_grf(reg, 0);
+         reg++;
+      }
+   }
+
    for (i = 0; i < 128; i++) {
-       if (c->output_regs[i].used_in_src) {
-            c->output_regs[i].reg = brw_vec8_grf(reg, 0);
-            reg++;
-        }
+      if (c->output_regs[i].used_in_src) {
+         c->output_regs[i].reg = brw_vec8_grf(reg, 0);
+         reg++;
+      }
    }
 
    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
    reg += 2;
- 
-   
+
    /* Some opcodes need an internal temporary:
     */
    c->first_tmp = reg;
@@ -152,35 +225,38 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     * urb_read_length is the number of registers read from *each*
     * vertex urb, so is half the amount:
     */
-   c->prog_data.urb_read_length = (c->nr_inputs+1)/2;
-
-   c->prog_data.urb_entry_size = (c->nr_outputs+2+3)/4;
-   c->prog_data.total_grf = reg;
-}
-
+   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
+   /* Setting this field to 0 leads to undefined behavior according to
+    * the VS_STATE docs.  Our VUEs will always have at least one attribute
+    * sitting in them, even if it's padding.
+    */
+   if (c->prog_data.urb_read_length == 0)
+      c->prog_data.urb_read_length = 1;
 
-static struct brw_reg get_tmp( struct brw_vs_compile *c )
-{
-   struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
+   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
+    * them to fit the biggest thing they need to.
+    */
+   attributes_in_vue = MAX2(c->nr_outputs, c->nr_inputs);
 
-   if (++c->last_tmp > c->prog_data.total_grf)
-      c->prog_data.total_grf = c->last_tmp;
+   if (BRW_IS_IGDNG(c->func.brw))
+       c->prog_data.urb_entry_size = (attributes_in_vue + 6 + 3) / 4;
+   else
+       c->prog_data.urb_entry_size = (attributes_in_vue + 2 + 3) / 4;
 
-   return tmp;
-}
+   c->prog_data.total_grf = reg;
 
-static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
-{
-   if (tmp.nr == c->last_tmp-1)
-      c->last_tmp--;
-}
-			       
-static void release_tmps( struct brw_vs_compile *c )
-{
-   c->last_tmp = c->first_tmp;
+   if (INTEL_DEBUG & DEBUG_VS) {
+      _mesa_printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
+      _mesa_printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
+      _mesa_printf("%s reg = %d\n", __FUNCTION__, reg);
+   }
 }
 
 
+/**
+ * If an instruction uses a temp reg both as a src and the dest, we
+ * sometimes need to allocate an intermediate temporary.
+ */
 static void unalias1( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
@@ -200,6 +276,10 @@ static void unalias1( struct brw_vs_compile *c,
    }
 }
 
+/**
+ * \sa unalias1
+ * Checks if a 2-operand instruction needs an intermediate temporary.
+ */
 static void unalias2( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
@@ -222,6 +302,10 @@ static void unalias2( struct brw_vs_compile *c,
    }
 }
 
+/**
+ * \sa unalias2
+ * Checks if a 3-operand instruction needs an intermediate temporary.
+ */
 static void unalias3( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0,
@@ -615,6 +699,8 @@ static void emit_lit_noalias( struct brw_vs_compile *c,
    }
 
    brw_ENDIF(p, if_insn);
+
+   release_tmp(c, tmp);
 }
 
 static void emit_lrp_noalias(struct brw_vs_compile *c,
@@ -655,13 +741,84 @@ static void emit_nrm( struct brw_vs_compile *c,
 }
 
 
+static struct brw_reg
+get_constant(struct brw_vs_compile *c,
+             const struct prog_instruction *inst,
+             GLuint argIndex)
+{
+   const struct prog_src_register *src = &inst->SrcReg[argIndex];
+   struct brw_compile *p = &c->func;
+   struct brw_reg const_reg;
+   struct brw_reg const2_reg;
+   const GLboolean relAddr = src->RelAddr;
+
+   assert(argIndex < 3);
+
+   if (c->current_const[argIndex].index != src->Index || relAddr) {
+      struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
+
+      c->current_const[argIndex].index = src->Index;
+
+#if 0
+      printf("  fetch const[%d] for arg %d into reg %d\n",
+             src->Index, argIndex, c->current_const[argIndex].reg.nr);
+#endif
+      /* need to fetch the constant now */
+      brw_dp_READ_4_vs(p,
+                       c->current_const[argIndex].reg,/* writeback dest */
+                       0,                             /* oword */
+                       relAddr,                       /* relative indexing? */
+                       addrReg,                       /* address register */
+                       16 * src->Index,               /* byte offset */
+                       SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
+                       );
+
+      if (relAddr) {
+         /* second read */
+         const2_reg = get_tmp(c);
+
+         /* use upper half of address reg for second read */
+         addrReg = stride(addrReg, 0, 4, 0);
+         addrReg.subnr = 16;
+
+         brw_dp_READ_4_vs(p,
+                          const2_reg,              /* writeback dest */
+                          1,                       /* oword */
+                          relAddr,                 /* relative indexing? */
+                          addrReg,                 /* address register */
+                          16 * src->Index,         /* byte offset */
+                          SURF_INDEX_VERT_CONST_BUFFER
+                          );
+      }
+   }
+
+   const_reg = c->current_const[argIndex].reg;
+
+   if (relAddr) {
+      /* merge the two Owords into the constant register */
+      /* const_reg[7..4] = const2_reg[7..4] */
+      brw_MOV(p,
+              suboffset(stride(const_reg, 0, 4, 1), 4),
+              suboffset(stride(const2_reg, 0, 4, 1), 4));
+      release_tmp(c, const2_reg);
+   }
+   else {
+      /* replicate lower four floats into upper half (to get XYZWXYZW) */
+      const_reg = stride(const_reg, 0, 4, 0);
+      const_reg.subnr = 0;
+   }
+
+   return const_reg;
+}
+
+
+
 /* TODO: relative addressing!
  */
 static struct brw_reg get_reg( struct brw_vs_compile *c,
-			       GLuint file,
+			       gl_register_file file,
 			       GLuint index )
 {
-
    switch (file) {
    case PROGRAM_TEMPORARY:
    case PROGRAM_INPUT:
@@ -690,13 +847,17 @@ static struct brw_reg get_reg( struct brw_vs_compile *c,
 }
 
 
+/**
+ * Indirect addressing:  get reg[[arg] + offset].
+ */
 static struct brw_reg deref( struct brw_vs_compile *c,
 			     struct brw_reg arg,
 			     GLint offset)
 {
    struct brw_compile *p = &c->func;
    struct brw_reg tmp = vec4(get_tmp(c));
-   struct brw_reg vp_address = retype(vec1(get_reg(c, PROGRAM_ADDRESS, 0)), BRW_REGISTER_TYPE_UW);
+   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
+   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_UW);
    GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * 16;
    struct brw_reg indirect = brw_vec4_indirect(0,0);
 
@@ -717,10 +878,67 @@ static struct brw_reg deref( struct brw_vs_compile *c,
       brw_pop_insn_state(p);
    }
    
+   /* NOTE: tmp not released */
    return vec8(tmp);
 }
 
 
+/**
+ * Get brw reg corresponding to the instruction's [argIndex] src reg.
+ * TODO: relative addressing!
+ */
+static struct brw_reg
+get_src_reg( struct brw_vs_compile *c,
+             const struct prog_instruction *inst,
+             GLuint argIndex )
+{
+   const GLuint file = inst->SrcReg[argIndex].File;
+   const GLint index = inst->SrcReg[argIndex].Index;
+   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;
+
+   switch (file) {
+   case PROGRAM_TEMPORARY:
+   case PROGRAM_INPUT:
+   case PROGRAM_OUTPUT:
+      if (relAddr) {
+         return deref(c, c->regs[file][0], index);
+      }
+      else {
+         assert(c->regs[file][index].nr != 0);
+         return c->regs[file][index];
+      }
+
+   case PROGRAM_STATE_VAR:
+   case PROGRAM_CONSTANT:
+   case PROGRAM_UNIFORM:
+   case PROGRAM_ENV_PARAM:
+      if (c->vp->use_const_buffer) {
+         return get_constant(c, inst, argIndex);
+      }
+      else if (relAddr) {
+         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index);
+      }
+      else {
+         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
+         return c->regs[PROGRAM_STATE_VAR][index];
+      }
+   case PROGRAM_ADDRESS:
+      assert(index == 0);
+      return c->regs[file][index];
+
+   case PROGRAM_UNDEFINED:
+      /* this is a normal case since we loop over all three src args */
+      return brw_null_reg();
+
+   case PROGRAM_LOCAL_PARAM: 
+   case PROGRAM_WRITE_ONLY:
+   default:
+      assert(0);
+      return brw_null_reg();
+   }
+}
+
+
 static void emit_arl( struct brw_vs_compile *c,
 		      struct brw_reg dst,
 		      struct brw_reg arg0 )
@@ -732,30 +950,31 @@ static void emit_arl( struct brw_vs_compile *c,
    if (need_tmp) 
       tmp = get_tmp(c);
 
-   brw_RNDD(p, tmp, arg0);
-   brw_MUL(p, dst, tmp, brw_imm_d(16));
+   brw_RNDD(p, tmp, arg0);               /* tmp = round(arg0) */
+   brw_MUL(p, dst, tmp, brw_imm_d(16));  /* dst = tmp * 16 */
 
    if (need_tmp)
       release_tmp(c, tmp);
 }
 
 
-/* Will return mangled results for SWZ op.  The emit_swz() function
+/**
+ * Return the brw reg for the given instruction's src argument.
+ * Will return mangled results for SWZ op.  The emit_swz() function
  * ignores this result and recalculates taking extended swizzles into
  * account.
  */
 static struct brw_reg get_arg( struct brw_vs_compile *c,
-			       struct prog_src_register *src )
+                               const struct prog_instruction *inst,
+                               GLuint argIndex )
 {
+   const struct prog_src_register *src = &inst->SrcReg[argIndex];
    struct brw_reg reg;
 
    if (src->File == PROGRAM_UNDEFINED)
       return brw_null_reg();
 
-   if (src->RelAddr) 
-      reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src->Index);
-   else
-      reg = get_reg(c, src->File, src->Index);
+   reg = get_src_reg(c, inst, argIndex);
 
    /* Convert 3-bit swizzle to 2-bit.  
     */
@@ -766,16 +985,38 @@ static struct brw_reg get_arg( struct brw_vs_compile *c,
 
    /* Note this is ok for non-swizzle instructions: 
     */
-   reg.negate = src->NegateBase ? 1 : 0;   
+   reg.negate = src->Negate ? 1 : 0;   
 
    return reg;
 }
 
 
+/**
+ * Get brw register for the given program dest register.
+ */
 static struct brw_reg get_dst( struct brw_vs_compile *c,
 			       struct prog_dst_register dst )
 {
-   struct brw_reg reg = get_reg(c, dst.File, dst.Index);
+   struct brw_reg reg;
+
+   switch (dst.File) {
+   case PROGRAM_TEMPORARY:
+   case PROGRAM_OUTPUT:
+      assert(c->regs[dst.File][dst.Index].nr != 0);
+      reg = c->regs[dst.File][dst.Index];
+      break;
+   case PROGRAM_ADDRESS:
+      assert(dst.Index == 0);
+      reg = c->regs[dst.File][dst.Index];
+      break;
+   case PROGRAM_UNDEFINED:
+      /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
+      reg = brw_null_reg();
+      break;
+   default:
+      assert(0);
+      reg = brw_null_reg();
+   }
 
    reg.dw1.bits.writemask = dst.WriteMask;
 
@@ -785,14 +1026,16 @@ static struct brw_reg get_dst( struct brw_vs_compile *c,
 
 static void emit_swz( struct brw_vs_compile *c, 
 		      struct brw_reg dst,
-		      struct prog_src_register src )
+                      const struct prog_instruction *inst)
 {
+   const GLuint argIndex = 0;
+   const struct prog_src_register src = inst->SrcReg[argIndex];
    struct brw_compile *p = &c->func;
    GLuint zeros_mask = 0;
    GLuint ones_mask = 0;
    GLuint src_mask = 0;
    GLubyte src_swz[4];
-   GLboolean need_tmp = (src.NegateBase &&
+   GLboolean need_tmp = (src.Negate &&
 			 dst.file != BRW_GENERAL_REGISTER_FILE);
    struct brw_reg tmp = dst;
    GLuint i;
@@ -826,10 +1069,7 @@ static void emit_swz( struct brw_vs_compile *c,
    if (src_mask) {
       struct brw_reg arg0;
 
-      if (src.RelAddr) 
-	 arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index);
-      else
-	 arg0 = get_reg(c, src.File, src.Index);
+      arg0 = get_src_reg(c, inst, argIndex);
 
       arg0 = brw_swizzle(arg0, 
 			 src_swz[0], src_swz[1], 
@@ -844,8 +1084,8 @@ static void emit_swz( struct brw_vs_compile *c,
    if (ones_mask) 
       brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
 
-   if (src.NegateBase)
-      brw_MOV(p, brw_writemask(tmp, src.NegateBase), negate(tmp));
+   if (src.Negate)
+      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
    
    if (need_tmp) {
       brw_MOV(p, dst, tmp);
@@ -863,6 +1103,8 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    struct brw_reg m0 = brw_message_reg(0);
    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
    struct brw_reg ndc;
+   int eot;
+   GLuint len_vertext_header = 2;
 
    if (c->key.copy_edgeflag) {
       brw_MOV(p, 
@@ -871,21 +1113,17 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    }
 
    /* Build ndc coords */
-   if (!c->key.know_w_is_one) {
-      ndc = get_tmp(c);
-      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
-      brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
-   }
-   else {
-      ndc = pos;
-   }
+   ndc = get_tmp(c);
+   /* ndc = 1.0 / pos.w */
+   emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
+   /* ndc.xyz = pos * ndc */
+   brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
 
    /* Update the header for point size, user clipping flags, and -ve rhw
     * workaround.
     */
    if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
-       c->key.nr_userclip ||
-       (!BRW_IS_G4X(p->brw) && !c->key.know_w_is_one))
+       c->key.nr_userclip || BRW_IS_965(p->brw))
    {
       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
       GLuint i;
@@ -916,7 +1154,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * Later, clipping will detect ucp[6] and ensure the primitive is
        * clipped against all fixed planes.
        */
-      if (!BRW_IS_G4X(p->brw) && !c->key.know_w_is_one) {
+      if (BRW_IS_965(p->brw)) {
 	 brw_CMP(p,
 		 vec8(brw_null_reg()),
 		 BRW_CONDITIONAL_L,
@@ -943,7 +1181,23 @@ static void emit_vertex_write( struct brw_vs_compile *c)
     */
    brw_set_access_mode(p, BRW_ALIGN_1);
    brw_MOV(p, offset(m0, 2), ndc);
-   brw_MOV(p, offset(m0, 3), pos);
+
+   if (BRW_IS_IGDNG(p->brw)) {
+       /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
+       brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
+       /* m4, m5 contain the distances from vertex to the user clip planes.
+        * XXX: they seem to be useless for us.
+        * m6 is used for aligning, so that the remainder of vertex element is 
+        * reg-aligned.
+        */
+       brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
+       len_vertext_header = 6;
+   } else {
+       brw_MOV(p, offset(m0, 3), pos);
+       len_vertext_header = 2;
+   }
+
+   eot = (c->first_overflow_output == 0);
 
    brw_urb_WRITE(p, 
 		 brw_null_reg(), /* dest */
@@ -951,62 +1205,126 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 		 c->r0,		/* src */
 		 0,		/* allocate */
 		 1,		/* used */
-		 c->nr_outputs + 3, /* msg len */
+		 MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
 		 0,		/* response len */
-		 1, 		/* eot */
-		 1, 		/* writes complete */
+		 eot, 		/* eot */
+		 eot, 		/* writes complete */
 		 0, 		/* urb destination offset */
 		 BRW_URB_SWIZZLE_INTERLEAVE);
+
+   if (c->first_overflow_output > 0) {
+      /* Not all of the vertex outputs/results fit into the MRF.
+       * Move the overflowed attributes from the GRF to the MRF and
+       * issue another brw_urb_WRITE().
+       */
+      /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
+       * at mrf[4] atm...
+       */
+      GLuint i, mrf = 0;
+      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
+         if (c->prog_data.outputs_written & (1 << i)) {
+            /* move from GRF to MRF */
+            brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
+            mrf++;
+         }
+      }
+
+      brw_urb_WRITE(p,
+                    brw_null_reg(), /* dest */
+                    4,              /* starting mrf reg nr */
+                    c->r0,          /* src */
+                    0,              /* allocate */
+                    1,              /* used */
+                    mrf+1,          /* msg len */
+                    0,              /* response len */
+                    1,              /* eot */
+                    1,              /* writes complete */
+                    BRW_MAX_MRF-1,  /* urb destination offset */
+                    BRW_URB_SWIZZLE_INTERLEAVE);
+   }
 }
 
 
+/**
+ * Called after code generation to resolve subroutine calls and the
+ * END instruction.
+ * \param end_inst  points to brw code for END instruction
+ * \param last_inst  points to last instruction emitted before vertex write
+ */
 static void 
-post_vs_emit( struct brw_vs_compile *c, struct brw_instruction *end_inst )
+post_vs_emit( struct brw_vs_compile *c,
+              struct brw_instruction *end_inst,
+              struct brw_instruction *last_inst )
 {
-   GLuint nr_insns = c->vp->program.Base.NumInstructions;
-   GLuint insn, target_insn;
-   struct prog_instruction *inst1, *inst2;
-   struct brw_instruction *brw_inst1, *brw_inst2;
-   int offset;
-   for (insn = 0; insn < nr_insns; insn++) {
-       inst1 = &c->vp->program.Base.Instructions[insn];
-       brw_inst1 = inst1->Data;
-       switch (inst1->Opcode) {
-	   case OPCODE_CAL:
-	   case OPCODE_BRA:
-	       target_insn = inst1->BranchTarget;
-	       inst2 = &c->vp->program.Base.Instructions[target_insn];
-	       brw_inst2 = inst2->Data;
-	       offset = brw_inst2 - brw_inst1;
-	       brw_set_src1(brw_inst1, brw_imm_d(offset*16));
-	       break;
-	   case OPCODE_END:
-	       offset = end_inst - brw_inst1;
-	       brw_set_src1(brw_inst1, brw_imm_d(offset*16));
-	       break;
-	   default:
-	       break;
-       }
+   GLint offset;
+
+   brw_resolve_cals(&c->func);
+
+   /* patch up the END code to jump past subroutines, etc */
+   offset = last_inst - end_inst;
+   if (offset > 1) {
+      brw_set_src1(end_inst, brw_imm_d(offset * 16));
+   } else {
+      end_inst->header.opcode = BRW_OPCODE_NOP;
+   }
+}
+
+/**
+ * Map an instruction's condition test (DstReg.CondMask/CondSwizzle) to a
+ * BRW_PREDICATE_* value.  COND_TR yields BRW_PREDICATE_NONE; otherwise only
+ * COND_NE on one replicated channel is handled, which is all GLSL produces.
+ * Anything else trips the assert or is reported via _mesa_problem(), as it
+ * might mean infinite looping or something.
+ */
+static uint32_t
+get_predicate(const struct prog_instruction *inst)
+{
+   if (inst->DstReg.CondMask == COND_TR)
+      return BRW_PREDICATE_NONE;
+
+   /* We'd like to support all the condition codes, but our hardware doesn't
+    * quite match the Mesa IR (modeled after the NV extensions), where any
+    * later instruction may use condition codes set by an earlier one.  On
+    * gen4 an instruction updates the flags register from one condition and
+    * later instructions predicate on that.  Supporting more won't be easy.
+    */
+   assert(inst->DstReg.CondMask == COND_NE);
+
+   switch (inst->DstReg.CondSwizzle) {
+   case SWIZZLE_XXXX:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_X;
+   case SWIZZLE_YYYY:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+   case SWIZZLE_ZZZZ:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+   case SWIZZLE_WWWW:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_W;
+   default:
+      _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
+		    inst->DstReg.CondSwizzle);
+      return BRW_PREDICATE_NORMAL;
    }
 }
 
-/* Emit the fragment program instructions here.
+/* Emit the vertex program instructions here.
  */
 void brw_vs_emit(struct brw_vs_compile *c )
 {
-#define MAX_IFSN 32
+#define MAX_IF_DEPTH 32
+#define MAX_LOOP_DEPTH 32
    struct brw_compile *p = &c->func;
-   GLuint nr_insns = c->vp->program.Base.NumInstructions;
-   GLuint insn, if_insn = 0;
-   struct brw_instruction *end_inst;
-   struct brw_instruction *if_inst[MAX_IFSN];
-   struct brw_indirect stack_index = brw_indirect(0, 0);   
-
+   struct brw_context *brw = p->brw;
+   const GLuint nr_insns = c->vp->program.Base.NumInstructions;
+   GLuint insn, if_depth = 0, loop_depth = 0;
+   GLuint end_offset = 0;
+   struct brw_instruction *end_inst, *last_inst;
+   struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
+   const struct brw_indirect stack_index = brw_indirect(0, 0);   
    GLuint index;
    GLuint file;
 
    if (INTEL_DEBUG & DEBUG_VS) {
-      _mesa_printf("vs-emit:\n");
+      _mesa_printf("vs-mesa:\n");
       _mesa_print_program(&c->vp->program.Base); 
       _mesa_printf("\n");
    }
@@ -1035,22 +1353,26 @@ void brw_vs_emit(struct brw_vs_compile *c )
 
    for (insn = 0; insn < nr_insns; insn++) {
 
-      struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
+      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
       struct brw_reg args[3], dst;
       GLuint i;
       
+#if 0
+      printf("%d: ", insn);
+      _mesa_print_instruction(inst);
+#endif
+
       /* Get argument regs.  SWZ is special and does this itself.
        */
-      inst->Data = &p->store[p->nr_insn];
       if (inst->Opcode != OPCODE_SWZ)
 	  for (i = 0; i < 3; i++) {
-	      struct prog_src_register *src = &inst->SrcReg[i];
+	      const struct prog_src_register *src = &inst->SrcReg[i];
 	      index = src->Index;
 	      file = src->File;	
-	      if (file == PROGRAM_OUTPUT&&c->output_regs[index].used_in_src)
+	      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
 		  args[i] = c->output_regs[index].reg;
 	      else
-		  args[i] = get_arg(c, src);
+                  args[i] = get_arg(c, inst, i);
 	  }
 
       /* Get dest regs.  Note that it is possible for a reg to be both
@@ -1178,7 +1500,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 /* The args[0] value can't be used here as it won't have
 	  * correctly encoded the full swizzle:
 	  */
-	 emit_swz(c, dst, inst->SrcReg[0] );
+	 emit_swz(c, dst, inst);
 	 break;
       case OPCODE_TRUNC:
          /* round toward zero */
@@ -1188,20 +1510,61 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 emit_xpd(p, dst, args[0], args[1]);
 	 break;
       case OPCODE_IF:
-	 assert(if_insn < MAX_IFSN);
-         if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
+	 assert(if_depth < MAX_IF_DEPTH);
+	 if_inst[if_depth] = brw_IF(p, BRW_EXECUTE_8);
+	 /* Note that brw_IF smashes the predicate_control field. */
+	 if_inst[if_depth]->header.predicate_control = get_predicate(inst);
+	 if_depth++;
 	 break;
       case OPCODE_ELSE:
-	 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
+	 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
 	 break;
       case OPCODE_ENDIF:
-         assert(if_insn > 0);
-	 brw_ENDIF(p, if_inst[--if_insn]);
+         assert(if_depth > 0);
+	 brw_ENDIF(p, if_inst[--if_depth]);
 	 break;			
+      case OPCODE_BGNLOOP:
+         loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
+         break;
+      case OPCODE_BRK:
+	 brw_set_predicate_control(p, get_predicate(inst));
+         brw_BREAK(p);
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+         break;
+      case OPCODE_CONT:
+	 brw_set_predicate_control(p, get_predicate(inst));
+         brw_CONT(p);
+         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+         break;
+      case OPCODE_ENDLOOP: 
+         {
+            struct brw_instruction *inst0, *inst1;
+	    GLuint br = 1;
+
+            loop_depth--;
+
+	    if (BRW_IS_IGDNG(brw))
+	       br = 2;
+
+            inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
+            /* patch all the BREAK/CONT instructions from last BEGINLOOP */
+            while (inst0 > loop_inst[loop_depth]) {
+               inst0--;
+               if (inst0->header.opcode == BRW_OPCODE_BREAK) {
+                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+                  inst0->bits3.if_else.pop_count = 0;
+               }
+               else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+                  inst0->bits3.if_else.pop_count = 0;
+               }
+            }
+         }
+         break;
       case OPCODE_BRA:
-         brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
+	 brw_set_predicate_control(p, get_predicate(inst));
          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
-         brw_set_predicate_control_flag_value(p, 0xff);
+	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
          break;
       case OPCODE_CAL:
 	 brw_set_access_mode(p, BRW_ALIGN_1);
@@ -1209,7 +1572,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 brw_set_access_mode(p, BRW_ALIGN_16);
 	 brw_ADD(p, get_addr_reg(stack_index),
 			 get_addr_reg(stack_index), brw_imm_d(4));
-	 inst->Data = &p->store[p->nr_insn];
+         brw_save_call(p, inst->Comment, p->nr_insn);
 	 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
          break;
       case OPCODE_RET:
@@ -1218,14 +1581,23 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 brw_set_access_mode(p, BRW_ALIGN_1);
          brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
 	 brw_set_access_mode(p, BRW_ALIGN_16);
+	 break;
       case OPCODE_END:	
+         end_offset = p->nr_insn;
+         /* this instruction will get patched later to jump past subroutine
+          * code, etc.
+          */
          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
          break;
       case OPCODE_PRINT:
+         /* no-op */
+         break;
       case OPCODE_BGNSUB:
+         brw_save_label(p, inst->Comment, p->nr_insn);
+         break;
       case OPCODE_ENDSUB:
-         /* no-op instructions */
-	 break;
+         /* no-op */
+         break;
       default:
 	 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
                        inst->Opcode, inst->Opcode < MAX_OPCODE ?
@@ -1233,6 +1605,19 @@ void brw_vs_emit(struct brw_vs_compile *c )
 				    "unknown");
       }
 
+      /* Set the predication update on the last instruction of the native
+       * instruction sequence.
+       *
+       * This would be problematic if it was set on a math instruction,
+       * but that shouldn't be the case with the current GLSL compiler.
+       */
+      if (inst->CondUpdate) {
+	 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
+
+	 assert(hw_insn->header.destreg__conditionalmod == 0);
+	 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
+      }
+
       if ((inst->DstReg.File == PROGRAM_OUTPUT)
           && (inst->DstReg.Index != VERT_RESULT_HPOS)
           && c->output_regs[inst->DstReg.Index].used_in_src) {
@@ -1263,9 +1648,20 @@ void brw_vs_emit(struct brw_vs_compile *c )
       release_tmps(c);
    }
 
-   end_inst = &p->store[p->nr_insn];
+   end_inst = &p->store[end_offset];
+   last_inst = &p->store[p->nr_insn];
+
+   /* The END instruction will be patched to jump to this code */
    emit_vertex_write(c);
-   post_vs_emit(c, end_inst);
-   for (insn = 0; insn < nr_insns; insn++)
-       c->vp->program.Base.Instructions[insn].Data = NULL;
+
+   post_vs_emit(c, end_inst, last_inst);
+
+   if (INTEL_DEBUG & DEBUG_VS) {
+      int i;
+
+      _mesa_printf("vs-native:\n");
+      for (i = 0; i < p->nr_insn; i++)
+	 brw_disasm(stderr, &p->store[i]);
+      _mesa_printf("\n");
+   }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c
index 942581696d..d790ab6555 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
@@ -44,11 +44,15 @@ struct brw_vs_unit_key {
    unsigned int curbe_offset;
 
    unsigned int nr_urb_entries, urb_size;
+
+   unsigned int nr_surfaces;
 };
 
 static void
 vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key)
 {
+   GLcontext *ctx = &brw->intel.ctx;
+
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_VS_PROG */
@@ -60,8 +64,11 @@ vs_unit_populate_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    key->nr_urb_entries = brw->urb.nr_vs_entries;
    key->urb_size = brw->urb.vsize;
 
+   /* BRW_NEW_NR_VS_SURFACES */
+   key->nr_surfaces = brw->vs.nr_surfaces;
+
    /* BRW_NEW_CURBE_OFFSETS, _NEW_TRANSFORM */
-   if (brw->attribs.Transform->ClipPlanesEnabled) {
+   if (ctx->Transform.ClipPlanesEnabled) {
       /* Note that we read in the userclip planes as well, hence
        * clip_start:
        */
@@ -90,16 +97,28 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
     * brw_urb_WRITE() results.
     */
    vs.thread1.single_program_flow = 0;
+
+   if (BRW_IS_IGDNG(brw))
+      vs.thread1.binding_table_entry_count = 0; /* hardware requirement */
+   else
+      vs.thread1.binding_table_entry_count = key->nr_surfaces;
+
    vs.thread3.urb_entry_read_length = key->urb_entry_read_length;
    vs.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
    vs.thread3.dispatch_grf_start_reg = 1;
    vs.thread3.urb_entry_read_offset = 0;
    vs.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
 
-   vs.thread4.nr_urb_entries = key->nr_urb_entries;
+   if (BRW_IS_IGDNG(brw))
+       vs.thread4.nr_urb_entries = key->nr_urb_entries >> 2;
+   else
+       vs.thread4.nr_urb_entries = key->nr_urb_entries;
+
    vs.thread4.urb_entry_allocation_size = key->urb_size - 1;
 
-   if (BRW_IS_G4X(brw))
+   if (BRW_IS_IGDNG(brw))
+      chipset_max_threads = 72;
+   else if (BRW_IS_G4X(brw))
       chipset_max_threads = 32;
    else
       chipset_max_threads = 16;
@@ -111,6 +130,8 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
 
    /* No samplers for ARB_vp programs:
     */
+   /* It has to be set to 0 for IGDNG
+    */
    vs.vs5.sampler_count = 0;
 
    if (INTEL_DEBUG & DEBUG_STATS)
@@ -156,6 +177,7 @@ const struct brw_tracked_state brw_vs_unit = {
    .dirty = {
       .mesa  = _NEW_TRANSFORM,
       .brw   = (BRW_NEW_CURBE_OFFSETS |
+                BRW_NEW_NR_VS_SURFACES |
 		BRW_NEW_URB_FENCE),
       .cache = CACHE_NEW_VS_PROG
    },
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
new file mode 100644
index 0000000000..89f47522a1
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -0,0 +1,226 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "main/mtypes.h"
+#include "main/texformat.h"
+#include "main/texstore.h"
+#include "shader/prog_parameter.h"
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+
+/**
+ * Creates a new VS constant buffer holding the current VS program's
+ * parameter values (4 floats per parameter) if the program uses one.
+ * Returns NULL otherwise; constants then go through the CURBEs using the
+ * brw_constant_buffer state atom.
+ */
+static drm_intel_bo *
+brw_vs_update_constant_buffer(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   struct brw_vertex_program *vp =
+      (struct brw_vertex_program *) brw->vertex_program;
+   const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
+   const int size = params->NumParameters * 4 * sizeof(GLfloat);
+   drm_intel_bo *const_buffer;
+
+   /* BRW_NEW_VERTEX_PROGRAM: only programs flagged use_const_buffer get one */
+   if (!vp->use_const_buffer)
+      return NULL;
+
+   const_buffer = drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer",
+				     size, 64); /* NOTE(review): result is not checked for NULL */
+
+   /* _NEW_PROGRAM_CONSTANTS: copy all parameter values into the new buffer */
+   dri_bo_subdata(const_buffer, 0, size, params->ParameterValues);
+
+   return const_buffer;
+}
+
+/**
+ * Update the surface state for a VS constant buffer.
+ *
+ * Sets brw->vs.surf_bo[surf] and the current vertex program's const_buffer.
+ */
+static void
+brw_update_vs_constant_surface( GLcontext *ctx,
+                                GLuint surf)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_surface_key key;
+   struct brw_vertex_program *vp =
+      (struct brw_vertex_program *) brw->vertex_program;
+   const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
+
+   assert(surf == 0); /* only surface slot 0 is handled here */
+
+   /* If we're in this state update atom, we need to update VS constants, so
+    * free the old buffer and create a new one for the new contents.
+    */
+   dri_bo_unreference(vp->const_buffer);
+   vp->const_buffer = brw_vs_update_constant_buffer(brw);
+
+   /* If there's no constant buffer, then no surface BO is needed to point at
+    * it.
+    */
+   if (vp->const_buffer == 0) {
+      drm_intel_bo_unreference(brw->vs.surf_bo[surf]);
+      brw->vs.surf_bo[surf] = NULL;
+      return;
+   }
+
+   memset(&key, 0, sizeof(key)); /* whole key (sizeof(key)) is passed to the cache lookup below */
+
+   key.format = MESA_FORMAT_RGBA_FLOAT32;
+   key.internal_format = GL_RGBA;
+   key.bo = vp->const_buffer;
+   key.depthmode = GL_NONE;
+   key.pitch = params->NumParameters;
+   key.width = params->NumParameters;
+   key.height = 1;
+   key.depth = 1;
+   key.cpp = 16; /* presumably bytes per element (4 floats) -- TODO confirm */
+
+   /*
+   printf("%s:\n", __FUNCTION__);
+   printf("  width %d  height %d  depth %d  cpp %d  pitch %d\n",
+          key.width, key.height, key.depth, key.cpp, key.pitch);
+   */
+
+   drm_intel_bo_unreference(brw->vs.surf_bo[surf]);
+   brw->vs.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
+                                            BRW_SS_SURFACE,
+                                            &key, sizeof(key),
+                                            &key.bo, key.bo ? 1 : 0, /* key.bo is non-NULL here (checked above) */
+                                            NULL);
+   if (brw->vs.surf_bo[surf] == NULL) { /* cache miss: build the surface state */
+      brw->vs.surf_bo[surf] = brw_create_constant_surface(brw, &key);
+   }
+}
+
+
+/**
+ * Constructs the binding table for the VS surface state.
+ *
+ * The table is looked up in the surface cache, keyed on brw->vs.surf_bo,
+ * and uploaded on a miss.  Returns NULL if the temporary table cannot be
+ * allocated.
+ */
+static dri_bo *
+brw_vs_get_binding_table(struct brw_context *brw)
+{
+   dri_bo *bind_bo;
+
+   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
+                              NULL, 0, brw->vs.surf_bo, BRW_VS_MAX_SURF, NULL);
+
+   if (bind_bo == NULL) {
+      GLuint data_size = BRW_VS_MAX_SURF * sizeof(GLuint);
+      uint32_t *data = malloc(data_size);
+      int i;
+
+      /* Out of memory: bail out rather than write through a NULL pointer. */
+      if (data == NULL)
+         return NULL;
+
+      /* Presumed offsets of the surface BOs; unused slots stay 0. */
+      for (i = 0; i < BRW_VS_MAX_SURF; i++)
+         data[i] = brw->vs.surf_bo[i] ? brw->vs.surf_bo[i]->offset : 0;
+
+      bind_bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
+                                 NULL, 0, brw->vs.surf_bo, BRW_VS_MAX_SURF,
+                                 data, data_size, NULL, NULL);
+
+      /* Emit binding table relocations to surface state.  The presumed
+       * offsets were already written into data[] above.
+       */
+      for (i = 0; i < BRW_VS_MAX_SURF; i++) {
+         if (brw->vs.surf_bo[i] != NULL) {
+            drm_intel_bo_emit_reloc(bind_bo, i * 4, brw->vs.surf_bo[i], 0,
+                                    I915_GEM_DOMAIN_INSTRUCTION, 0);
+         }
+      }
+
+      free(data);
+   }
+
+   return bind_bo;
+}
+
+/**
+ * Vertex shader surfaces (constant buffer).
+ *
+ * This consumes the state updates for the constant buffer needing
+ * to be updated, and produces BRW_NEW_NR_VS_SURFACES for the VS unit and
+ * CACHE_NEW_SURF_BIND for the binding table upload.
+ */
+static void prepare_vs_surfaces(struct brw_context *brw )
+{
+   GLcontext *ctx = &brw->intel.ctx;
+   int i;
+   int nr_surfaces = 0;
+
+   brw_update_vs_constant_surface(ctx, SURF_INDEX_VERT_CONST_BUFFER);
+
+   for (i = 0; i < BRW_VS_MAX_SURF; i++) {
+      if (brw->vs.surf_bo[i] != NULL) {
+	 nr_surfaces = i + 1; /* ends up one past the highest slot in use */
+      }
+   }
+
+   if (brw->vs.nr_surfaces != nr_surfaces) { /* flag only on an actual change */
+      brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
+      brw->vs.nr_surfaces = nr_surfaces;
+   }
+
+   /* Note that we don't end up updating the bind_bo if we don't have a
+    * surface to be pointing at.  This should be relatively harmless, as it
+    * just slightly increases our working set size.
+    */
+   if (brw->vs.nr_surfaces != 0) {
+      dri_bo_unreference(brw->vs.bind_bo); /* drop the previous binding table */
+      brw->vs.bind_bo = brw_vs_get_binding_table(brw);
+   }
+}
+
+const struct brw_tracked_state brw_vs_surfaces = {
+   .dirty = {
+      .mesa = (_NEW_PROGRAM_CONSTANTS), /* constant values uploaded by the prepare hook */
+      .brw = (BRW_NEW_VERTEX_PROGRAM), /* program decides whether a const buffer is used */
+      .cache = 0
+   },
+   .prepare = prepare_vs_surfaces, /* rebuilds the VS constant surface and binding table */
+};
+
+
+
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index f7293ef423..ac11790151 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -23,14 +23,12 @@
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  
- **********************************************************************/
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-            
-
+**********************************************************************/
 
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
 
 #include "main/glheader.h"
 #include "main/mtypes.h"
@@ -44,12 +42,11 @@
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_state.h"
-
 #include "brw_draw.h"
 #include "brw_state.h"
 #include "brw_fallback.h"
 #include "brw_vs.h"
-#include <stdarg.h>
+
 
 static void
 dri_bo_release(dri_bo **bo)
@@ -58,27 +55,31 @@ dri_bo_release(dri_bo **bo)
    *bo = NULL;
 }
 
-/* called from intelDestroyContext()
+
+/**
+ * called from intelDestroyContext()
  */
 static void brw_destroy_context( struct intel_context *intel )
 {
    struct brw_context *brw = brw_context(&intel->ctx);
    int i;
 
-   brw_destroy_metaops(brw);
    brw_destroy_state(brw);
    brw_draw_destroy( brw );
 
+   _mesa_free(brw->wm.compile_data);
+
    brw_FrameBufferTexDestroy( brw );
 
-   for (i = 0; i < brw->state.nr_draw_regions; i++)
-       intel_region_release(&brw->state.draw_regions[i]);
-   brw->state.nr_draw_regions = 0;
+   for (i = 0; i < brw->state.nr_color_regions; i++)
+      intel_region_release(&brw->state.color_regions[i]);
+   brw->state.nr_color_regions = 0;
    intel_region_release(&brw->state.depth_region);
 
    dri_bo_release(&brw->curbe.curbe_bo);
    dri_bo_release(&brw->vs.prog_bo);
    dri_bo_release(&brw->vs.state_bo);
+   dri_bo_release(&brw->vs.bind_bo);
    dri_bo_release(&brw->gs.prog_bo);
    dri_bo_release(&brw->gs.state_bo);
    dri_bo_release(&brw->clip.prog_bo);
@@ -92,6 +93,7 @@ static void brw_destroy_context( struct intel_context *intel )
    dri_bo_release(&brw->wm.bind_bo);
    for (i = 0; i < BRW_WM_MAX_SURF; i++)
       dri_bo_release(&brw->wm.surf_bo[i]);
+   dri_bo_release(&brw->wm.sampler_bo);
    dri_bo_release(&brw->wm.prog_bo);
    dri_bo_release(&brw->wm.state_bo);
    dri_bo_release(&brw->cc.prog_bo);
@@ -99,37 +101,46 @@ static void brw_destroy_context( struct intel_context *intel )
    dri_bo_release(&brw->cc.vp_bo);
 }
 
-/* called from intelDrawBuffer()
+
+/**
+ * called from intelDrawBuffer()
  */
 static void brw_set_draw_region( struct intel_context *intel, 
-				  struct intel_region *draw_regions[],
-				  struct intel_region *depth_region,
-				GLuint num_regions)
+                                 struct intel_region *color_regions[],
+                                 struct intel_region *depth_region,
+                                 GLuint num_color_regions)
 {
    struct brw_context *brw = brw_context(&intel->ctx);
-   int i;
+   GLuint i;
+
+   /* release old color/depth regions */
    if (brw->state.depth_region != depth_region)
       brw->state.dirty.brw |= BRW_NEW_DEPTH_BUFFER;
-   for (i = 0; i < brw->state.nr_draw_regions; i++)
-       intel_region_release(&brw->state.draw_regions[i]);
+   for (i = 0; i < brw->state.nr_color_regions; i++)
+       intel_region_release(&brw->state.color_regions[i]);
    intel_region_release(&brw->state.depth_region);
-   for (i = 0; i < num_regions; i++)
-       intel_region_reference(&brw->state.draw_regions[i], draw_regions[i]);
+
+   /* reference new color/depth regions */
+   for (i = 0; i < num_color_regions; i++)
+       intel_region_reference(&brw->state.color_regions[i], color_regions[i]);
    intel_region_reference(&brw->state.depth_region, depth_region);
-   brw->state.nr_draw_regions = num_regions;
+   brw->state.nr_color_regions = num_color_regions;
 }
 
-/* called from intel_batchbuffer_flush and children before sending a
+
+/**
+ * called from intel_batchbuffer_flush and children before sending a
  * batchbuffer off.
  */
 static void brw_finish_batch(struct intel_context *intel)
 {
    struct brw_context *brw = brw_context(&intel->ctx);
-
    brw_emit_query_end(brw);
 }
 
-/* called from intelFlushBatchLocked
+
+/**
+ * called from intelFlushBatchLocked
  */
 static void brw_new_batch( struct intel_context *intel )
 {
@@ -160,39 +171,12 @@ static void brw_new_batch( struct intel_context *intel )
    }
 }
 
-static void brw_note_fence( struct intel_context *intel, 
-			    GLuint fence )
-{
-   brw_context(&intel->ctx)->state.dirty.brw |= BRW_NEW_FENCE;
-}
- 
-static void brw_note_unlock( struct intel_context *intel )
-{
-   struct brw_context *brw = brw_context(&intel->ctx);
-
-   brw_state_cache_check_size(brw);
-}
-
-
-void brw_do_flush( struct brw_context *brw, 
-		   GLuint flags )
-{
-   struct brw_mi_flush flush;
-   memset(&flush, 0, sizeof(flush));      
-   flush.opcode = CMD_MI_FLUSH;
-   flush.flags = flags;
-   BRW_BATCH_STRUCT(brw, &flush);
-}
-
 
-static void brw_emit_flush( struct intel_context *intel,
-			GLuint unused )
+static void brw_note_fence( struct intel_context *intel, GLuint fence )
 {
-   brw_do_flush(brw_context(&intel->ctx),
-		BRW_FLUSH_STATE_CACHE|BRW_FLUSH_READ_CACHE);
+   brw_context(&intel->ctx)->state.dirty.brw |= BRW_NEW_FENCE;
 }
 
-
 /* called from intelWaitForIdle() and intelFlush()
  *
  * For now, just flush everything.  Could be smarter later.
@@ -202,10 +186,11 @@ static GLuint brw_flush_cmd( void )
    struct brw_mi_flush flush;
    flush.opcode = CMD_MI_FLUSH;
    flush.pad = 0;
-   flush.flags = BRW_FLUSH_READ_CACHE | BRW_FLUSH_STATE_CACHE;
+   flush.flags = BRW_FLUSH_STATE_CACHE;
    return *(GLuint *)&flush;
 }
 
+
 static void brw_invalidate_state( struct intel_context *intel, GLuint new_state )
 {
    /* nothing */
@@ -215,20 +200,17 @@ static void brw_invalidate_state( struct intel_context *intel, GLuint new_state
 void brwInitVtbl( struct brw_context *brw )
 {
    brw->intel.vtbl.check_vertex_size = 0;
-   brw->intel.vtbl.emit_state = 0; 
-   brw->intel.vtbl.reduced_primitive_state = 0;	
+   brw->intel.vtbl.emit_state = 0;
+   brw->intel.vtbl.reduced_primitive_state = 0;
    brw->intel.vtbl.render_start = 0;
-   brw->intel.vtbl.update_texture_state = 0; 
+   brw->intel.vtbl.update_texture_state = 0;
 
-   brw->intel.vtbl.invalidate_state = brw_invalidate_state; 
-   brw->intel.vtbl.note_fence = brw_note_fence; 
-   brw->intel.vtbl.note_unlock = brw_note_unlock; 
+   brw->intel.vtbl.invalidate_state = brw_invalidate_state;
+   brw->intel.vtbl.note_fence = brw_note_fence;
    brw->intel.vtbl.new_batch = brw_new_batch;
    brw->intel.vtbl.finish_batch = brw_finish_batch;
    brw->intel.vtbl.destroy = brw_destroy_context;
    brw->intel.vtbl.set_draw_region = brw_set_draw_region;
    brw->intel.vtbl.flush_cmd = brw_flush_cmd;
-   brw->intel.vtbl.emit_flush = brw_emit_flush;
    brw->intel.vtbl.debug_batch = brw_debug_batch;
 }
-
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index c50b0d2dd9..2292de94c4 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -40,12 +40,14 @@
 GLuint brw_wm_nr_args( GLuint opcode )
 {
    switch (opcode) {
+   case WM_FRONTFACING:
    case WM_PIXELXY:
+      return 0;
    case WM_CINTERP:
    case WM_WPOSXY:
+   case WM_DELTAXY:
       return 1;
    case WM_LINTERP:
-   case WM_DELTAXY:
    case WM_PIXELW:
       return 2;
    case WM_FB_WRITE:
@@ -80,6 +82,58 @@ GLuint brw_wm_is_scalar_result( GLuint opcode )
 }
 
 
+/**
+ * Code generation path for non-GLSL fragment programs.
+ *
+ * Such programs contain no flow control instructions, which makes the
+ * SSA-style optimization passes below practical.
+ *
+ * \param brw  driver context (not referenced by this function)
+ * \param c    compile state; grf_limit and the prog_data totals are set here
+ */
+static void
+brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
+{
+   /* Pass "fp": extend the fragment program with the pre- and post-program
+    * work such as interpolation and fogging.
+    */
+   brw_wm_pass_fp(c);
+
+   /* Pass 0: convert to the intermediate representation and build the
+    * register usage chains.
+    */
+   brw_wm_pass0(c);
+
+   /* Pass 1: remove dead code.
+    */
+   brw_wm_pass1(c);
+
+   /* Pass 2: register allocation.  We operate on 16 pixels at a time, so
+    * each logical shader register needs two GRF entries and the limit is
+    * half of BRW_WM_MAX_GRF.
+    */
+   c->grf_limit = BRW_WM_MAX_GRF / 2;
+   brw_wm_pass2(c);
+
+   /* Record how many general-purpose registers are used. */
+   c->prog_data.total_grf = c->max_wm_grf;
+
+   /* Scratch space is used for register spilling: when any was used the
+    * total is last_scratch plus 0x40, otherwise none is requested.
+    */
+   c->prog_data.total_scratch =
+      c->last_scratch ? c->last_scratch + 0x40 : 0;
+
+   /* Finally, emit the GEN4 code. */
+   brw_wm_emit(c);
+}
+
+
+/**
+ * All Mesa program -> GPU code generation goes through this function.
+ * Depending on the instructions used (i.e. flow control instructions)
+ * we'll use one of two code generators.
+ */
 static void do_wm_prog( struct brw_context *brw,
 			struct brw_fragment_program *fp, 
 			struct brw_wm_prog_key *key)
@@ -90,52 +144,41 @@ static void do_wm_prog( struct brw_context *brw,
 
    c = brw->wm.compile_data;
    if (c == NULL) {
-     brw->wm.compile_data = calloc(1, sizeof(*brw->wm.compile_data));
-     c = brw->wm.compile_data;
+      brw->wm.compile_data = calloc(1, sizeof(*brw->wm.compile_data));
+      c = brw->wm.compile_data;
+      if (c == NULL) {
+         /* Ouch - big out of memory problem.  Can't continue
+          * without triggering a segfault, no way to signal,
+          * so just return.
+          */
+         return;
+      }
    } else {
-     memset(c, 0, sizeof(*brw->wm.compile_data));
+      memset(c, 0, sizeof(*brw->wm.compile_data));
    }
    memcpy(&c->key, key, sizeof(*key));
 
    c->fp = fp;
    c->env_param = brw->intel.ctx.FragmentProgram.Parameters;
 
-    brw_init_compile(brw, &c->func);
-   if (brw_wm_is_glsl(&c->fp->program)) {
-       brw_wm_glsl_emit(brw, c);
-   } else {
-       /* Augment fragment program.  Add instructions for pre- and
-	* post-fragment-program tasks such as interpolation and fogging.
-	*/
-       brw_wm_pass_fp(c);
-
-       /* Translate to intermediate representation.  Build register usage
-	* chains.
-	*/
-       brw_wm_pass0(c);
-
-       /* Dead code removal.
-	*/
-       brw_wm_pass1(c);
-
-       /* Register allocation.
-	*/
-       c->grf_limit = BRW_WM_MAX_GRF/2;
-
-       brw_wm_pass2(c);
-
-       c->prog_data.total_grf = c->max_wm_grf;
-       if (c->last_scratch) {
-	   c->prog_data.total_scratch =
-	       c->last_scratch + 0x40;
-       } else {
-	   c->prog_data.total_scratch = 0;
-       }
-
-       /* Emit GEN4 code.
-	*/
-       brw_wm_emit(c);
+   brw_init_compile(brw, &c->func);
+
+   /* temporary sanity check assertion */
+   ASSERT(fp->isGLSL == brw_wm_is_glsl(&c->fp->program));
+
+   /*
+    * Shaders which use GLSL features such as flow control are handled
+    * differently from "simple" shaders.
+    */
+   if (fp->isGLSL) {
+      c->dispatch_width = 8;
+      brw_wm_glsl_emit(brw, c);
    }
+   else {
+      c->dispatch_width = 16;
+      brw_wm_non_glsl_emit(brw, c);
+   }
+
    if (INTEL_DEBUG & DEBUG_WM)
       fprintf(stderr, "\n");
 
@@ -157,9 +200,11 @@ static void do_wm_prog( struct brw_context *brw,
 static void brw_wm_populate_key( struct brw_context *brw,
 				 struct brw_wm_prog_key *key )
 {
+   GLcontext *ctx = &brw->intel.ctx;
    /* BRW_NEW_FRAGMENT_PROGRAM */
-   struct brw_fragment_program *fp = 
+   const struct brw_fragment_program *fp = 
       (struct brw_fragment_program *)brw->fragment_program;
+   GLboolean uses_depth = (fp->program.Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
    GLuint lookup = 0;
    GLuint line_aa;
    GLuint i;
@@ -170,51 +215,50 @@ static void brw_wm_populate_key( struct brw_context *brw,
     */
    /* _NEW_COLOR */
    if (fp->program.UsesKill ||
-       brw->attribs.Color->AlphaEnabled)
+       ctx->Color.AlphaEnabled)
       lookup |= IZ_PS_KILL_ALPHATEST_BIT;
 
-   if (fp->program.Base.OutputsWritten & (1<<FRAG_RESULT_DEPR))
+   if (fp->program.Base.OutputsWritten & (1<<FRAG_RESULT_DEPTH))
       lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
 
    /* _NEW_DEPTH */
-   if (brw->attribs.Depth->Test)
+   if (ctx->Depth.Test)
       lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
 
-   if (brw->attribs.Depth->Test &&  
-       brw->attribs.Depth->Mask) /* ?? */
+   if (ctx->Depth.Test &&  
+       ctx->Depth.Mask) /* ?? */
       lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
 
    /* _NEW_STENCIL */
-   if (brw->attribs.Stencil->Enabled) {
+   if (ctx->Stencil._Enabled) {
       lookup |= IZ_STENCIL_TEST_ENABLE_BIT;
 
-      if (brw->attribs.Stencil->WriteMask[0] ||
-	  (brw->attribs.Stencil->_TestTwoSide &&
-	   brw->attribs.Stencil->WriteMask[1]))
+      if (ctx->Stencil.WriteMask[0] ||
+	  ctx->Stencil.WriteMask[ctx->Stencil._BackFace])
 	 lookup |= IZ_STENCIL_WRITE_ENABLE_BIT;
    }
 
    line_aa = AA_NEVER;
 
    /* _NEW_LINE, _NEW_POLYGON, BRW_NEW_REDUCED_PRIMITIVE */
-   if (brw->attribs.Line->SmoothFlag) {
+   if (ctx->Line.SmoothFlag) {
       if (brw->intel.reduced_primitive == GL_LINES) {
 	 line_aa = AA_ALWAYS;
       }
       else if (brw->intel.reduced_primitive == GL_TRIANGLES) {
-	 if (brw->attribs.Polygon->FrontMode == GL_LINE) {
+	 if (ctx->Polygon.FrontMode == GL_LINE) {
 	    line_aa = AA_SOMETIMES;
 
-	    if (brw->attribs.Polygon->BackMode == GL_LINE ||
-		(brw->attribs.Polygon->CullFlag &&
-		 brw->attribs.Polygon->CullFaceMode == GL_BACK))
+	    if (ctx->Polygon.BackMode == GL_LINE ||
+		(ctx->Polygon.CullFlag &&
+		 ctx->Polygon.CullFaceMode == GL_BACK))
 	       line_aa = AA_ALWAYS;
 	 }
-	 else if (brw->attribs.Polygon->BackMode == GL_LINE) {
+	 else if (ctx->Polygon.BackMode == GL_LINE) {
 	    line_aa = AA_SOMETIMES;
 
-	    if ((brw->attribs.Polygon->CullFlag &&
-		 brw->attribs.Polygon->CullFaceMode == GL_FRONT))
+	    if ((ctx->Polygon.CullFlag &&
+		 ctx->Polygon.CullFaceMode == GL_FRONT))
 	       line_aa = AA_ALWAYS;
 	 }
       }
@@ -222,27 +266,36 @@ static void brw_wm_populate_key( struct brw_context *brw,
 	 
    brw_wm_lookup_iz(line_aa,
 		    lookup,
+		    uses_depth,
 		    key);
 
 
    /* BRW_NEW_WM_INPUT_DIMENSIONS */
-   key->projtex_mask = brw->wm.input_size_masks[4-1] >> (FRAG_ATTRIB_TEX0 - FRAG_ATTRIB_WPOS); 
+   key->proj_attrib_mask = brw->wm.input_size_masks[4-1];
 
    /* _NEW_LIGHT */
-   key->flat_shade = (brw->attribs.Light->ShadeModel == GL_FLAT);
+   key->flat_shade = (ctx->Light.ShadeModel == GL_FLAT);
+
+   /* _NEW_HINT */
+   key->linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST);
 
    /* _NEW_TEXTURE */
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
-      const struct gl_texture_unit *unit = &brw->attribs.Texture->Unit[i];
-      const struct gl_texture_object *t = unit->_Current;
+      const struct gl_texture_unit *unit = &ctx->Texture.Unit[i];
 
       if (unit->_ReallyEnabled) {
-	 if (t->Image[0][t->BaseLevel]->InternalFormat == GL_YCBCR_MESA) {
-	    key->yuvtex_mask |= 1<<i;
-	    if (t->Image[0][t->BaseLevel]->TexFormat->MesaFormat == 
-		    MESA_FORMAT_YCBCR)
-		key->yuvtex_swap_mask |= 1<< i;
+         const struct gl_texture_object *t = unit->_Current;
+         const struct gl_texture_image *img = t->Image[0][t->BaseLevel];
+	 if (img->InternalFormat == GL_YCBCR_MESA) {
+	    key->yuvtex_mask |= 1 << i;
+	    if (img->TexFormat->MesaFormat == MESA_FORMAT_YCBCR)
+		key->yuvtex_swap_mask |= 1 << i;
 	 }
+
+         key->tex_swizzles[i] = t->_Swizzle;
+      }
+      else {
+         key->tex_swizzles[i] = SWIZZLE_NOOP;
       }
    }
 
@@ -273,10 +326,11 @@ static void brw_wm_populate_key( struct brw_context *brw,
       key->drawable_height = brw->intel.driDrawable->h;
    }
 
-   /* Extra info:
-    */
-   key->program_string_id = fp->id;
+   /* CACHE_NEW_VS_PROG */
+   key->vp_outputs_written = brw->vs.prog_data->outputs_written & DO_SETUP_BITS;
 
+   /* The unique fragment program ID */
+   key->program_string_id = fp->id;
 }
 
 
@@ -300,12 +354,11 @@ static void brw_prepare_wm_prog(struct brw_context *brw)
 }
 
 
-/* See brw_wm.c:
- */
 const struct brw_tracked_state brw_wm_prog = {
    .dirty = {
       .mesa  = (_NEW_COLOR |
 		_NEW_DEPTH |
+                _NEW_HINT |
 		_NEW_STENCIL |
 		_NEW_POLYGON |
 		_NEW_LINE |
@@ -315,7 +368,7 @@ const struct brw_tracked_state brw_wm_prog = {
       .brw   = (BRW_NEW_FRAGMENT_PROGRAM |
 		BRW_NEW_WM_INPUT_DIMENSIONS |
 		BRW_NEW_REDUCED_PRIMITIVE),
-      .cache = 0
+      .cache = CACHE_NEW_VS_PROG,
    },
    .prepare = brw_prepare_wm_prog
 };
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index ded079695e..872b1f3ecf 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -38,6 +38,8 @@
 #include "brw_context.h"
 #include "brw_eu.h"
 
+#define SATURATE (1<<5)
+
 /* A big lookup table is used to figure out which and how many
  * additional regs will inserted before the main payload in the WM
  * program execution.  These mainly relate to depth and stencil
@@ -60,20 +62,23 @@ struct brw_wm_prog_key {
    GLuint aa_dest_stencil_reg:3;
    GLuint dest_depth_reg:3;
    GLuint nr_depth_regs:3;
-   GLuint projtex_mask:8;
-   GLuint shadowtex_mask:8;
    GLuint computes_depth:1;	/* could be derived from program string */
    GLuint source_depth_to_render_target:1;
    GLuint flat_shade:1;
+   GLuint linear_color:1;  /**< linear interpolation vs perspective interp */
    GLuint runtime_check_aads_emit:1;
    
-   GLuint yuvtex_mask:8;
-   GLuint yuvtex_swap_mask:8;	/* UV swaped */
-   GLuint pad1:16;
+   GLbitfield proj_attrib_mask; /**< one bit per fragment program attribute */
+   GLuint shadowtex_mask:16;
+   GLuint yuvtex_mask:16;
+   GLuint yuvtex_swap_mask:16;	/* UV swapped */
+
+   GLuint tex_swizzles[BRW_MAX_TEX_UNIT];
 
    GLuint program_string_id:32;
    GLuint origin_x, origin_y;
    GLuint drawable_height;
+   GLuint vp_outputs_written;
 };
 
 
@@ -142,13 +147,12 @@ struct brw_wm_instruction {
    GLuint writemask:4;
    GLuint tex_unit:4;   /* texture unit for TEX, TXD, TXP instructions */
    GLuint tex_idx:3;    /* TEXTURE_1D,2D,3D,CUBE,RECT_INDEX source target */
+   GLuint tex_shadow:1; /* do shadow comparison? */
    GLuint eot:1;    	/* End of thread indicator for FB_WRITE*/
    GLuint target:10;    /* target binding table index for FB_WRITE*/
 };
 
 
-#define PROGRAM_INTERNAL_PARAM 
-
 #define BRW_WM_MAX_INSN  (MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS*3 + FRAG_ATTRIB_MAX + 3)
 #define BRW_WM_MAX_GRF   128		/* hardware limit */
 #define BRW_WM_MAX_VREG  (BRW_WM_MAX_INSN * 4)
@@ -172,7 +176,8 @@ struct brw_wm_instruction {
 #define WM_CINTERP        (MAX_OPCODE + 5)
 #define WM_WPOSXY         (MAX_OPCODE + 6)
 #define WM_FB_WRITE       (MAX_OPCODE + 7)
-#define MAX_WM_OPCODE     (MAX_OPCODE + 8)
+#define WM_FRONTFACING    (MAX_OPCODE + 8)
+#define MAX_WM_OPCODE     (MAX_OPCODE + 9)
 
 #define PROGRAM_PAYLOAD   (PROGRAM_FILE_MAX)
 #define PAYLOAD_DEPTH     (FRAG_ATTRIB_MAX)
@@ -200,7 +205,6 @@ struct brw_wm_compile {
    GLuint fp_temp;
    GLuint fp_interp_emitted;
    GLuint fp_fragcolor_emitted;
-   GLuint fp_deriv_emitted;
 
    struct prog_src_register pixel_xy;
    struct prog_src_register delta_xy;
@@ -239,17 +243,31 @@ struct brw_wm_compile {
    GLuint max_wm_grf;
    GLuint last_scratch;
 
+   GLuint cur_inst;  /**< index of current instruction */
+
+   GLboolean out_of_regs;  /**< ran out of GRF registers? */
+
+   /** Mapping from Mesa registers to hardware registers */
    struct {
-	GLboolean inited;
-	struct brw_reg reg;
+      GLboolean inited;
+      struct brw_reg reg;
    } wm_regs[PROGRAM_PAYLOAD+1][256][4];
+
+   GLboolean used_grf[BRW_WM_MAX_GRF];
+   GLuint first_free_grf;
    struct brw_reg stack;
    struct brw_reg emit_mask_reg;
-   GLuint reg_index;
    GLuint tmp_regs[BRW_WM_MAX_GRF];
    GLuint tmp_index;
    GLuint tmp_max;
    GLuint subroutines[BRW_WM_MAX_SUBROUTINE];
+   GLuint dispatch_width;
+
+   /** we may need up to 3 constants per instruction (if use_const_buffer) */
+   struct {
+      GLint index;
+      struct brw_reg reg;
+   } current_const[3];
 };
 
 
@@ -276,8 +294,16 @@ void brw_wm_print_program( struct brw_wm_compile *c,
 
 void brw_wm_lookup_iz( GLuint line_aa,
 		       GLuint lookup,
+		       GLboolean ps_uses_depth,
 		       struct brw_wm_prog_key *key );
 
 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c);
+
+void emit_ddxy(struct brw_compile *p,
+	       const struct brw_reg *dst,
+	       GLuint mask,
+	       GLboolean is_ddx,
+	       const struct brw_reg *arg0);
+
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_wm_debug.c b/src/mesa/drivers/dri/i965/brw_wm_debug.c
index 8f07f89ebc..220821087c 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_debug.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_debug.c
@@ -130,6 +130,9 @@ void brw_wm_print_insn( struct brw_wm_compile *c,
    case WM_FB_WRITE:
       _mesa_printf(" = FB_WRITE");
       break;
+   case WM_FRONTFACING:
+      _mesa_printf(" = FRONTFACING");
+      break;
    default:
       _mesa_printf(" = %s", _mesa_opcode_string(inst->opcode));
       break;
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index b5050a3e40..bf80a2942a 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -34,8 +34,6 @@
 #include "brw_context.h"
 #include "brw_wm.h"
 
-#define SATURATE (1<<5)
-
 /* Not quite sure how correct this is - need to understand horiz
  * vs. vertical strides a little better.
  */
@@ -65,8 +63,7 @@ static INLINE struct brw_reg sechalf( struct brw_reg reg )
 
 static void emit_pixel_xy(struct brw_compile *p,
 			  const struct brw_reg *dst,
-			  GLuint mask,
-			  const struct brw_reg *arg0)
+			  GLuint mask)
 {
    struct brw_reg r1 = brw_vec1_grf(1, 0);
    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
@@ -98,8 +95,7 @@ static void emit_pixel_xy(struct brw_compile *p,
 static void emit_delta_xy(struct brw_compile *p,
 			  const struct brw_reg *dst,
 			  GLuint mask,
-			  const struct brw_reg *arg0,
-			  const struct brw_reg *arg1)
+			  const struct brw_reg *arg0)
 {
    struct brw_reg r1 = brw_vec1_grf(1, 0);
 
@@ -254,6 +250,107 @@ static void emit_cinterp( struct brw_compile *p,
    }
 }
 
+/* Sets the destination channels to 1.0 or 0.0 according to glFrontFacing. */
+static void emit_frontfacing( struct brw_compile *p,
+			      const struct brw_reg *dst,
+			      GLuint mask )
+{
+   struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
+   GLuint i;
+
+   if (!(mask & WRITEMASK_XYZW))
+      return;
+
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 brw_MOV(p, dst[i], brw_imm_f(0.0));
+      }
+   }
+
+   /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
+    * us front face
+    */
+   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 brw_MOV(p, dst[i], brw_imm_f(1.0));
+      }
+   }
+   brw_set_predicate_control_flag_value(p, 0xff);
+}
+
+/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
+ * looking like:
+ *
+ * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
+ *
+ * and we're trying to produce:
+ *
+ *           DDX                     DDY
+ * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
+ *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
+ *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
+ *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
+ *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
+ *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
+ *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
+ *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
+ *
+ * and add another set of two more subspans if in 16-pixel dispatch mode.
+ *
+ * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
+ * for each pair, and vertstride = 2 jumps us 2 elements after processing a
+ * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
+ * between each other.  We could probably do it like ddx and swizzle the right
+ * order later, but bail for now and just produce
+ * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
+ */
+void emit_ddxy(struct brw_compile *p,
+	       const struct brw_reg *dst,
+	       GLuint mask,
+	       GLboolean is_ddx,
+	       const struct brw_reg *arg0)
+{
+   int i;
+   struct brw_reg src0, src1;
+
+   if (mask & SATURATE)
+      brw_set_saturate(p, 1);
+   for (i = 0; i < 4; i++ ) {
+      if (mask & (1<<i)) {
+	 if (is_ddx) {
+	    src0 = brw_reg(arg0[i].file, arg0[i].nr, 1,
+			   BRW_REGISTER_TYPE_F,
+			   BRW_VERTICAL_STRIDE_2,
+			   BRW_WIDTH_2,
+			   BRW_HORIZONTAL_STRIDE_0,
+			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+	    src1 = brw_reg(arg0[i].file, arg0[i].nr, 0,
+			   BRW_REGISTER_TYPE_F,
+			   BRW_VERTICAL_STRIDE_2,
+			   BRW_WIDTH_2,
+			   BRW_HORIZONTAL_STRIDE_0,
+			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+	 } else {
+	    src0 = brw_reg(arg0[i].file, arg0[i].nr, 0,
+			   BRW_REGISTER_TYPE_F,
+			   BRW_VERTICAL_STRIDE_4,
+			   BRW_WIDTH_4,
+			   BRW_HORIZONTAL_STRIDE_0,
+			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+	    src1 = brw_reg(arg0[i].file, arg0[i].nr, 2,
+			   BRW_REGISTER_TYPE_F,
+			   BRW_VERTICAL_STRIDE_4,
+			   BRW_WIDTH_4,
+			   BRW_HORIZONTAL_STRIDE_0,
+			   BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+	 }
+	 brw_ADD(p, dst[i], src0, negate(src1));
+      }
+   }
+   if (mask & SATURATE)
+      brw_set_saturate(p, 0);
+}
 
 static void emit_alu1( struct brw_compile *p, 
 		       struct brw_instruction *(*func)(struct brw_compile *, 
@@ -325,6 +422,19 @@ static void emit_mad( struct brw_compile *p,
    }
 }
 
+static void emit_trunc( struct brw_compile *p,
+		      const struct brw_reg *dst,
+		      GLuint mask,
+		      const struct brw_reg *arg0)
+{
+   GLuint i;
+
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 brw_RNDZ(p, dst[i], arg0[i]);
+      }
+   }
+}
 
 static void emit_lrp( struct brw_compile *p, 
 		      const struct brw_reg *dst,
@@ -504,16 +614,18 @@ static void emit_dp3( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
 
    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-   brw_MAC(p, dst[0], arg0[2], arg1[2]);
+   brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
    brw_set_saturate(p, 0);
 }
 
@@ -524,17 +636,19 @@ static void emit_dp4( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
    brw_MAC(p, brw_null_reg(), arg0[2], arg1[2]);
 
    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-   brw_MAC(p, dst[0], arg0[3], arg1[3]);
+   brw_MAC(p, dst[dst_chan], arg0[3], arg1[3]);
    brw_set_saturate(p, 0);
 }
 
@@ -545,17 +659,19 @@ static void emit_dph( struct brw_compile *p,
 		      const struct brw_reg *arg0,
 		      const struct brw_reg *arg1 )
 {
+   const int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
 
    brw_MUL(p, brw_null_reg(), arg0[0], arg1[0]);
    brw_MAC(p, brw_null_reg(), arg0[1], arg1[1]);
-   brw_MAC(p, dst[0], arg0[2], arg1[2]);
+   brw_MAC(p, dst[dst_chan], arg0[2], arg1[2]);
 
    brw_set_saturate(p, (mask & SATURATE) ? 1 : 0);
-   brw_ADD(p, dst[0], dst[0], arg1[3]);
+   brw_ADD(p, dst[dst_chan], dst[dst_chan], arg1[3]);
    brw_set_saturate(p, 0);
 }
 
@@ -591,18 +707,19 @@ static void emit_math1( struct brw_compile *p,
 			GLuint mask,
 			const struct brw_reg *arg0 )
 {
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   //assert((mask & WRITEMASK_XYZW) == WRITEMASK_X ||
-   //	  function == BRW_MATH_FUNCTION_SINCOS);
-   
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
    brw_MOV(p, brw_message_reg(2), arg0[0]);
 
    /* Send two messages to perform all 16 operations:
     */
    brw_math_16(p, 
-	       dst[0],
+	       dst[dst_chan],
 	       function,
 	       (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
 	       2,
@@ -618,10 +735,12 @@ static void emit_math2( struct brw_compile *p,
 			const struct brw_reg *arg0,
 			const struct brw_reg *arg1)
 {
+   int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
    if (!(mask & WRITEMASK_XYZW))
       return; /* Do not emit dead code */
 
-   assert((mask & WRITEMASK_XYZW) == WRITEMASK_X);
+   assert(is_power_of_two(mask & WRITEMASK_XYZW));
 
    brw_push_insn_state(p);
 
@@ -640,7 +759,7 @@ static void emit_math2( struct brw_compile *p,
     */
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_math(p, 
-	    dst[0],
+	    dst[dst_chan],
 	    function,
 	    (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
 	    2,
@@ -650,7 +769,7 @@ static void emit_math2( struct brw_compile *p,
 
    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
    brw_math(p, 
-	    offset(dst[0],1),
+	    offset(dst[dst_chan],1),
 	    function,
 	    (mask & SATURATE) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
 	    4,
@@ -671,9 +790,9 @@ static void emit_tex( struct brw_wm_compile *c,
 {
    struct brw_compile *p = &c->func;
    GLuint msgLength, responseLength;
-   GLboolean shadow = (c->key.shadowtex_mask & (1<<inst->tex_unit)) ? 1 : 0;
    GLuint i, nr;
    GLuint emit;
+   GLuint msg_type;
 
    /* How many input regs are there?
     */
@@ -687,13 +806,17 @@ static void emit_tex( struct brw_wm_compile *c,
       emit = WRITEMASK_XY;
       nr = 2;
       break;
-   default:
+   case TEXTURE_3D_INDEX:
+   case TEXTURE_CUBE_INDEX:
       emit = WRITEMASK_XYZ;
       nr = 3;
       break;
+   default:
+      /* unexpected target */
+      abort();
    }
 
-   if (shadow) {
+   if (inst->tex_shadow) {
       nr = 4;
       emit |= WRITEMASK_W;
    }
@@ -711,19 +834,31 @@ static void emit_tex( struct brw_wm_compile *c,
 
    responseLength = 8;		/* always */
 
+   if (BRW_IS_IGDNG(p->brw)) {
+       if (inst->tex_shadow)
+           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG;
+       else
+           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG;
+   } else {
+       if (inst->tex_shadow)
+           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
+       else
+           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
+   }
+
    brw_SAMPLE(p, 
 	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
 	      1,
 	      retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
-	      inst->tex_unit + MAX_DRAW_BUFFERS, /* surface */
+              SURF_INDEX_TEXTURE(inst->tex_unit),
 	      inst->tex_unit,	  /* sampler */
 	      inst->writemask,
-	      (shadow ? 
-	       BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE : 
-	       BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE),
+	      msg_type, 
 	      responseLength,
 	      msgLength,
-	      0);	
+	      0,	
+	      1,
+	      BRW_SAMPLER_SIMD_MODE_SIMD16);	
 }
 
 
@@ -735,7 +870,7 @@ static void emit_txb( struct brw_wm_compile *c,
 {
    struct brw_compile *p = &c->func;
    GLuint msgLength;
-
+   GLuint msg_type;
    /* Shadow ignored for txb.
     */
    switch (inst->tex_idx) {
@@ -750,27 +885,38 @@ static void emit_txb( struct brw_wm_compile *c,
       brw_MOV(p, brw_message_reg(4), arg[1]);
       brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
       break;
-   default:
+   case TEXTURE_3D_INDEX:
+   case TEXTURE_CUBE_INDEX:
       brw_MOV(p, brw_message_reg(2), arg[0]);
       brw_MOV(p, brw_message_reg(4), arg[1]);
       brw_MOV(p, brw_message_reg(6), arg[2]);
       break;
+   default:
+      /* unexpected target */
+      abort();
    }
 
    brw_MOV(p, brw_message_reg(8), arg[3]);
    msgLength = 9;
 
+   if (BRW_IS_IGDNG(p->brw))
+       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG;
+   else
+       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+
    brw_SAMPLE(p, 
 	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
 	      1,
 	      retype(c->payload.depth[0].hw_reg, BRW_REGISTER_TYPE_UW),
-	      inst->tex_unit + MAX_DRAW_BUFFERS, /* surface */
+              SURF_INDEX_TEXTURE(inst->tex_unit),
 	      inst->tex_unit,	  /* sampler */
 	      inst->writemask,
-	      BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
+	      msg_type,
 	      8,		/* responseLength */
 	      msgLength,
-	      0);	
+	      0,	
+	      1,
+	      BRW_SAMPLER_SIMD_MODE_SIMD16);	
 }
 
 
@@ -833,6 +979,20 @@ static void emit_kil( struct brw_wm_compile *c,
    }
 }
 
+/* KIL_NV kills the pixels that are currently executing, not based on a test
+ * of the arguments.
+ */
+static void emit_kil_nv( struct brw_wm_compile *c )
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+
+   brw_push_insn_state(p);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK
+   brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
+   brw_pop_insn_state(p);
+}
 
 static void fire_fb_write( struct brw_wm_compile *c,
 			   GLuint base_reg,
@@ -886,6 +1046,9 @@ static void emit_aa( struct brw_wm_compile *c,
 
 /* Post-fragment-program processing.  Send the results to the
  * framebuffer.
+ * \param arg0  the fragment color
+ * \param arg1  the pass-through depth value
+ * \param arg2  the shader-computed depth value
  */
 static void emit_fb_write( struct brw_wm_compile *c,
 			   struct brw_reg *arg0,
@@ -979,7 +1142,7 @@ static void emit_fb_write( struct brw_wm_compile *c,
 	      get_element_ud(brw_vec8_grf(1,0), 6), 
 	      brw_imm_ud(1<<26)); 
 
-      jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
+      jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
       {
 	 emit_aa(c, arg1, 2);
 	 fire_fb_write(c, 0, nr, target, eot);
@@ -994,8 +1157,8 @@ static void emit_fb_write( struct brw_wm_compile *c,
 }
 
 
-/* Post-fragment-program processing.  Send the results to the
- * framebuffer.
+/**
+ * Move a GPR to scratch memory. 
  */
 static void emit_spill( struct brw_wm_compile *c,
 			struct brw_reg reg,
@@ -1014,11 +1177,13 @@ static void emit_spill( struct brw_wm_compile *c,
    */
    brw_dp_WRITE_16(p, 
 		   retype(vec16(brw_vec8_grf(0, 0)), BRW_REGISTER_TYPE_UW),
-		   1, 
 		   slot);
 }
 
 
+/**
+ * Load a GPR from scratch memory. 
+ */
 static void emit_unspill( struct brw_wm_compile *c,
 			  struct brw_reg reg,
 			  GLuint slot )
@@ -1039,13 +1204,13 @@ static void emit_unspill( struct brw_wm_compile *c,
 
    brw_dp_READ_16(p,
 		  retype(vec16(reg), BRW_REGISTER_TYPE_UW),
-		  1, 
 		  slot);
 }
 
 
 /**
- * Retrieve upto 4 GEN4 register pairs for the given wm reg:
+ * Retrieve up to 4 GEN4 register pairs for the given wm reg:
+ * Args with unspill_reg != 0 will be loaded from scratch memory.
  */
 static void get_argument_regs( struct brw_wm_compile *c,
 			       struct brw_wm_ref *arg[],
@@ -1055,13 +1220,12 @@ static void get_argument_regs( struct brw_wm_compile *c,
 
    for (i = 0; i < 4; i++) {
       if (arg[i]) {
-
-	 if (arg[i]->unspill_reg) 
-	    emit_unspill(c, 
+	 if (arg[i]->unspill_reg)
+	    emit_unspill(c,
 			 brw_vec8_grf(arg[i]->unspill_reg, 0),
 			 arg[i]->value->spill_slot);
 
-	 regs[i] = arg[i]->hw_reg;	 
+	 regs[i] = arg[i]->hw_reg;
       }
       else {
 	 regs[i] = brw_null_reg();
@@ -1070,6 +1234,9 @@ static void get_argument_regs( struct brw_wm_compile *c,
 }
 
 
+/**
+ * For values that have a spill_slot!=0, write those regs to scratch memory.
+ */
 static void spill_values( struct brw_wm_compile *c,
 			  struct brw_wm_value *values,
 			  GLuint nr )
@@ -1127,11 +1294,11 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 /* Generated instructions for calculating triangle interpolants:
 	  */
       case WM_PIXELXY:
-	 emit_pixel_xy(p, dst, dst_flags, args[0]);
+	 emit_pixel_xy(p, dst, dst_flags);
 	 break;
 
       case WM_DELTAXY:
-	 emit_delta_xy(p, dst, dst_flags, args[0], args[1]);
+	 emit_delta_xy(p, dst, dst_flags, args[0]);
 	 break;
 
       case WM_WPOSXY:
@@ -1158,6 +1325,10 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 emit_fb_write(c, args[0], args[1], args[2], inst->target, inst->eot);
 	 break;
 
+      case WM_FRONTFACING:
+	 emit_frontfacing(p, dst, dst_flags);
+	 break;
+
 	 /* Straightforward arithmetic:
 	  */
       case OPCODE_ADD:
@@ -1172,6 +1343,14 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
 	 break;
 
+      case OPCODE_DDX:
+	 emit_ddxy(p, dst, dst_flags, GL_TRUE, args[0]);
+	 break;
+
+      case OPCODE_DDY:
+	 emit_ddxy(p, dst, dst_flags, GL_FALSE, args[0]);
+	 break;
+
       case OPCODE_DP3:
 	 emit_dp3(p, dst, dst_flags, args[0], args[1]);
 	 break;
@@ -1184,6 +1363,10 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 emit_dph(p, dst, dst_flags, args[0], args[1]);
 	 break;
 
+      case OPCODE_TRUNC:
+	 emit_trunc(p, dst, dst_flags, args[0]);
+	 break;
+
       case OPCODE_LRP:
 	 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
 	 break;
@@ -1297,6 +1480,10 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 emit_kil(c, args[0]);
 	 break;
 
+      case OPCODE_KIL_NV:
+	 emit_kil_nv(c);
+	 break;
+
       default:
 	 _mesa_printf("Unsupported opcode %i (%s) in fragment shader\n",
 		      inst->opcode, inst->opcode < MAX_OPCODE ?
@@ -1310,4 +1497,13 @@ void brw_wm_emit( struct brw_wm_compile *c )
 		      inst->dst[i]->hw_reg, 
 		      inst->dst[i]->spill_slot);
    }
+
+   if (INTEL_DEBUG & DEBUG_WM) {
+      int i;
+
+      _mesa_printf("wm-native:\n");
+      for (i = 0; i < p->nr_insn; i++)
+	 brw_disasm(stderr, &p->store[i]);
+      _mesa_printf("\n");
+   }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c
index 6df2c95d80..4e3edfbbff 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_fp.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c
@@ -42,6 +42,12 @@
 #include "shader/prog_statevars.h"
 
 
+/** An invalid texture target */
+#define TEX_TARGET_NONE NUM_TEXTURE_TARGETS
+
+/** An invalid texture unit */
+#define TEX_UNIT_NONE BRW_MAX_TEX_UNIT
+
 #define FIRST_INTERNAL_TEMP MAX_NV_FRAGMENT_PROGRAM_TEMPS
 
 #define X    0
@@ -58,7 +64,8 @@ static const char *wm_opcode_strings[] = {
    "PINTERP",
    "CINTERP",
    "WPOSXY",
-   "FB_WRITE"
+   "FB_WRITE",
+   "FRONTFACING",
 };
 
 #if 0
@@ -79,9 +86,8 @@ static struct prog_src_register src_reg(GLuint file, GLuint idx)
    reg.Index = idx;
    reg.Swizzle = SWIZZLE_NOOP;
    reg.RelAddr = 0;
-   reg.NegateBase = 0;
+   reg.Negate = NEGATE_NONE;
    reg.Abs = 0;
-   reg.NegateAbs = 0;
    return reg;
 }
 
@@ -111,6 +117,12 @@ static struct prog_src_register src_swizzle1( struct prog_src_register reg, int
    return src_swizzle(reg, x, x, x, x);
 }
 
+static struct prog_src_register src_swizzle4( struct prog_src_register reg, uint swizzle )
+{
+   reg.Swizzle = swizzle;
+   return reg;
+}
+
 
 /***********************************************************************
  * Dest regs
@@ -123,7 +135,7 @@ static struct prog_dst_register dst_reg(GLuint file, GLuint idx)
    reg.Index = idx;
    reg.WriteMask = WRITEMASK_XYZW;
    reg.RelAddr = 0;
-   reg.CondMask = 0;
+   reg.CondMask = COND_TR;
    reg.CondSwizzle = 0;
    reg.CondSrc = 0;
    reg.pad = 0;
@@ -177,22 +189,31 @@ static struct prog_instruction *emit_insn(struct brw_wm_compile *c,
 {
    struct prog_instruction *inst = get_fp_inst(c);
    *inst = *inst0;
-   inst->Data = (void *)inst0;
    return inst;
 }
 
-static struct prog_instruction * emit_op(struct brw_wm_compile *c,
+static struct prog_instruction * emit_tex_op(struct brw_wm_compile *c,
 				       GLuint op,
 				       struct prog_dst_register dest,
 				       GLuint saturate,
 				       GLuint tex_src_unit,
 				       GLuint tex_src_target,
+				       GLuint tex_shadow,
 				       struct prog_src_register src0,
 				       struct prog_src_register src1,
 				       struct prog_src_register src2 )
 {
    struct prog_instruction *inst = get_fp_inst(c);
       
+   assert(tex_src_unit < BRW_MAX_TEX_UNIT ||
+          tex_src_unit == TEX_UNIT_NONE);
+   assert(tex_src_target < NUM_TEXTURE_TARGETS ||
+          tex_src_target == TEX_TARGET_NONE);
+
+   /* update mask of which texture units are referenced by this program */
+   if (tex_src_unit != TEX_UNIT_NONE)
+      c->fp->tex_units_used |= (1 << tex_src_unit);
+
    memset(inst, 0, sizeof(*inst));
 
    inst->Opcode = op;
@@ -200,6 +221,7 @@ static struct prog_instruction * emit_op(struct brw_wm_compile *c,
    inst->SaturateMode = saturate;   
    inst->TexSrcUnit = tex_src_unit;
    inst->TexSrcTarget = tex_src_target;
+   inst->TexShadow = tex_shadow;
    inst->SrcReg[0] = src0;
    inst->SrcReg[1] = src1;
    inst->SrcReg[2] = src2;
@@ -207,6 +229,53 @@ static struct prog_instruction * emit_op(struct brw_wm_compile *c,
 }
    
 
+static struct prog_instruction * emit_op(struct brw_wm_compile *c,
+				       GLuint op,
+				       struct prog_dst_register dest,
+				       GLuint saturate,
+				       struct prog_src_register src0,
+				       struct prog_src_register src1,
+				       struct prog_src_register src2 )
+{
+   return emit_tex_op(c, op, dest, saturate,
+                      TEX_UNIT_NONE, TEX_TARGET_NONE, 0,  /* unit, tgt, shadow */
+                      src0, src1, src2);
+}
+
+
+/* Many Mesa opcodes produce the same value across all the result channels.
+ * We'd rather not have to support that splatting in the opcode implementations,
+ * and brw_wm_pass*.c wants to optimize them out by shuffling references around
+ * anyway.  We can easily get both by emitting the opcode to one channel, and
+ * then MOVing it to the others, which brw_wm_pass*.c already understands.
+ */
+static struct prog_instruction *emit_scalar_insn(struct brw_wm_compile *c,
+						 const struct prog_instruction *inst0)
+{
+   struct prog_instruction *inst;
+   unsigned int dst_chan;
+   unsigned int other_channel_mask;
+
+   if (inst0->DstReg.WriteMask == 0)
+      return NULL;
+
+   dst_chan = _mesa_ffs(inst0->DstReg.WriteMask) - 1;
+   inst = get_fp_inst(c);
+   *inst = *inst0;
+   inst->DstReg.WriteMask = 1 << dst_chan;
+
+   other_channel_mask = inst0->DstReg.WriteMask & ~(1 << dst_chan);
+   if (other_channel_mask != 0) {
+      inst = emit_op(c,
+		     OPCODE_MOV,
+		     dst_mask(inst0->DstReg, other_channel_mask),
+		     0,
+		     src_swizzle1(src_reg_from_dst(inst0->DstReg), dst_chan),
+		     src_undef(),
+		     src_undef());
+   }
+   return inst;
+}
 
 
 /***********************************************************************
@@ -228,7 +297,7 @@ static struct prog_src_register get_pixel_xy( struct brw_wm_compile *c )
       emit_op(c,
 	      WM_PIXELXY,
 	      dst_mask(pixel_xy, WRITEMASK_XY),
-	      0, 0, 0,
+	      0,
 	      payload_r0_depth,
 	      src_undef(),
 	      src_undef());
@@ -251,7 +320,7 @@ static struct prog_src_register get_delta_xy( struct brw_wm_compile *c )
       emit_op(c,
 	      WM_DELTAXY,
 	      dst_mask(delta_xy, WRITEMASK_XY),
-	      0, 0, 0,
+	      0,
 	      pixel_xy, 
 	      payload_r0_depth,
 	      src_undef());
@@ -268,14 +337,13 @@ static struct prog_src_register get_pixel_w( struct brw_wm_compile *c )
       struct prog_dst_register pixel_w = get_temp(c);
       struct prog_src_register deltas = get_delta_xy(c);
       struct prog_src_register interp_wpos = src_reg(PROGRAM_PAYLOAD, FRAG_ATTRIB_WPOS);
-      
-      
+
       /* deltas.xyw = DELTAS2 deltas.xy, payload.interp_wpos.x
        */
       emit_op(c,
 	      WM_PIXELW,
 	      dst_mask(pixel_w, WRITEMASK_W),
-	      0, 0, 0,
+	      0,
 	      interp_wpos,
 	      deltas, 
 	      src_undef());
@@ -293,24 +361,19 @@ static void emit_interp( struct brw_wm_compile *c,
    struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx);
    struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
    struct prog_src_register deltas = get_delta_xy(c);
-   struct prog_src_register arg2;
-   GLuint opcode;
-   
+
    /* Need to use PINTERP on attributes which have been
     * multiplied by 1/W in the SF program, and LINTERP on those
     * which have not:
     */
    switch (idx) {
    case FRAG_ATTRIB_WPOS:
-      opcode = WM_LINTERP;
-      arg2 = src_undef();
-
       /* Have to treat wpos.xy specially:
        */
       emit_op(c,
 	      WM_WPOSXY,
 	      dst_mask(dst, WRITEMASK_XY),
-	      0, 0, 0,
+	      0,
 	      get_pixel_xy(c),
 	      src_undef(),
 	      src_undef());
@@ -322,10 +385,10 @@ static void emit_interp( struct brw_wm_compile *c,
       emit_op(c,
 	      WM_LINTERP,
 	      dst,
-	      0, 0, 0,
+	      0,
 	      interp,
 	      deltas,
-	      arg2);
+	      src_undef());
       break;
    case FRAG_ATTRIB_COL0:
    case FRAG_ATTRIB_COL1:
@@ -333,26 +396,95 @@ static void emit_interp( struct brw_wm_compile *c,
 	 emit_op(c,
 		 WM_CINTERP,
 		 dst,
-		 0, 0, 0,
+		 0,
 		 interp,
 		 src_undef(),
 		 src_undef());
       }
       else {
-	 emit_op(c,
-		 WM_LINTERP,
-		 dst,
-		 0, 0, 0,
-		 interp,
-		 deltas,
-		 src_undef());
+         if (c->key.linear_color) {
+            emit_op(c,
+                    WM_LINTERP,
+                    dst,
+                    0,
+                    interp,
+                    deltas,
+                    src_undef());
+         }
+         else {
+            /* perspective-corrected color interpolation */
+            emit_op(c,
+                    WM_PINTERP,
+                    dst,
+                    0,
+                    interp,
+                    deltas,
+                    get_pixel_w(c));
+         }
       }
       break;
+   case FRAG_ATTRIB_FOGC:
+      /* Interpolate the fog coordinate */
+      emit_op(c,
+	      WM_PINTERP,
+	      dst_mask(dst, WRITEMASK_X),
+	      0,
+	      interp,
+	      deltas,
+	      get_pixel_w(c));
+
+      emit_op(c,
+	      OPCODE_MOV,
+	      dst_mask(dst, WRITEMASK_YZW),
+	      0,
+	      src_swizzle(interp,
+			  SWIZZLE_ZERO,
+			  SWIZZLE_ZERO,
+			  SWIZZLE_ZERO,
+			  SWIZZLE_ONE),
+	      src_undef(),
+	      src_undef());
+      break;
+
+   case FRAG_ATTRIB_FACE:
+      /* XXX review/test this case */
+      emit_op(c,
+              WM_FRONTFACING,
+              dst_mask(dst, WRITEMASK_X),
+              0,
+              src_undef(),
+              src_undef(),
+              src_undef());
+      break;
+
+   case FRAG_ATTRIB_PNTC:
+      /* XXX review/test this case */
+      emit_op(c,
+	      WM_PINTERP,
+	      dst_mask(dst, WRITEMASK_XY),
+	      0,
+	      interp,
+	      deltas,
+	      get_pixel_w(c));
+
+      emit_op(c,
+	      OPCODE_MOV,
+	      dst_mask(dst, WRITEMASK_ZW),
+	      0,
+	      src_swizzle(interp,
+			  SWIZZLE_ZERO,
+			  SWIZZLE_ZERO,
+			  SWIZZLE_ZERO,
+			  SWIZZLE_ONE),
+	      src_undef(),
+	      src_undef());
+      break;
+
    default:
       emit_op(c,
 	      WM_PINTERP,
 	      dst,
-	      0, 0, 0,
+	      0,
 	      interp,
 	      deltas,
 	      get_pixel_w(c));
@@ -362,38 +494,6 @@ static void emit_interp( struct brw_wm_compile *c,
    c->fp_interp_emitted |= 1<<idx;
 }
 
-static void emit_ddx( struct brw_wm_compile *c,
-        const struct prog_instruction *inst )
-{
-    GLuint idx = inst->SrcReg[0].Index;
-    struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
-
-    c->fp_deriv_emitted |= 1<<idx;
-    emit_op(c,
-            OPCODE_DDX,
-            inst->DstReg,
-            0, 0, 0,
-            interp,
-            get_pixel_w(c),
-            src_undef());
-}
-
-static void emit_ddy( struct brw_wm_compile *c,
-        const struct prog_instruction *inst )
-{
-    GLuint idx = inst->SrcReg[0].Index;
-    struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
-
-    c->fp_deriv_emitted |= 1<<idx;
-    emit_op(c,
-            OPCODE_DDY,
-            inst->DstReg,
-            0, 0, 0,
-            interp,
-            get_pixel_w(c),
-            src_undef());
-}
-
 /***********************************************************************
  * Hacks to extend the program parameter and constant lists.
  */
@@ -483,13 +583,12 @@ static void precalc_dst( struct brw_wm_compile *c,
       emit_op(c,
 	      OPCODE_MUL,
 	      dst_mask(dst, WRITEMASK_Y),
-	      inst->SaturateMode, 0, 0,
+	      inst->SaturateMode,
 	      src0,
 	      src1,
 	      src_undef());
    }
 
-
    if (dst.WriteMask & WRITEMASK_XZ) {
       struct prog_instruction *swz;
       GLuint z = GET_SWZ(src0.Swizzle, Z);
@@ -499,12 +598,12 @@ static void precalc_dst( struct brw_wm_compile *c,
       swz = emit_op(c,
 		    OPCODE_SWZ,
 		    dst_mask(dst, WRITEMASK_XZ),
-		    inst->SaturateMode, 0, 0,
+		    inst->SaturateMode,
 		    src_swizzle(src0, SWIZZLE_ONE, z, z, z),
 		    src_undef(),
 		    src_undef());
       /* Avoid letting negation flag of src0 affect our 1 constant. */
-      swz->SrcReg[0].NegateBase &= ~NEGATE_X;
+      swz->SrcReg[0].Negate &= ~NEGATE_X;
    }
    if (dst.WriteMask & WRITEMASK_W) {
       /* dst.w = mov src1.w
@@ -512,7 +611,7 @@ static void precalc_dst( struct brw_wm_compile *c,
       emit_op(c,
 	      OPCODE_MOV,
 	      dst_mask(dst, WRITEMASK_W),
-	      inst->SaturateMode, 0, 0,
+	      inst->SaturateMode,
 	      src1,
 	      src_undef(),
 	      src_undef());
@@ -534,32 +633,40 @@ static void precalc_lit( struct brw_wm_compile *c,
       swz = emit_op(c,
 		    OPCODE_SWZ,
 		    dst_mask(dst, WRITEMASK_XW),
-		    0, 0, 0,
+		    0,
 		    src_swizzle1(src0, SWIZZLE_ONE),
 		    src_undef(),
 		    src_undef());
       /* Avoid letting the negation flag of src0 affect our 1 constant. */
-      swz->SrcReg[0].NegateBase = 0;
+      swz->SrcReg[0].Negate = NEGATE_NONE;
    }
 
-
    if (dst.WriteMask & WRITEMASK_YZ) {
       emit_op(c,
 	      OPCODE_LIT,
 	      dst_mask(dst, WRITEMASK_YZ),
-	      inst->SaturateMode, 0, 0,
+	      inst->SaturateMode,
 	      src0,
 	      src_undef(),
 	      src_undef());
    }
 }
 
+
+/**
+ * Some TEX instructions require extra code, such as cube map
+ * coordinate normalization or coordinate scaling for RECT textures.
+ * This function emits those extra instructions and the TEX
+ * instruction itself.
+ */
 static void precalc_tex( struct brw_wm_compile *c,
 			 const struct prog_instruction *inst )
 {
    struct prog_src_register coord;
    struct prog_dst_register tmpcoord;
-   GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
+   const GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
+
+   assert(unit < BRW_MAX_TEX_UNIT);
 
    if (inst->TexSrcTarget == TEXTURE_CUBE_INDEX) {
        struct prog_instruction *out;
@@ -569,49 +676,56 @@ static void precalc_tex( struct brw_wm_compile *c,
        struct prog_src_register tmp1src = src_reg_from_dst(tmp1);
        struct prog_src_register src0 = inst->SrcReg[0];
 
+       /* scale the coord vector by 1 / (its largest-magnitude component) */
        tmpcoord = get_temp(c);
        coord = src_reg_from_dst(tmpcoord);
 
+       /* tmpcoord = src0 (i.e.: coord = src0) */
        out = emit_op(c, OPCODE_MOV,
                      tmpcoord,
-                     0, 0, 0,
+                     0,
                      src0,
                      src_undef(),
                      src_undef());
-       out->SrcReg[0].NegateBase = 0;
+       out->SrcReg[0].Negate = NEGATE_NONE;
        out->SrcReg[0].Abs = 1;
 
+       /* tmp0 = MAX(coord.X, coord.Y) */
        emit_op(c, OPCODE_MAX,
                tmp0,
-               0, 0, 0,
+               0,
                src_swizzle1(coord, X),
                src_swizzle1(coord, Y),
                src_undef());
 
+       /* tmp1 = MAX(tmp0, coord.Z) */
        emit_op(c, OPCODE_MAX,
                tmp1,
-               0, 0, 0,
+               0,
                tmp0src,
                src_swizzle1(coord, Z),
                src_undef());
 
+       /* tmp0.x = 1 / tmp1 */
        emit_op(c, OPCODE_RCP,
-               tmp0,
-               0, 0, 0,
+               dst_mask(tmp0, WRITEMASK_X),
+               0,
                tmp1src,
                src_undef(),
                src_undef());
 
+       /* tmpcoord = src0 * tmp0.xxxx */
        emit_op(c, OPCODE_MUL,
                tmpcoord,
-               0, 0, 0,
+               0,
                src0,
-               tmp0src,
+               src_swizzle1(tmp0src, SWIZZLE_X),
                src_undef());
 
        release_temp(c, tmp0);
        release_temp(c, tmp1);
-   } else if (inst->TexSrcTarget == TEXTURE_RECT_INDEX) {
+   }
+   else if (inst->TexSrcTarget == TEXTURE_RECT_INDEX) {
       struct prog_src_register scale = 
 	 search_or_add_param5( c, 
 			       STATE_INTERNAL, 
@@ -626,9 +740,13 @@ static void precalc_tex( struct brw_wm_compile *c,
       emit_op(c,
 	      OPCODE_MUL,
 	      tmpcoord,
-	      0, 0, 0,
+	      0,
 	      inst->SrcReg[0],
-	      scale,
+	      src_swizzle(scale,
+			  SWIZZLE_X,
+			  SWIZZLE_Y,
+			  SWIZZLE_ONE,
+			  SWIZZLE_ONE),
 	      src_undef());
 
       coord = src_reg_from_dst(tmpcoord);
@@ -642,19 +760,9 @@ static void precalc_tex( struct brw_wm_compile *c,
     * conversion requires allocating a temporary variable which we
     * don't have the facility to do that late in the compilation.
     */
-   if (!(c->key.yuvtex_mask & (1<<unit))) {
-      emit_op(c, 
-	      OPCODE_TEX,
-	      inst->DstReg,
-	      inst->SaturateMode,
-	      unit,
-	      inst->TexSrcTarget,
-	      coord,
-	      src_undef(),
-	      src_undef());
-   }
-   else {
-       GLboolean  swap_uv = c->key.yuvtex_swap_mask & (1<<unit);
+   if (c->key.yuvtex_mask & (1 << unit)) {
+      /* convert ycbcr to RGBA */
+      GLboolean  swap_uv = c->key.yuvtex_swap_mask & (1<<unit);
 
       /* 
 	 CONST C0 = { -.5, -.0625,  -.5, 1.164 }
@@ -676,22 +784,23 @@ static void precalc_tex( struct brw_wm_compile *c,
      
       /* tmp     = TEX ...
        */
-      emit_op(c, 
-	      OPCODE_TEX,
-	      tmp,
-	      inst->SaturateMode,
-	      unit,
-	      inst->TexSrcTarget,
-	      coord,
-	      src_undef(),
-	      src_undef());
+      emit_tex_op(c, 
+                  OPCODE_TEX,
+                  tmp,
+                  inst->SaturateMode,
+                  unit,
+                  inst->TexSrcTarget,
+                  inst->TexShadow,
+                  coord,
+                  src_undef(),
+                  src_undef());
 
       /* tmp.xyz =  ADD TMP, C0
        */
       emit_op(c,
 	      OPCODE_ADD,
 	      dst_mask(tmp, WRITEMASK_XYZ),
-	      0, 0, 0,
+	      0,
 	      tmpsrc,
 	      C0,
 	      src_undef());
@@ -702,7 +811,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       emit_op(c,
 	      OPCODE_MUL,
 	      dst_mask(tmp, WRITEMASK_Y),
-	      0, 0, 0,
+	      0,
 	      tmpsrc,
 	      src_swizzle1(C0, W),
 	      src_undef());
@@ -717,7 +826,7 @@ static void precalc_tex( struct brw_wm_compile *c,
       emit_op(c,
 	      OPCODE_MAD,
 	      dst_mask(dst, WRITEMASK_XYZ),
-	      0, 0, 0,
+	      0,
 	      swap_uv?src_swizzle(tmpsrc, Z,Z,X,X):src_swizzle(tmpsrc, X,X,Z,Z),
 	      C1,
 	      src_swizzle1(tmpsrc, Y));
@@ -727,13 +836,38 @@ static void precalc_tex( struct brw_wm_compile *c,
       emit_op(c,
 	      OPCODE_MAD,
 	      dst_mask(dst, WRITEMASK_Y),
-	      0, 0, 0,
+	      0,
 	      src_swizzle1(tmpsrc, Z),
 	      src_swizzle1(C1, W),
 	      src_swizzle1(src_reg_from_dst(dst), Y));
 
       release_temp(c, tmp);
    }
+   else {
+      /* ordinary RGBA tex instruction */
+      emit_tex_op(c, 
+                  OPCODE_TEX,
+                  inst->DstReg,
+                  inst->SaturateMode,
+                  unit,
+                  inst->TexSrcTarget,
+                  inst->TexShadow,
+                  coord,
+                  src_undef(),
+                  src_undef());
+   }
+
+   /* For GL_EXT_texture_swizzle: */
+   if (c->key.tex_swizzles[unit] != SWIZZLE_NOOP) {
+      /* swizzle the result of the TEX instruction */
+      struct prog_src_register tmpsrc = src_reg_from_dst(inst->DstReg);
+      emit_op(c, OPCODE_SWZ,
+              inst->DstReg,
+              SATURATE_OFF, /* saturate already done above */
+              src_swizzle4(tmpsrc, c->key.tex_swizzles[unit]),
+              src_undef(),
+              src_undef());
+   }
 
    if ((inst->TexSrcTarget == TEXTURE_RECT_INDEX) ||
        (inst->TexSrcTarget == TEXTURE_CUBE_INDEX))
@@ -741,10 +875,16 @@ static void precalc_tex( struct brw_wm_compile *c,
 }
 
 
+/**
+ * Check if the given TXP instruction really needs the divide-by-W step.
+ */
 static GLboolean projtex( struct brw_wm_compile *c,
 			  const struct prog_instruction *inst )
 {
-   struct prog_src_register src = inst->SrcReg[0];
+   const struct prog_src_register src = inst->SrcReg[0];
+   GLboolean retVal;
+
+   assert(inst->Opcode == OPCODE_TXP);
 
    /* Only try to detect the simplest cases.  Could detect (later)
     * cases where we are trying to emit code like RCP {1.0}, MUL x,
@@ -754,16 +894,21 @@ static GLboolean projtex( struct brw_wm_compile *c,
     * user-provided fragment programs anyway:
     */
    if (inst->TexSrcTarget == TEXTURE_CUBE_INDEX)
-      return 0;  /* ut2004 gun rendering !?! */
+      retVal = GL_FALSE;  /* ut2004 gun rendering !?! */
    else if (src.File == PROGRAM_INPUT && 
 	    GET_SWZ(src.Swizzle, W) == W &&
-           (c->key.projtex_mask & (1<<(src.Index + FRAG_ATTRIB_WPOS - FRAG_ATTRIB_TEX0))) == 0)
-      return 0;
+            (c->key.proj_attrib_mask & (1 << src.Index)) == 0)
+      retVal = GL_FALSE;
    else
-      return 1;
+      retVal = GL_TRUE;
+
+   return retVal;
 }
 
 
+/**
+ * Emit code for TXP.
+ */
 static void precalc_txp( struct brw_wm_compile *c,
 			       const struct prog_instruction *inst )
 {
@@ -778,7 +923,7 @@ static void precalc_txp( struct brw_wm_compile *c,
       emit_op(c,
 	      OPCODE_RCP,
 	      dst_mask(tmp, WRITEMASK_W),
-	      0, 0, 0,
+	      0,
 	      src_swizzle1(src0, GET_SWZ(src0.Swizzle, W)),
 	      src_undef(),
 	      src_undef());
@@ -788,7 +933,7 @@ static void precalc_txp( struct brw_wm_compile *c,
       emit_op(c,
 	      OPCODE_MUL,
 	      dst_mask(tmp, WRITEMASK_XYZ),
-	      0, 0, 0,
+	      0,
 	      src0,
 	      src_swizzle1(src_reg_from_dst(tmp), W),
 	      src_undef());
@@ -814,42 +959,41 @@ static void precalc_txp( struct brw_wm_compile *c,
 static void emit_fb_write( struct brw_wm_compile *c )
 {
    struct prog_src_register payload_r0_depth = src_reg(PROGRAM_PAYLOAD, PAYLOAD_DEPTH);
-   struct prog_src_register outdepth = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DEPR);
+   struct prog_src_register outdepth = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DEPTH);
    struct prog_src_register outcolor;
    GLuint i;
 
    struct prog_instruction *inst, *last_inst;
    struct brw_context *brw = c->func.brw;
 
-   /* inst->Sampler is not used by backend, 
-      use it for fb write target and eot */
-
-   if (brw->state.nr_draw_regions > 1) {
-       for (i = 0 ; i < brw->state.nr_draw_regions; i++) {
-	   outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0 + i);
-	   last_inst = inst = emit_op(c,
-		   WM_FB_WRITE, dst_mask(dst_undef(),0), 0, 0, 0,
-		   outcolor, payload_r0_depth, outdepth);
-	   inst->Sampler = (i<<1);
-	   if (c->fp_fragcolor_emitted) {
-	       outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLR);
-	       last_inst = inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0),
-		       0, 0, 0, outcolor, payload_r0_depth, outdepth);
-	       inst->Sampler = (i<<1);
-	   }
-       }
-       last_inst->Sampler |= 1; //eot
+   /* The inst->Aux field is used for FB write target and the EOT marker */
+
+   if (brw->state.nr_color_regions > 1) {
+      for (i = 0 ; i < brw->state.nr_color_regions; i++) {
+         outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0 + i);
+         last_inst = inst = emit_op(c,
+                                    WM_FB_WRITE, dst_mask(dst_undef(),0), 0,
+                                    outcolor, payload_r0_depth, outdepth);
+         inst->Aux = (i<<1);
+         if (c->fp_fragcolor_emitted) {
+            outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLOR);
+            last_inst = inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0),
+                                       0, outcolor, payload_r0_depth, outdepth);
+            inst->Aux = (i<<1);
+         }
+      }
+      last_inst->Aux |= 1; //eot
    }
    else {
       /* if gl_FragData[0] is written, use it, else use gl_FragColor */
       if (c->fp->program.Base.OutputsWritten & (1 << FRAG_RESULT_DATA0))
          outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_DATA0);
       else 
-         outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLR);
+         outcolor = src_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLOR);
 
-       inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0),
-	       0, 0, 0, outcolor, payload_r0_depth, outdepth);
-       inst->Sampler = 1|(0<<1);
+      inst = emit_op(c, WM_FB_WRITE, dst_mask(dst_undef(),0),
+                     0, outcolor, payload_r0_depth, outdepth);
+      inst->Aux = 1|(0<<1);
    }
 }
 
@@ -880,9 +1024,9 @@ static void validate_dst_regs( struct brw_wm_compile *c,
 			       const struct prog_instruction *inst )
 {
    if (inst->DstReg.File == PROGRAM_OUTPUT) {
-       GLuint idx = inst->DstReg.Index;
-       if (idx == FRAG_RESULT_COLR)
-	   c->fp_fragcolor_emitted = 1;
+      GLuint idx = inst->DstReg.Index;
+      if (idx == FRAG_RESULT_COLOR)
+         c->fp_fragcolor_emitted = 1;
    }
 }
 
@@ -902,11 +1046,15 @@ static void print_insns( const struct prog_instruction *insn,
 				     3);
       }
       else 
-	 _mesa_printf("UNKNOWN\n");
-	   
+	 _mesa_printf("965 Opcode %d\n", insn->Opcode);
    }
 }
 
+
+/**
+ * Initial pass for fragment program code generation.
+ * This function is used by both the GLSL and non-GLSL paths.
+ */
 void brw_wm_pass_fp( struct brw_wm_compile *c )
 {
    struct brw_fragment_program *fp = c->fp;
@@ -922,16 +1070,21 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
    c->delta_xy = src_undef();
    c->pixel_w = src_undef();
    c->nr_fp_insns = 0;
+   c->fp->tex_units_used = 0x0;
 
-   /* Emit preamble instructions:
+   /* Emit preamble instructions.  This is where special instructions such as
+    * WM_CINTERP, WM_LINTERP, WM_PINTERP and WM_WPOSXY are emitted to
+    * compute shader inputs from varying vars.
     */
-
-
    for (insn = 0; insn < fp->program.Base.NumInstructions; insn++) {
       const struct prog_instruction *inst = &fp->program.Base.Instructions[insn];
       validate_src_regs(c, inst);
       validate_dst_regs(c, inst);
    }
+
+   /* Loop over all instructions doing assorted simplifications and
+    * transformations.
+    */
    for (insn = 0; insn < fp->program.Base.NumInstructions; insn++) {
       const struct prog_instruction *inst = &fp->program.Base.Instructions[insn];
       struct prog_instruction *out;
@@ -940,7 +1093,6 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
        * necessary:
        */
 
-
       switch (inst->Opcode) {
       case OPCODE_SWZ: 
 	 out = emit_insn(c, inst);
@@ -950,14 +1102,14 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
       case OPCODE_ABS:
 	 out = emit_insn(c, inst);
 	 out->Opcode = OPCODE_MOV;
-	 out->SrcReg[0].NegateBase = 0;
+	 out->SrcReg[0].Negate = NEGATE_NONE;
 	 out->SrcReg[0].Abs = 1;
 	 break;
 
       case OPCODE_SUB: 
 	 out = emit_insn(c, inst);
 	 out->Opcode = OPCODE_ADD;
-	 out->SrcReg[1].NegateBase ^= 0xf;
+	 out->SrcReg[1].Negate ^= NEGATE_XYZW;
 	 break;
 
       case OPCODE_SCS: 
@@ -986,6 +1138,7 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
       case OPCODE_TXB:
 	 out = emit_insn(c, inst);
 	 out->TexSrcUnit = fp->program.Base.SamplerUnits[inst->TexSrcUnit];
+         assert(out->TexSrcUnit < BRW_MAX_TEX_UNIT);
 	 break;
 
       case OPCODE_XPD: 
@@ -1001,28 +1154,24 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	  */
 	 out->DstReg.WriteMask = 0;
 	 break;
-      case OPCODE_DDX:
-	 emit_ddx(c, inst);
-	 break;
-      case OPCODE_DDY:
-         emit_ddy(c, inst);
-	break;
       case OPCODE_END:
 	 emit_fb_write(c);
 	 break;
       case OPCODE_PRINT:
 	 break;
-	 
       default:
-	 emit_insn(c, inst);
+	 if (brw_wm_is_scalar_result(inst->Opcode))
+	    emit_scalar_insn(c, inst);
+	 else
+	    emit_insn(c, inst);
 	 break;
       }
    }
 
    if (INTEL_DEBUG & DEBUG_WM) {
-	   _mesa_printf("pass_fp:\n");
-	   print_insns( c->prog_instructions, c->nr_fp_insns );
-	   _mesa_printf("\n");
+      _mesa_printf("pass_fp:\n");
+      print_insns( c->prog_instructions, c->nr_fp_insns );
+      _mesa_printf("\n");
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
index 8fd776ac39..c9fe1dd8ad 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
@@ -1,5 +1,7 @@
 #include "main/macros.h"
 #include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+#include "shader/prog_optimize.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
@@ -8,21 +10,28 @@ enum _subroutine {
     SUB_NOISE1, SUB_NOISE2, SUB_NOISE3, SUB_NOISE4
 };
 
-/* Only guess, need a flag in gl_fragment_program later */
+static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
+                                  const struct prog_instruction *inst,
+                                  GLuint component);
+
+/**
+ * Determine if the given fragment program uses GLSL features such
+ * as flow conditionals, loops, subroutines.
+ * Some GLSL shaders may use these features, others might not.
+ */
 GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
 {
     int i;
+
     for (i = 0; i < fp->Base.NumInstructions; i++) {
-	struct prog_instruction *inst = &fp->Base.Instructions[i];
+	const struct prog_instruction *inst = &fp->Base.Instructions[i];
 	switch (inst->Opcode) {
+	    case OPCODE_ARL:
 	    case OPCODE_IF:
-	    case OPCODE_TRUNC:
 	    case OPCODE_ENDIF:
 	    case OPCODE_CAL:
 	    case OPCODE_BRK:
 	    case OPCODE_RET:
-	    case OPCODE_DDX:
-	    case OPCODE_DDY:
 	    case OPCODE_NOISE1:
 	    case OPCODE_NOISE2:
 	    case OPCODE_NOISE3:
@@ -36,6 +45,87 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
     return GL_FALSE; 
 }
 
+
+
+static void
+reclaim_temps(struct brw_wm_compile *c);
+
+
+/** Mark GRF register as used. */
+static void
+prealloc_grf(struct brw_wm_compile *c, int r)
+{
+   c->used_grf[r] = GL_TRUE;
+}
+
+
+/** Mark given GRF register as not in use. */
+static void
+release_grf(struct brw_wm_compile *c, int r)
+{
+   /*assert(c->used_grf[r]);*/
+   c->used_grf[r] = GL_FALSE;
+   c->first_free_grf = MIN2(c->first_free_grf, r);
+}
+
+
+/** Return index of a free GRF (marking it as used), or -1 if none is free. */
+static int
+alloc_grf(struct brw_wm_compile *c)
+{
+   GLuint r;
+   for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
+      if (!c->used_grf[r]) {
+         c->used_grf[r] = GL_TRUE;
+         c->first_free_grf = r + 1;  /* a guess */
+         return r;
+      }
+   }
+
+   /* no free temps, try to reclaim some */
+   reclaim_temps(c);
+   c->first_free_grf = 0;
+
+   /* try alloc again */
+   for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
+      if (!c->used_grf[r]) {
+         c->used_grf[r] = GL_TRUE;
+         c->first_free_grf = r + 1;  /* a guess */
+         return r;
+      }
+   }
+
+   for (r = 0; r < BRW_WM_MAX_GRF; r++) {
+      assert(c->used_grf[r]);
+   }
+
+   /* really, no free GRF regs found */
+   if (!c->out_of_regs) {
+      /* print warning once per compilation */
+      _mesa_warning(NULL, "i965: ran out of registers for fragment program");
+      c->out_of_regs = GL_TRUE;
+   }
+
+   return -1;
+}
+
+
+/** Return one past the index of the highest GRF register in use (0 if none) */
+static int
+num_grf_used(const struct brw_wm_compile *c)
+{
+   int r;
+   for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
+      if (c->used_grf[r])
+         return r + 1;
+   return 0;
+}
+
+
+
+/**
+ * Record the mapping of a Mesa register to a hardware register.
+ */
 static void set_reg(struct brw_wm_compile *c, int file, int index, 
 	int component, struct brw_reg reg)
 {
@@ -43,25 +133,32 @@ static void set_reg(struct brw_wm_compile *c, int file, int index,
     c->wm_regs[file][index][component].inited = GL_TRUE;
 }
 
-static int get_scalar_dst_index(struct prog_instruction *inst)
-{
-    int i;
-    for (i = 0; i < 4; i++)
-	if (inst->DstReg.WriteMask & (1<<i))
-	    break;
-    return i;
-}
-
 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 {
     struct brw_reg reg;
-    if(c->tmp_index == c->tmp_max)
-	c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
-    
+
+    /* if we need to allocate another temp, grow the tmp_regs[] array */
+    if (c->tmp_index == c->tmp_max) {
+       int r = alloc_grf(c);
+       if (r < 0) {
+          /*printf("Out of temps in %s\n", __FUNCTION__);*/
+          r = 50; /* XXX random register! */
+       }
+       c->tmp_regs[ c->tmp_max++ ] = r;
+    }
+
+    /* form the GRF register */
     reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
+    /*printf("alloc_temp %d\n", reg.nr);*/
+    assert(reg.nr < BRW_WM_MAX_GRF);
     return reg;
+
 }
 
+/**
+ * Save current temp register info.
+ * There must be a matching call to release_tmps().
+ */
 static int mark_tmps(struct brw_wm_compile *c)
 {
     return c->tmp_index;
@@ -77,8 +174,22 @@ static void release_tmps(struct brw_wm_compile *c, int mark)
     c->tmp_index = mark;
 }
 
+/**
+ * Convert Mesa src register to brw register.
+ *
+ * Since we're running in SOA mode each Mesa register corresponds to four
+ * hardware registers.  We allocate the hardware registers as needed here.
+ *
+ * \param file  register file, one of PROGRAM_x
+ * \param index  register number
+ * \param component  src component (X=0, Y=1, Z=2, W=3)
+ * \param nr  not used?!?
+ * \param neg  negate value?
+ * \param abs  take absolute value?
+ */
 static struct brw_reg 
-get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GLuint neg, GLuint abs)
+get_reg(struct brw_wm_compile *c, int file, int index, int component,
+        int nr, GLuint neg, GLuint abs)
 {
     struct brw_reg reg;
     switch (file) {
@@ -89,21 +200,40 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GL
 	    break;
 	case PROGRAM_UNDEFINED:
 	    return brw_null_reg();	
-	default:
+	case PROGRAM_TEMPORARY:
+	case PROGRAM_INPUT:
+	case PROGRAM_OUTPUT:
+	case PROGRAM_PAYLOAD:
 	    break;
+	default:
+	    _mesa_problem(NULL, "Unexpected file in get_reg()");
+	    return brw_null_reg();
     }
 
-    if(c->wm_regs[file][index][component].inited)
-	reg = c->wm_regs[file][index][component].reg;
-    else 
-	reg = brw_vec8_grf(c->reg_index, 0);
+    assert(index < 256);
+    assert(component < 4);
 
-    if(!c->wm_regs[file][index][component].inited) {
-	set_reg(c, file, index, component, reg);
-	c->reg_index++;
+    /* see if we've already allocated a HW register for this Mesa register */
+    if (c->wm_regs[file][index][component].inited) {
+       /* yes, re-use */
+       reg = c->wm_regs[file][index][component].reg;
+    }
+    else {
+	/* no, allocate new register */
+       int grf = alloc_grf(c);
+       /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
+       if (grf < 0) {
+          /* totally out of temps */
+          grf = 51; /* XXX random register! */
+       }
+
+       reg = brw_vec8_grf(grf, 0);
+       /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
+
+       set_reg(c, file, index, component, reg);
     }
 
-    if (neg & (1<< component)) {
+    if (neg & (1 << component)) {
 	reg = negate(reg);
     }
     if (abs)
@@ -111,78 +241,360 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component, int nr, GL
     return reg;
 }
 
+
+
+/**
+ * This is called if we run out of GRF registers.  Examine the live intervals
+ * of temp regs in the program and free those which won't be used again.
+ */
+static void
+reclaim_temps(struct brw_wm_compile *c)
+{
+   GLint intBegin[MAX_PROGRAM_TEMPS];
+   GLint intEnd[MAX_PROGRAM_TEMPS];
+   int index;
+
+   /*printf("Reclaim temps:\n");*/
+
+   _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
+                             intBegin, intEnd);
+
+   for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
+      if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
+         /* program temp[index] can be freed */
+         int component;
+         /*printf("  temp[%d] is dead\n", index);*/
+         for (component = 0; component < 4; component++) {
+            if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
+               int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
+               release_grf(c, r);
+               /*
+               printf("  Reclaim temp %d, reg %d at inst %d\n",
+                      index, r, c->cur_inst);
+               */
+               c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
+            }
+         }
+      }
+   }
+}
+
+
+
+
+/**
+ * Preallocate registers.  This sets up the Mesa to hardware register
+ * mapping for certain registers, such as constants (uniforms/state vars)
+ * and shader inputs.
+ */
 static void prealloc_reg(struct brw_wm_compile *c)
 {
     int i, j;
     struct brw_reg reg;
-    int nr_interp_regs = 0;
-    GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
+    int urb_read_length = 0;
+    GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
+    GLuint reg_index = 0;
+
+    memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
+    c->first_free_grf = 0;
 
     for (i = 0; i < 4; i++) {
-	reg = (i < c->key.nr_depth_regs) 
-	    ? brw_vec8_grf(i*2, 0) : brw_vec8_grf(0, 0);
+        if (i < c->key.nr_depth_regs) 
+            reg = brw_vec8_grf(i * 2, 0);
+        else
+            reg = brw_vec8_grf(0, 0);
 	set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
     }
-    c->reg_index += 2*c->key.nr_depth_regs;
+    reg_index += 2 * c->key.nr_depth_regs;
+
+    /* constants */
     {
-	int nr_params = c->fp->program.Base.Parameters->NumParameters;
-	struct gl_program_parameter_list *plist = 
-	    c->fp->program.Base.Parameters;
-	int index = 0;
-	c->prog_data.nr_params = 4*nr_params;
-	for (i = 0; i < nr_params; i++) {
-	    for (j = 0; j < 4; j++, index++) {
-		reg = brw_vec1_grf(c->reg_index + index/8, 
-			index%8);
-		c->prog_data.param[index] = 
-		    &plist->ParameterValues[i][j];
-		set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
-	    }
-	}
-	c->nr_creg = 2*((4*nr_params+15)/16);
-	c->reg_index += c->nr_creg;
+        const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
+        const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
+
+        /* use a real constant buffer, or just use a section of the GRF? */
+        /* XXX this heuristic may need adjustment... */
+        if ((nr_params + nr_temps) * 4 + reg_index > 80)
+           c->fp->use_const_buffer = GL_TRUE;
+        else
+           c->fp->use_const_buffer = GL_FALSE;
+        /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
+
+        if (c->fp->use_const_buffer) {
+           /* We'll use a real constant buffer and fetch constants from
+            * it with a dataport read message.
+            */
+
+           /* number of float constants in CURBE */
+           c->prog_data.nr_params = 0;
+        }
+        else {
+           const struct gl_program_parameter_list *plist = 
+              c->fp->program.Base.Parameters;
+           int index = 0;
+
+           /* number of float constants in CURBE */
+           c->prog_data.nr_params = 4 * nr_params;
+
+           /* loop over program constants (float[4]) */
+           for (i = 0; i < nr_params; i++) {
+              /* loop over XYZW channels */
+              for (j = 0; j < 4; j++, index++) {
+                 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
+                 /* Save pointer to parameter/constant value.
+                  * Constants will be copied in prepare_constant_buffer()
+                  */
+                 c->prog_data.param[index] = &plist->ParameterValues[i][j];
+                 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
+              }
+           }
+           /* number of constant regs used (each reg is float[8]) */
+           c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
+           reg_index += c->nr_creg;
+        }
     }
-    for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
-	if (inputs & (1<<i)) {
-	    nr_interp_regs++;
-	    reg = brw_vec8_grf(c->reg_index, 0);
-	    for (j = 0; j < 4; j++)
-		set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
-	    c->reg_index += 2;
 
-	}
+    /* fragment shader inputs */
+    for (i = 0; i < VERT_RESULT_MAX; i++) {
+       int fp_input;
+
+       if (i >= VERT_RESULT_VAR0)
+	  fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
+       else if (i <= VERT_RESULT_TEX7)
+	  fp_input = i;
+       else
+	  fp_input = -1;
+
+       if (fp_input >= 0 && inputs & (1 << fp_input)) {
+	  urb_read_length = reg_index;
+	  reg = brw_vec8_grf(reg_index, 0);
+	  for (j = 0; j < 4; j++)
+	     set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
+       }
+       if (c->key.vp_outputs_written & (1 << i)) {
+	  reg_index += 2;
+       }
     }
+
     c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
-    c->prog_data.urb_read_length = nr_interp_regs * 2;
+    c->prog_data.urb_read_length = urb_read_length;
     c->prog_data.curb_read_length = c->nr_creg;
-    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
-    c->reg_index++;
-    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
-    c->reg_index += 2;
+    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
+    reg_index++;
+    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
+    reg_index += 2;
+
+    /* mark GRF regs [0..reg_index-1] as in-use */
+    for (i = 0; i < reg_index; i++)
+       prealloc_grf(c, i);
+
+    /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
+    prealloc_grf(c, 126);
+    prealloc_grf(c, 127);
+
+    for (i = 0; i < c->nr_fp_insns; i++) {
+	const struct prog_instruction *inst = &c->prog_instructions[i];
+	struct brw_reg dst[4];
+
+	switch (inst->Opcode) {
+	case OPCODE_TEX:
+	case OPCODE_TXB:
+	    /* Allocate the channels of texture results contiguously,
+	     * since they are written out that way by the sampler unit.
+	     */
+	    for (j = 0; j < 4; j++) {
+		dst[j] = get_dst_reg(c, inst, j);
+		if (j != 0)
+		    assert(dst[j].nr == dst[j - 1].nr + 1);
+	    }
+	    break;
+	default:
+	    break;
+	}
+    }
+
+    /* An instruction may reference up to three constants.
+     * They'll be found in these registers.
+     * XXX alloc these on demand!
+     */
+    if (c->fp->use_const_buffer) {
+       for (i = 0; i < 3; i++) {
+          c->current_const[i].index = -1;
+          c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
+       }
+    }
+#if 0
+    printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
+    printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
+#endif
+}
+
+
+/**
+ * Check if any of the instruction's src registers are constants, uniforms,
+ * or statevars.  If so, fetch any constants that we don't already have in
+ * the three GRF slots.
+ */
+static void fetch_constants(struct brw_wm_compile *c,
+                            const struct prog_instruction *inst)
+{
+   struct brw_compile *p = &c->func;
+   GLuint i;
+
+   /* loop over instruction src regs */
+   for (i = 0; i < 3; i++) {
+      const struct prog_src_register *src = &inst->SrcReg[i];
+      if (src->File == PROGRAM_STATE_VAR ||
+          src->File == PROGRAM_CONSTANT ||
+          src->File == PROGRAM_UNIFORM) {
+	 c->current_const[i].index = src->Index;
+
+#if 0
+	 printf("  fetch const[%d] for arg %d into reg %d\n",
+		src->Index, i, c->current_const[i].reg.nr);
+#endif
+
+	 /* need to fetch the constant now */
+	 brw_dp_READ_4(p,
+		       c->current_const[i].reg,  /* writeback dest */
+		       src->RelAddr,             /* relative indexing? */
+		       16 * src->Index,          /* byte offset */
+		       SURF_INDEX_FRAG_CONST_BUFFER/* binding table index */
+		       );
+      }
+   }
 }
 
+
+/**
+ * Convert Mesa dst register to brw register.
+ */
 static struct brw_reg get_dst_reg(struct brw_wm_compile *c, 
-	struct prog_instruction *inst, int component, int nr)
+                                  const struct prog_instruction *inst,
+                                  GLuint component)
 {
+    const int nr = 1;
     return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
 	    0, 0);
 }
 
+
+static struct brw_reg
+get_src_reg_const(struct brw_wm_compile *c,
+                  const struct prog_instruction *inst,
+                  GLuint srcRegIndex, GLuint component)
+{
+   /* We should have already fetched the constant from the constant
+    * buffer in fetch_constants().  Now we just have to return a
+    * register description that extracts the needed component and
+    * smears it across all eight vector components.
+    */
+   const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
+   struct brw_reg const_reg;
+
+   assert(component < 4);
+   assert(srcRegIndex < 3);
+   assert(c->current_const[srcRegIndex].index != -1);
+   const_reg = c->current_const[srcRegIndex].reg;
+
+   /* extract desired float from the const_reg, and smear */
+   const_reg = stride(const_reg, 0, 1, 0);
+   const_reg.subnr = component * 4;
+
+   if (src->Negate & (1 << component))
+      const_reg = negate(const_reg);
+   if (src->Abs)
+      const_reg = brw_abs(const_reg);
+
+#if 0
+   printf("  form const[%d].%d for arg %d, reg %d\n",
+          c->current_const[srcRegIndex].index,
+          component,
+          srcRegIndex,
+          const_reg.nr);
+#endif
+
+   return const_reg;
+}
+
+
+/**
+ * Convert Mesa src register to brw register.
+ */
 static struct brw_reg get_src_reg(struct brw_wm_compile *c, 
-	struct prog_src_register *src, int index, int nr)
+                                  const struct prog_instruction *inst,
+                                  GLuint srcRegIndex, GLuint channel)
 {
-    int component = GET_SWZ(src->Swizzle, index);
-    return get_reg(c, src->File, src->Index, component, nr, 
-	    src->NegateBase, src->Abs);
+    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
+    const GLuint nr = 1;
+    const GLuint component = GET_SWZ(src->Swizzle, channel);
+
+    /* Extended swizzle terms */
+    if (component == SWIZZLE_ZERO) {
+       return brw_imm_f(0.0F);
+    }
+    else if (component == SWIZZLE_ONE) {
+       return brw_imm_f(1.0F);
+    }
+
+    if (c->fp->use_const_buffer &&
+        (src->File == PROGRAM_STATE_VAR ||
+         src->File == PROGRAM_CONSTANT ||
+         src->File == PROGRAM_UNIFORM)) {
+       return get_src_reg_const(c, inst, srcRegIndex, component);
+    }
+    else {
+       /* other type of source register */
+       return get_reg(c, src->File, src->Index, component, nr, 
+                      src->Negate, src->Abs);
+    }
 }
 
-/* Subroutines are minimal support for resusable instruction sequences.
-   They are implemented as simply as possible to minimise overhead: there
-   is no explicit support for communication between the caller and callee
-   other than saving the return address in a temporary register, nor is
-   there any automatic local storage.  This implies that great care is
-   required before attempting reentrancy or any kind of nested
-   subroutine invocations. */
+
+/**
+ * Same as \sa get_src_reg() but if the register is a literal, emit
+ * a brw_reg encoding the literal.
+ * Note that a brw instruction only allows one src operand to be a literal.
+ * For instructions with more than one operand, only the second can be a
+ * literal.  This means that we treat some literals as constants/uniforms
+ * (which is why PROGRAM_CONSTANT is checked in fetch_constants()).
+ * 
+ */
+static struct brw_reg get_src_reg_imm(struct brw_wm_compile *c, 
+                                      const struct prog_instruction *inst,
+                                      GLuint srcRegIndex, GLuint channel)
+{
+    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
+    if (src->File == PROGRAM_CONSTANT) {
+       /* a literal */
+       const int component = GET_SWZ(src->Swizzle, channel);
+       const GLfloat *param =
+          c->fp->program.Base.Parameters->ParameterValues[src->Index];
+       GLfloat value = param[component];
+       if (src->Negate & (1 << channel))
+          value = -value;
+       if (src->Abs)
+          value = FABSF(value);
+#if 0
+       printf("  form immed value %f for chan %d\n", value, channel);
+#endif
+       return brw_imm_f(value);
+    }
+    else {
+       return get_src_reg(c, inst, srcRegIndex, channel);
+    }
+}
+
+
+/**
+ * Subroutines are minimal support for reusable instruction sequences.
+ * They are implemented as simply as possible to minimise overhead: there
+ * is no explicit support for communication between the caller and callee
+ * other than saving the return address in a temporary register, nor is
+ * there any automatic local storage.  This implies that great care is
+ * required before attempting reentrancy or any kind of nested
+ * subroutine invocations.
+ */
 static void invoke_subroutine( struct brw_wm_compile *c,
 			       enum _subroutine subroutine,
 			       void (*emit)( struct brw_wm_compile * ) )
@@ -238,25 +650,8 @@ static void invoke_subroutine( struct brw_wm_compile *c,
     }
 }
 
-static void emit_abs( struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    int i;
-    struct brw_compile *p = &c->func;
-    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-    for (i = 0; i < 4; i++) {
-	if (inst->DstReg.WriteMask & (1<<i)) {
-	    struct brw_reg src, dst;
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    brw_MOV(p, dst, brw_abs(src));
-	}
-    }
-    brw_set_saturate(p, 0);
-}
-
 static void emit_trunc( struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                        const struct prog_instruction *inst)
 {
     int i;
     struct brw_compile *p = &c->func;
@@ -265,8 +660,8 @@ static void emit_trunc( struct brw_wm_compile *c,
     for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
 	    struct brw_reg src, dst;
-	    dst = get_dst_reg(c, inst, i, 1) ;
-	    src = get_src_reg(c, &inst->SrcReg[0], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src = get_src_reg(c, inst, 0, i);
 	    brw_RNDZ(p, dst, src);
 	}
     }
@@ -274,7 +669,7 @@ static void emit_trunc( struct brw_wm_compile *c,
 }
 
 static void emit_mov( struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                      const struct prog_instruction *inst)
 {
     int i;
     struct brw_compile *p = &c->func;
@@ -283,8 +678,10 @@ static void emit_mov( struct brw_wm_compile *c,
     for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
 	    struct brw_reg src, dst;
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src = get_src_reg(c, &inst->SrcReg[0], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+            /* XXX some moves from immediate value don't work reliably!!! */
+            /*src = get_src_reg_imm(c, inst, 0, i);*/
+            src = get_src_reg(c, inst, 0, i);
 	    brw_MOV(p, dst, src);
 	}
     }
@@ -292,7 +689,7 @@ static void emit_mov( struct brw_wm_compile *c,
 }
 
 static void emit_pixel_xy(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                          const struct prog_instruction *inst)
 {
     struct brw_reg r1 = brw_vec1_grf(1, 0);
     struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
@@ -301,8 +698,8 @@ static void emit_pixel_xy(struct brw_wm_compile *c,
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
 
-    dst0 = get_dst_reg(c, inst, 0, 1);
-    dst1 = get_dst_reg(c, inst, 1, 1);
+    dst0 = get_dst_reg(c, inst, 0);
+    dst1 = get_dst_reg(c, inst, 1);
     /* Calculate pixel centers by adding 1 or 0 to each of the
      * micro-tile coordinates passed in r1.
      */
@@ -319,21 +716,20 @@ static void emit_pixel_xy(struct brw_wm_compile *c,
 		stride(suboffset(r1_uw, 5), 2, 4, 0),
 		brw_imm_v(0x11001100));
     }
-
 }
 
 static void emit_delta_xy(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                          const struct prog_instruction *inst)
 {
     struct brw_reg r1 = brw_vec1_grf(1, 0);
     struct brw_reg dst0, dst1, src0, src1;
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
 
-    dst0 = get_dst_reg(c, inst, 0, 1);
-    dst1 = get_dst_reg(c, inst, 1, 1);
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    src1 = get_src_reg(c, &inst->SrcReg[0], 1, 1);
+    dst0 = get_dst_reg(c, inst, 0);
+    dst1 = get_dst_reg(c, inst, 1);
+    src0 = get_src_reg(c, inst, 0, 0);
+    src1 = get_src_reg(c, inst, 0, 1);
     /* Calc delta X,Y by subtracting origin in r1 from the pixel
      * centers.
      */
@@ -351,10 +747,8 @@ static void emit_delta_xy(struct brw_wm_compile *c,
 		negate(suboffset(r1,1)));
 
     }
-
 }
 
-
 static void fire_fb_write( struct brw_wm_compile *c,
                            GLuint base_reg,
                            GLuint nr,
@@ -385,7 +779,7 @@ static void fire_fb_write( struct brw_wm_compile *c,
 }
 
 static void emit_fb_write(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                          const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     int nr = 2;
@@ -397,38 +791,63 @@ static void emit_fb_write(struct brw_wm_compile *c,
      */
     if (c->key.aa_dest_stencil_reg)
 	nr += 1;
-    {
-	brw_push_insn_state(p);
-	for (channel = 0; channel < 4; channel++) {
-	    src0 = get_src_reg(c,  &inst->SrcReg[0], channel, 1);
-	    /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
-	    /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
-	    brw_MOV(p, brw_message_reg(nr + channel), src0);
-	}
-	/* skip over the regs populated above: */
-	nr += 8;
-	brw_pop_insn_state(p);
+
+    brw_push_insn_state(p);
+    for (channel = 0; channel < 4; channel++) {
+        src0 = get_src_reg(c,  inst, 0, channel);
+        /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
+        /*  mov (8) m6.0<1>:ud   r29.0<8;8,1>:ud  { Align1 SecHalf } */
+        brw_MOV(p, brw_message_reg(nr + channel), src0);
     }
+    /* skip over the regs populated above: */
+    nr += 8;
+    brw_pop_insn_state(p);
 
-   if (c->key.source_depth_to_render_target)
-   {
-      if (c->key.computes_depth) {
-         src0 = get_src_reg(c, &inst->SrcReg[2], 2, 1);
-         brw_MOV(p, brw_message_reg(nr), src0);
-      } else {
-         src0 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
-         brw_MOV(p, brw_message_reg(nr), src0);
-      }
+    if (c->key.source_depth_to_render_target) {
+       if (c->key.computes_depth) {
+          src0 = get_src_reg(c, inst, 2, 2);
+          brw_MOV(p, brw_message_reg(nr), src0);
+       }
+       else {
+          src0 = get_src_reg(c, inst, 1, 1);
+          brw_MOV(p, brw_message_reg(nr), src0);
+       }
+
+       nr += 2;
+    }
+
+    if (c->key.dest_depth_reg) {
+        const GLuint comp = c->key.dest_depth_reg / 2;
+        const GLuint off = c->key.dest_depth_reg % 2;
+
+        if (off != 0) {
+            /* XXX this code needs review/testing */
+            struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
+            struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
 
-      nr += 2;
+            brw_push_insn_state(p);
+            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+
+            brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
+            /* 2nd half? */
+            brw_MOV(p, brw_message_reg(nr+1), arg1_1);
+            brw_pop_insn_state(p);
+        }
+        else
+        {
+            struct brw_reg src =  get_src_reg(c, inst, 1, 1);
+            brw_MOV(p, brw_message_reg(nr), src);
+        }
+        nr += 2;
    }
-    target = inst->Sampler >> 1;
-    eot = inst->Sampler & 1;
+
+    target = inst->Aux >> 1;
+    eot = inst->Aux & 1;
     fire_fb_write(c, 0, nr, target, eot);
 }
 
 static void emit_pixel_w( struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                          const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -436,10 +855,10 @@ static void emit_pixel_w( struct brw_wm_compile *c,
 	struct brw_reg dst, src0, delta0, delta1;
 	struct brw_reg interp3;
 
-	dst = get_dst_reg(c, inst, 3, 1);
-	src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-	delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
-	delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
+	dst = get_dst_reg(c, inst, 3);
+	src0 = get_src_reg(c, inst, 0, 0);
+	delta0 = get_src_reg(c, inst, 1, 0);
+	delta1 = get_src_reg(c, inst, 1, 1);
 
 	interp3 = brw_vec1_grf(src0.nr+1, 4);
 	/* Calc 1/w - just linterp wpos[3] optimized by putting the
@@ -458,19 +877,19 @@ static void emit_pixel_w( struct brw_wm_compile *c,
 }
 
 static void emit_linterp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                         const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
     struct brw_reg interp[4];
     struct brw_reg dst, delta0, delta1;
     struct brw_reg src0;
+    GLuint nr, i;
 
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
-    delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
-    GLuint nr = src0.nr;
-    int i;
+    src0 = get_src_reg(c, inst, 0, 0);
+    delta0 = get_src_reg(c, inst, 1, 0);
+    delta1 = get_src_reg(c, inst, 1, 1);
+    nr = src0.nr;
 
     interp[0] = brw_vec1_grf(nr, 0);
     interp[1] = brw_vec1_grf(nr, 4);
@@ -479,7 +898,7 @@ static void emit_linterp(struct brw_wm_compile *c,
 
     for(i = 0; i < 4; i++ ) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
+	    dst = get_dst_reg(c, inst, i);
 	    brw_LINE(p, brw_null_reg(), interp[i], delta0);
 	    brw_MAC(p, dst, suboffset(interp[i],1), delta1);
 	}
@@ -487,17 +906,17 @@ static void emit_linterp(struct brw_wm_compile *c,
 }
 
 static void emit_cinterp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                         const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
 
     struct brw_reg interp[4];
     struct brw_reg dst, src0;
+    GLuint nr, i;
 
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    GLuint nr = src0.nr;
-    int i;
+    src0 = get_src_reg(c, inst, 0, 0);
+    nr = src0.nr;
 
     interp[0] = brw_vec1_grf(nr, 0);
     interp[1] = brw_vec1_grf(nr, 4);
@@ -506,14 +925,14 @@ static void emit_cinterp(struct brw_wm_compile *c,
 
     for(i = 0; i < 4; i++ ) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
+	    dst = get_dst_reg(c, inst, i);
 	    brw_MOV(p, dst, suboffset(interp[i],3));
 	}
     }
 }
 
 static void emit_pinterp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                         const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -521,13 +940,13 @@ static void emit_pinterp(struct brw_wm_compile *c,
     struct brw_reg interp[4];
     struct brw_reg dst, delta0, delta1;
     struct brw_reg src0, w;
+    GLuint nr, i;
 
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    delta0 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
-    delta1 = get_src_reg(c, &inst->SrcReg[1], 1, 1);
-    w = get_src_reg(c, &inst->SrcReg[2], 3, 1);
-    GLuint nr = src0.nr;
-    int i;
+    src0 = get_src_reg(c, inst, 0, 0);
+    delta0 = get_src_reg(c, inst, 1, 0);
+    delta1 = get_src_reg(c, inst, 1, 1);
+    w = get_src_reg(c, inst, 2, 3);
+    nr = src0.nr;
 
     interp[0] = brw_vec1_grf(nr, 0);
     interp[1] = brw_vec1_grf(nr, 4);
@@ -536,7 +955,7 @@ static void emit_pinterp(struct brw_wm_compile *c,
 
     for(i = 0; i < 4; i++ ) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
+	    dst = get_dst_reg(c, inst, i);
 	    brw_LINE(p, brw_null_reg(), interp[i], delta0);
 	    brw_MAC(p, dst, suboffset(interp[i],1), 
 		    delta1);
@@ -545,8 +964,38 @@ static void emit_pinterp(struct brw_wm_compile *c,
     }
 }
 
+/* Sets the destination channels to 1.0 or 0.0 according to gl_FrontFacing. */
+static void emit_frontfacing(struct brw_wm_compile *c,
+			     const struct prog_instruction *inst)
+{
+    struct brw_compile *p = &c->func;
+    struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
+    struct brw_reg dst;
+    GLuint mask = inst->DstReg.WriteMask;
+    int i;
+
+    for (i = 0; i < 4; i++) {
+	if (mask & (1<<i)) {
+	    dst = get_dst_reg(c, inst, i);
+	    brw_MOV(p, dst, brw_imm_f(0.0));
+	}
+    }
+
+    /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
+     * us front face
+     */
+    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, r1_6ud, brw_imm_ud(1 << 31));
+    for (i = 0; i < 4; i++) {
+	if (mask & (1<<i)) {
+	    dst = get_dst_reg(c, inst, i);
+	    brw_MOV(p, dst, brw_imm_f(1.0));
+	}
+    }
+    brw_set_predicate_control_flag_value(p, 0xff);
+}
+
 static void emit_xpd(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     int i;
     struct brw_compile *p = &c->func;
@@ -556,12 +1005,12 @@ static void emit_xpd(struct brw_wm_compile *c,
 	GLuint i1 = (i+1)%3;
 	if (mask & (1<<i)) {
 	    struct brw_reg src0, src1, dst;
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = negate(get_src_reg(c, &inst->SrcReg[0], i2, 1));
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i1, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = negate(get_src_reg(c, inst, 0, i2));
+	    src1 = get_src_reg_imm(c, inst, 1, i1);
 	    brw_MUL(p, brw_null_reg(), src0, src1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i1, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i2, 1);
+	    src0 = get_src_reg(c, inst, 0, i1);
+	    src1 = get_src_reg_imm(c, inst, 1, i2);
 	    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
 	    brw_MAC(p, dst, src0, src1);
 	    brw_set_saturate(p, 0);
@@ -571,17 +1020,25 @@ static void emit_xpd(struct brw_wm_compile *c,
 }
 
 static void emit_dp3(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_reg src0[3], src1[3], dst;
     int i;
     struct brw_compile *p = &c->func;
+    GLuint mask = inst->DstReg.WriteMask;
+    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
+    if (!(mask & WRITEMASK_XYZW))
+	return;
+
+    assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
     for (i = 0; i < 3; i++) {
-	src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
+	src0[i] = get_src_reg(c, inst, 0, i);
+	src1[i] = get_src_reg_imm(c, inst, 1, i);
     }
 
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+    dst = get_dst_reg(c, inst, dst_chan);
     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
@@ -590,16 +1047,24 @@ static void emit_dp3(struct brw_wm_compile *c,
 }
 
 static void emit_dp4(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_reg src0[4], src1[4], dst;
     int i;
     struct brw_compile *p = &c->func;
+    GLuint mask = inst->DstReg.WriteMask;
+    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
+    if (!(mask & WRITEMASK_XYZW))
+	return;
+
+    assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
     for (i = 0; i < 4; i++) {
-	src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
+	src0[i] = get_src_reg(c, inst, 0, i);
+	src1[i] = get_src_reg_imm(c, inst, 1, i);
     }
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+    dst = get_dst_reg(c, inst, dst_chan);
     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
     brw_MAC(p, brw_null_reg(), src0[2], src1[2]);
@@ -609,16 +1074,24 @@ static void emit_dp4(struct brw_wm_compile *c,
 }
 
 static void emit_dph(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_reg src0[4], src1[4], dst;
     int i;
     struct brw_compile *p = &c->func;
+    GLuint mask = inst->DstReg.WriteMask;
+    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
+    if (!(mask & WRITEMASK_XYZW))
+	return;
+
+    assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
     for (i = 0; i < 4; i++) {
-	src0[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	src1[i] = get_src_reg(c, &inst->SrcReg[1], i, 1);
+	src0[i] = get_src_reg(c, inst, 0, i);
+	src1[i] = get_src_reg_imm(c, inst, 1, i);
     }
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
+    dst = get_dst_reg(c, inst, dst_chan);
     brw_MUL(p, brw_null_reg(), src0[0], src1[0]);
     brw_MAC(p, brw_null_reg(), src0[1], src1[1]);
     brw_MAC(p, dst, src0[2], src1[2]);
@@ -627,63 +1100,77 @@ static void emit_dph(struct brw_wm_compile *c,
     brw_set_saturate(p, 0);
 }
 
+/**
+ * Emit a scalar instruction, like RCP, RSQ, LOG, EXP.
+ * Note that the result of the function is smeared across the dest
+ * register's X, Y, Z and W channels (subject to writemasking of course).
+ */
 static void emit_math1(struct brw_wm_compile *c,
-		struct prog_instruction *inst, GLuint func)
+                       const struct prog_instruction *inst, GLuint func)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, dst;
+    GLuint mask = inst->DstReg.WriteMask;
+    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
+    if (!(mask & WRITEMASK_XYZW))
+	return;
+
+    assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
+    /* Get first component of source register */
+    dst = get_dst_reg(c, inst, dst_chan);
+    src0 = get_src_reg(c, inst, 0, 0);
 
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
     brw_MOV(p, brw_message_reg(2), src0);
     brw_math(p,
-	    dst,
-	    func,
-	    (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
-	    2,
-	    brw_null_reg(),
-	    BRW_MATH_DATA_VECTOR,
-	    BRW_MATH_PRECISION_FULL);
+             dst,
+             func,
+             (inst->SaturateMode != SATURATE_OFF) ? BRW_MATH_SATURATE_SATURATE : BRW_MATH_SATURATE_NONE,
+             2,
+             brw_null_reg(),
+             BRW_MATH_DATA_VECTOR,
+             BRW_MATH_PRECISION_FULL);
 }
 
 static void emit_rcp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_INV);
 }
 
 static void emit_rsq(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ);
 }
 
 static void emit_sin(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_SIN);
 }
 
 static void emit_cos(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_COS);
 }
 
 static void emit_ex2(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_EXP);
 }
 
 static void emit_lg2(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_math1(c, inst, BRW_MATH_FUNCTION_LOG);
 }
 
 static void emit_add(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, dst;
@@ -692,36 +1179,31 @@ static void emit_add(struct brw_wm_compile *c,
     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = get_src_reg(c, inst, 0, i);
+	    src1 = get_src_reg_imm(c, inst, 1, i);
 	    brw_ADD(p, dst, src0, src1);
 	}
     }
     brw_set_saturate(p, 0);
 }
 
-static void emit_sub(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+static void emit_arl(struct brw_wm_compile *c,
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
-    struct brw_reg src0, src1, dst;
-    GLuint mask = inst->DstReg.WriteMask;
-    int i;
+    struct brw_reg src0, addr_reg;
     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    for (i = 0 ; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
-	    brw_ADD(p, dst, src0, negate(src1));
-	}
-    }
+    addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE, 
+                           BRW_ARF_ADDRESS, 0);
+    src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
+    brw_MOV(p, addr_reg, src0);
     brw_set_saturate(p, 0);
 }
 
+
 static void emit_mul(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, dst;
@@ -730,9 +1212,9 @@ static void emit_mul(struct brw_wm_compile *c,
     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = get_src_reg(c, inst, 0, i);
+	    src1 = get_src_reg_imm(c, inst, 1, i);
 	    brw_MUL(p, dst, src0, src1);
 	}
     }
@@ -740,7 +1222,7 @@ static void emit_mul(struct brw_wm_compile *c,
 }
 
 static void emit_frc(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, dst;
@@ -749,8 +1231,8 @@ static void emit_frc(struct brw_wm_compile *c,
     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = get_src_reg_imm(c, inst, 0, i);
 	    brw_FRC(p, dst, src0);
 	}
     }
@@ -759,7 +1241,7 @@ static void emit_frc(struct brw_wm_compile *c,
 }
 
 static void emit_flr(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, dst;
@@ -768,78 +1250,79 @@ static void emit_flr(struct brw_wm_compile *c,
     brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = get_src_reg_imm(c, inst, 0, i);
 	    brw_RNDD(p, dst, src0);
 	}
     }
     brw_set_saturate(p, 0);
 }
 
-static void emit_max(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    struct brw_reg src0, src1, dst;
-    int i;
-    brw_push_insn_state(p);
-    for (i = 0; i < 4; i++) {
-	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_MOV(p, dst, src0);
-	    brw_set_saturate(p, 0);
-
-	    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1);
-	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-	    brw_MOV(p, dst, src1);
-	    brw_set_saturate(p, 0);
-	    brw_set_predicate_control_flag_value(p, 0xff);
-	}
-    }
-    brw_pop_insn_state(p);
-}
 
-static void emit_min(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+static void emit_min_max(struct brw_wm_compile *c,
+                         const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    struct brw_reg src0, src1, dst;
+    const GLuint mask = inst->DstReg.WriteMask;
+    const int mark = mark_tmps(c);
     int i;
     brw_push_insn_state(p);
     for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+            struct brw_reg real_dst = get_dst_reg(c, inst, i);
+	    struct brw_reg src0 = get_src_reg(c, inst, 0, i);
+	    struct brw_reg src1 = get_src_reg(c, inst, 1, i);
+            struct brw_reg dst;
+            /* if real_dst aliases src0 or src1, build the result in a temp reg */
+            GLboolean use_temp = brw_same_reg(real_dst, src0) ||
+                                 brw_same_reg(real_dst, src1);
+            if (use_temp)
+               dst = alloc_tmp(c);
+            else
+               dst = real_dst;
+
+            /*
+             * Write src0 to dst, then overwrite it with src1 via a predicated
+             * MOV when the CMP finds src1 smaller (MIN) or larger (MAX).
+             */
 	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
 	    brw_MOV(p, dst, src0);
 	    brw_set_saturate(p, 0);
 
-	    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
+            if (inst->Opcode == OPCODE_MIN)
+               brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0);
+            else
+               brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, src1, src0);
+
 	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
 	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 	    brw_MOV(p, dst, src1);
 	    brw_set_saturate(p, 0);
 	    brw_set_predicate_control_flag_value(p, 0xff);
+            if (use_temp)
+               brw_MOV(p, real_dst, dst);
 	}
     }
     brw_pop_insn_state(p);
+    release_tmps(c, mark);
 }
 
 static void emit_pow(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg dst, src0, src1;
-    dst = get_dst_reg(c, inst, get_scalar_dst_index(inst), 1);
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    src1 = get_src_reg(c, &inst->SrcReg[1], 0, 1);
+    GLuint mask = inst->DstReg.WriteMask;
+    int dst_chan = _mesa_ffs(mask & WRITEMASK_XYZW) - 1;
+
+    if (!(mask & WRITEMASK_XYZW))
+	return;
+
+    assert(is_power_of_two(mask & WRITEMASK_XYZW));
+
+    dst = get_dst_reg(c, inst, dst_chan);
+    src0 = get_src_reg_imm(c, inst, 0, 0);
+    src1 = get_src_reg_imm(c, inst, 1, 0);
 
     brw_MOV(p, brw_message_reg(2), src0);
     brw_MOV(p, brw_message_reg(3), src1);
@@ -855,7 +1338,7 @@ static void emit_pow(struct brw_wm_compile *c,
 }
 
 static void emit_lrp(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -864,10 +1347,10 @@ static void emit_lrp(struct brw_wm_compile *c,
     int mark = mark_tmps(c);
     for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = get_src_reg(c, inst, 0, i);
 
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+	    src1 = get_src_reg_imm(c, inst, 1, i);
 
 	    if (src1.nr == dst.nr) {
 		tmp1 = alloc_tmp(c);
@@ -875,7 +1358,7 @@ static void emit_lrp(struct brw_wm_compile *c,
 	    } else
 		tmp1 = src1;
 
-	    src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
+	    src2 = get_src_reg(c, inst, 2, i);
 	    if (src2.nr == dst.nr) {
 		tmp2 = alloc_tmp(c);
 		brw_MOV(p, tmp2, src2);
@@ -908,7 +1391,7 @@ static void emit_kil(struct brw_wm_compile *c)
 }
 
 static void emit_mad(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -917,10 +1400,10 @@ static void emit_mad(struct brw_wm_compile *c,
 
     for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
-	    src2 = get_src_reg(c, &inst->SrcReg[2], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = get_src_reg(c, inst, 0, i);
+	    src1 = get_src_reg_imm(c, inst, 1, i);
+	    src2 = get_src_reg_imm(c, inst, 2, i);
 	    brw_MUL(p, dst, src0, src1);
 
 	    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
@@ -931,7 +1414,7 @@ static void emit_mad(struct brw_wm_compile *c,
 }
 
 static void emit_sop(struct brw_wm_compile *c,
-		struct prog_instruction *inst, GLuint cond)
+                     const struct prog_instruction *inst, GLuint cond)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
@@ -940,9 +1423,9 @@ static void emit_sop(struct brw_wm_compile *c,
 
     for (i = 0; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
-	    src0 = get_src_reg(c, &inst->SrcReg[0], i, 1);
-	    src1 = get_src_reg(c, &inst->SrcReg[1], i, 1);
+	    dst = get_dst_reg(c, inst, i);
+	    src0 = get_src_reg(c, inst, 0, i);
+	    src1 = get_src_reg_imm(c, inst, 1, i);
 	    brw_push_insn_state(p);
 	    brw_CMP(p, brw_null_reg(), cond, src0, src1);
 	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
@@ -955,113 +1438,58 @@ static void emit_sop(struct brw_wm_compile *c,
 }
 
 static void emit_slt(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_L);
 }
 
 static void emit_sle(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_LE);
 }
 
 static void emit_sgt(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_G);
 }
 
 static void emit_sge(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_GE);
 }
 
 static void emit_seq(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_EQ);
 }
 
 static void emit_sne(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     emit_sop(c, inst, BRW_CONDITIONAL_NEQ);
 }
 
-static void emit_ddx(struct brw_wm_compile *c,
-                struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    struct brw_reg interp[4];
-    struct brw_reg dst;
-    struct brw_reg src0, w;
-    GLuint nr, i;
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
-    nr = src0.nr;
-    interp[0] = brw_vec1_grf(nr, 0);
-    interp[1] = brw_vec1_grf(nr, 4);
-    interp[2] = brw_vec1_grf(nr+1, 0);
-    interp[3] = brw_vec1_grf(nr+1, 4);
-    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-    for(i = 0; i < 4; i++ ) {
-        if (mask & (1<<i)) {
-            dst = get_dst_reg(c, inst, i, 1);
-            brw_MOV(p, dst, interp[i]);
-            brw_MUL(p, dst, dst, w);
-        }
-    }
-    brw_set_saturate(p, 0);
-}
-
-static void emit_ddy(struct brw_wm_compile *c,
-                struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    GLuint mask = inst->DstReg.WriteMask;
-    struct brw_reg interp[4];
-    struct brw_reg dst;
-    struct brw_reg src0, w;
-    GLuint nr, i;
-
-    src0 = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    nr = src0.nr;
-    w = get_src_reg(c, &inst->SrcReg[1], 3, 1);
-    interp[0] = brw_vec1_grf(nr, 0);
-    interp[1] = brw_vec1_grf(nr, 4);
-    interp[2] = brw_vec1_grf(nr+1, 0);
-    interp[3] = brw_vec1_grf(nr+1, 4);
-    brw_set_saturate(p, inst->SaturateMode != SATURATE_OFF);
-    for(i = 0; i < 4; i++ ) {
-        if (mask & (1<<i)) {
-            dst = get_dst_reg(c, inst, i, 1);
-            brw_MOV(p, dst, suboffset(interp[i], 1));
-            brw_MUL(p, dst, dst, w);
-        }
-    }
-    brw_set_saturate(p, 0);
-}
-
-static __inline struct brw_reg high_words( struct brw_reg reg )
+static INLINE struct brw_reg high_words( struct brw_reg reg )
 {
     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
 		   0, 8, 2 );
 }
 
-static __inline struct brw_reg low_words( struct brw_reg reg )
+static INLINE struct brw_reg low_words( struct brw_reg reg )
 {
     return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
 }
 
-static __inline struct brw_reg even_bytes( struct brw_reg reg )
+static INLINE struct brw_reg even_bytes( struct brw_reg reg )
 {
     return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
 }
 
-static __inline struct brw_reg odd_bytes( struct brw_reg reg )
+static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
 {
     return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
 		   0, 16, 2 );
@@ -1161,7 +1589,7 @@ static void noise1_sub( struct brw_wm_compile *c ) {
 }
 
 static void emit_noise1( struct brw_wm_compile *c,
-			 struct prog_instruction *inst )
+			 const struct prog_instruction *inst )
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src, param, dst;
@@ -1171,7 +1599,7 @@ static void emit_noise1( struct brw_wm_compile *c,
 
     assert( mark == 0 );
     
-    src = get_src_reg( c, inst->SrcReg, 0, 1 );
+    src = get_src_reg( c, inst, 0, 0 );
 
     param = alloc_tmp( c );
 
@@ -1183,7 +1611,7 @@ static void emit_noise1( struct brw_wm_compile *c,
     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
+	    dst = get_dst_reg(c, inst, i);
 	    brw_MOV( p, dst, param );
 	}
     }
@@ -1331,7 +1759,7 @@ static void noise2_sub( struct brw_wm_compile *c ) {
 }
 
 static void emit_noise2( struct brw_wm_compile *c,
-			 struct prog_instruction *inst )
+			 const struct prog_instruction *inst )
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, param0, param1, dst;
@@ -1341,8 +1769,8 @@ static void emit_noise2( struct brw_wm_compile *c,
 
     assert( mark == 0 );
     
-    src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
-    src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
+    src0 = get_src_reg( c, inst, 0, 0 );
+    src1 = get_src_reg( c, inst, 0, 1 );
 
     param0 = alloc_tmp( c );
     param1 = alloc_tmp( c );
@@ -1356,7 +1784,7 @@ static void emit_noise2( struct brw_wm_compile *c,
     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
+	    dst = get_dst_reg(c, inst, i);
 	    brw_MOV( p, dst, param0 );
 	}
     }
@@ -1366,9 +1794,11 @@ static void emit_noise2( struct brw_wm_compile *c,
     release_tmps( c, mark );
 }
 
-/* The three-dimensional case is much like the one- and two- versions above,
-   but since the number of corners is rapidly growing we now pack 16 16-bit
-   hashes into each register to extract more parallelism from the EUs. */
+/**
+ * The three-dimensional case is much like the one- and two- versions above,
+ * but since the number of corners is rapidly growing we now pack 16 16-bit
+ * hashes into each register to extract more parallelism from the EUs.
+ */
 static void noise3_sub( struct brw_wm_compile *c ) {
 
     struct brw_compile *p = &c->func;
@@ -1632,7 +2062,7 @@ static void noise3_sub( struct brw_wm_compile *c ) {
 }
 
 static void emit_noise3( struct brw_wm_compile *c,
-			 struct prog_instruction *inst )
+			 const struct prog_instruction *inst )
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, src2, param0, param1, param2, dst;
@@ -1642,9 +2072,9 @@ static void emit_noise3( struct brw_wm_compile *c,
 
     assert( mark == 0 );
     
-    src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
-    src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
-    src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
+    src0 = get_src_reg( c, inst, 0, 0 );
+    src1 = get_src_reg( c, inst, 0, 1 );
+    src2 = get_src_reg( c, inst, 0, 2 );
 
     param0 = alloc_tmp( c );
     param1 = alloc_tmp( c );
@@ -1660,7 +2090,7 @@ static void emit_noise3( struct brw_wm_compile *c,
     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
+	    dst = get_dst_reg(c, inst, i);
 	    brw_MOV( p, dst, param0 );
 	}
     }
@@ -1670,13 +2100,15 @@ static void emit_noise3( struct brw_wm_compile *c,
     release_tmps( c, mark );
 }
     
-/* For the four-dimensional case, the little micro-optimisation benefits
-   we obtain by unrolling all the loops aren't worth the massive bloat it
-   now causes.  Instead, we loop twice around performing a similar operation
-   to noise3, once for the w=0 cube and once for the w=1, with a bit more
-   code to glue it all together. */
-static void noise4_sub( struct brw_wm_compile *c ) {
-
+/**
+ * For the four-dimensional case, the little micro-optimisation benefits
+ * we obtain by unrolling all the loops aren't worth the massive bloat it
+ * now causes.  Instead, we loop twice around performing a similar operation
+ * to noise3, once for the w=0 cube and once for the w=1, with a bit more
+ * code to glue it all together.
+ */
+static void noise4_sub( struct brw_wm_compile *c )
+{
     struct brw_compile *p = &c->func;
     struct brw_reg param[ 4 ],
 	x0y0, x0y1, x1y0, x1y1, /* gradients at four of the corners */
@@ -2053,7 +2485,7 @@ static void noise4_sub( struct brw_wm_compile *c ) {
 }
 
 static void emit_noise4( struct brw_wm_compile *c,
-			 struct prog_instruction *inst )
+			 const struct prog_instruction *inst )
 {
     struct brw_compile *p = &c->func;
     struct brw_reg src0, src1, src2, src3, param0, param1, param2, param3, dst;
@@ -2063,10 +2495,10 @@ static void emit_noise4( struct brw_wm_compile *c,
 
     assert( mark == 0 );
     
-    src0 = get_src_reg( c, inst->SrcReg, 0, 1 );
-    src1 = get_src_reg( c, inst->SrcReg, 1, 1 );
-    src2 = get_src_reg( c, inst->SrcReg, 2, 1 );
-    src3 = get_src_reg( c, inst->SrcReg, 3, 1 );
+    src0 = get_src_reg( c, inst, 0, 0 );
+    src1 = get_src_reg( c, inst, 0, 1 );
+    src2 = get_src_reg( c, inst, 0, 2 );
+    src3 = get_src_reg( c, inst, 0, 3 );
 
     param0 = alloc_tmp( c );
     param1 = alloc_tmp( c );
@@ -2084,7 +2516,7 @@ static void emit_noise4( struct brw_wm_compile *c,
     brw_set_saturate( p, inst->SaturateMode == SATURATE_ZERO_ONE );
     for (i = 0 ; i < 4; i++) {
 	if (mask & (1<<i)) {
-	    dst = get_dst_reg(c, inst, i, 1);
+	    dst = get_dst_reg(c, inst, i);
 	    brw_MOV( p, dst, param0 );
 	}
     }
@@ -2095,17 +2527,17 @@ static void emit_noise4( struct brw_wm_compile *c,
 }
     
 static void emit_wpos_xy(struct brw_wm_compile *c,
-                struct prog_instruction *inst)
+                         const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     GLuint mask = inst->DstReg.WriteMask;
     struct brw_reg src0[2], dst[2];
 
-    dst[0] = get_dst_reg(c, inst, 0, 1);
-    dst[1] = get_dst_reg(c, inst, 1, 1);
+    dst[0] = get_dst_reg(c, inst, 0);
+    dst[1] = get_dst_reg(c, inst, 1);
 
-    src0[0] = get_src_reg(c, &inst->SrcReg[0], 0, 1);
-    src0[1] = get_src_reg(c, &inst->SrcReg[0], 1, 1);
+    src0[0] = get_src_reg(c, inst, 0, 0);
+    src0[1] = get_src_reg(c, inst, 0, 1);
 
     /* Calculate the pixel offset from window bottom left into destination
      * X and Y channels.
@@ -2128,27 +2560,32 @@ static void emit_wpos_xy(struct brw_wm_compile *c,
 }
 
 /* TODO
-   BIAS on SIMD8 not workind yet...
+   BIAS on SIMD8 not working yet...
  */	
 static void emit_txb(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg dst[4], src[4], payload_reg;
-    GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
-
+    /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
+    const GLuint unit = inst->TexSrcUnit;
     GLuint i;
+    GLuint msg_type;
+
+    assert(unit < BRW_MAX_TEX_UNIT);
+
     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
+
     for (i = 0; i < 4; i++) 
-	dst[i] = get_dst_reg(c, inst, i, 1);
+	dst[i] = get_dst_reg(c, inst, i);
     for (i = 0; i < 4; i++)
-	src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
+	src[i] = get_src_reg(c, inst, 0, i);
 
     switch (inst->TexSrcTarget) {
 	case TEXTURE_1D_INDEX:
-	    brw_MOV(p, brw_message_reg(2), src[0]);
-	    brw_MOV(p, brw_message_reg(3), brw_imm_f(0));
-	    brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
+	    brw_MOV(p, brw_message_reg(2), src[0]);         /* s coord */
+	    brw_MOV(p, brw_message_reg(3), brw_imm_f(0));   /* t coord */
+	    brw_MOV(p, brw_message_reg(4), brw_imm_f(0));   /* r coord */
 	    break;
 	case TEXTURE_2D_INDEX:
 	case TEXTURE_RECT_INDEX:
@@ -2156,46 +2593,63 @@ static void emit_txb(struct brw_wm_compile *c,
 	    brw_MOV(p, brw_message_reg(3), src[1]);
 	    brw_MOV(p, brw_message_reg(4), brw_imm_f(0));
 	    break;
-	default:
+	case TEXTURE_3D_INDEX:
+	case TEXTURE_CUBE_INDEX:
 	    brw_MOV(p, brw_message_reg(2), src[0]);
 	    brw_MOV(p, brw_message_reg(3), src[1]);
 	    brw_MOV(p, brw_message_reg(4), src[2]);
 	    break;
+	default:
+            /* invalid target */
+            abort();
     }
-    brw_MOV(p, brw_message_reg(5), src[3]);
-    brw_MOV(p, brw_message_reg(6), brw_imm_f(0));
+    brw_MOV(p, brw_message_reg(5), src[3]);          /* bias */
+    brw_MOV(p, brw_message_reg(6), brw_imm_f(0));    /* ref (unused?) */
+
+    if (BRW_IS_IGDNG(p->brw)) {
+        msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
+    } else {
+        /* Does it work well on SIMD8? */
+        msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+    }
+
     brw_SAMPLE(p,
-	    retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
-	    1,
-	    retype(payload_reg, BRW_REGISTER_TYPE_UW),
-	    unit + MAX_DRAW_BUFFERS, /* surface */
-	    unit,     /* sampler */
-	    inst->DstReg.WriteMask,
-	    BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
-	    4,
-	    4,
-	    0);
+               retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),  /* dest */
+               1,                                           /* msg_reg_nr */
+               retype(payload_reg, BRW_REGISTER_TYPE_UW),   /* src0 */
+               SURF_INDEX_TEXTURE(unit),
+               unit,                                        /* sampler */
+               inst->DstReg.WriteMask,                      /* writemask */
+               msg_type,                                    /* msg_type */
+               4,                                           /* response_length */
+               4,                                           /* msg_length */
+               0,                                           /* eot */
+               1,
+               BRW_SAMPLER_SIMD_MODE_SIMD8);	
 }
 
+
 static void emit_tex(struct brw_wm_compile *c,
-		struct prog_instruction *inst)
+                     const struct prog_instruction *inst)
 {
     struct brw_compile *p = &c->func;
     struct brw_reg dst[4], src[4], payload_reg;
-    GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
-
+    /* Note: TexSrcUnit was already looked up through SamplerTextures[] */
+    const GLuint unit = inst->TexSrcUnit;
     GLuint msg_len;
     GLuint i, nr;
     GLuint emit;
     GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
+    GLuint msg_type;
+
+    assert(unit < BRW_MAX_TEX_UNIT);
 
     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
 
     for (i = 0; i < 4; i++) 
-	dst[i] = get_dst_reg(c, inst, i, 1);
+	dst[i] = get_dst_reg(c, inst, i);
     for (i = 0; i < 4; i++)
-	src[i] = get_src_reg(c, &inst->SrcReg[0], i, 1);
-
+	src[i] = get_src_reg(c, inst, 0, i);
 
     switch (inst->TexSrcTarget) {
 	case TEXTURE_1D_INDEX:
@@ -2207,13 +2661,18 @@ static void emit_tex(struct brw_wm_compile *c,
 	    emit = WRITEMASK_XY;
 	    nr = 2;
 	    break;
-	default:
+	case TEXTURE_3D_INDEX:
+	case TEXTURE_CUBE_INDEX:
 	    emit = WRITEMASK_XYZ;
 	    nr = 3;
 	    break;
+	default:
+           /* invalid target */
+           abort();
     }
     msg_len = 1;
 
+    /* move/load S, T, R coords */
     for (i = 0; i < nr; i++) {
 	static const GLuint swz[4] = {0,1,2,2};
 	if (emit & (1<<i))
@@ -2224,77 +2683,103 @@ static void emit_tex(struct brw_wm_compile *c,
     }
 
     if (shadow) {
-	brw_MOV(p, brw_message_reg(5), brw_imm_f(0));
-	brw_MOV(p, brw_message_reg(6), src[2]);
+       brw_MOV(p, brw_message_reg(5), brw_imm_f(0));  /* lod / bias */
+       brw_MOV(p, brw_message_reg(6), src[2]);        /* ref value / R coord */
     }
 
+    if (BRW_IS_IGDNG(p->brw)) {
+        if (shadow)
+            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
+        else
+            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
+    } else {
+        /* Does it work for shadow on SIMD8 ? */
+        msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+    }
+    
     brw_SAMPLE(p,
-	    retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),
-	    1,
-	    retype(payload_reg, BRW_REGISTER_TYPE_UW),
-	    unit + MAX_DRAW_BUFFERS, /* surface */
-	    unit,     /* sampler */
-	    inst->DstReg.WriteMask,
-	    BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,
-	    4,
-	    shadow ? 6 : 4,
-	    0);
+               retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
+               1,                                          /* msg_reg_nr */
+               retype(payload_reg, BRW_REGISTER_TYPE_UW),  /* src0 */
+               SURF_INDEX_TEXTURE(unit),
+               unit,                                       /* sampler */
+               inst->DstReg.WriteMask,                     /* writemask */
+               msg_type,                                   /* msg_type */
+               4,                                          /* response_length */
+               shadow ? 6 : 4,                             /* msg_length */
+               0,                                          /* eot */
+               1,
+               BRW_SAMPLER_SIMD_MODE_SIMD8);	
 
     if (shadow)
 	brw_MOV(p, dst[3], brw_imm_f(1.0));
 }
 
+
+/**
+ * Resolve subroutine calls after code emit is done.
+ */
 static void post_wm_emit( struct brw_wm_compile *c )
 {
-    GLuint nr_insns = c->fp->program.Base.NumInstructions;
-    GLuint insn, target_insn;
-    struct prog_instruction *inst1, *inst2;
-    struct brw_instruction *brw_inst1, *brw_inst2;
-    int offset;
-    for (insn = 0; insn < nr_insns; insn++) {
-	inst1 = &c->fp->program.Base.Instructions[insn];
-	brw_inst1 = inst1->Data;
-	switch (inst1->Opcode) {
-	    case OPCODE_CAL:
-		target_insn = inst1->BranchTarget;
-		inst2 = &c->fp->program.Base.Instructions[target_insn];
-		brw_inst2 = inst2->Data;
-		offset = brw_inst2 - brw_inst1;
-		brw_set_src1(brw_inst1, brw_imm_d(offset*16));
-		break;
-	    default:
-		break;
-	}
+    brw_resolve_cals(&c->func);
+}
+
+static void
+get_argument_regs(struct brw_wm_compile *c,
+		  const struct prog_instruction *inst,
+		  int index,
+		  struct brw_reg *regs,
+		  int mask)
+{
+    int i;
+
+    for (i = 0; i < 4; i++) {
+	if (mask & (1 << i))
+	    regs[i] = get_src_reg(c, inst, index, i);
     }
 }
 
 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 {
-#define MAX_IFSN 32
+#define MAX_IF_DEPTH 32
 #define MAX_LOOP_DEPTH 32
-    struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
-    struct brw_instruction *inst0, *inst1;
-    int i, if_insn = 0, loop_insn = 0;
+    struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
+    GLuint i, if_depth = 0, loop_depth = 0;
     struct brw_compile *p = &c->func;
     struct brw_indirect stack_index = brw_indirect(0, 0);
 
-    c->reg_index = 0;
+    c->out_of_regs = GL_FALSE;
+
     prealloc_reg(c);
     brw_set_compression_control(p, BRW_COMPRESSION_NONE);
     brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
 
     for (i = 0; i < c->nr_fp_insns; i++) {
-	struct prog_instruction *inst = &c->prog_instructions[i];
-	struct prog_instruction *orig_inst;
+        const struct prog_instruction *inst = &c->prog_instructions[i];
+	int dst_flags;
+	struct brw_reg args[3][4], dst[4];
+	int j;
 
-	if ((orig_inst = inst->Data) != 0)
-	    orig_inst->Data = current_insn(p);
+        c->cur_inst = i;
+
+#if 0
+        _mesa_printf("Inst %d: ", i);
+        _mesa_print_instruction(inst);
+#endif
+
+        /* fetch any constants that this instruction needs */
+        if (c->fp->use_const_buffer)
+           fetch_constants(c, inst);
 
 	if (inst->CondUpdate)
 	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
 	else
 	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
 
+	dst_flags = inst->DstReg.WriteMask;
+	if (inst->SaturateMode == SATURATE_ZERO_ONE)
+	    dst_flags |= SATURATE;
+
 	switch (inst->Opcode) {
 	    case WM_PIXELXY:
 		emit_pixel_xy(c, inst);
@@ -2320,14 +2805,14 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 	    case WM_FB_WRITE:
 		emit_fb_write(c, inst);
 		break;
-	    case OPCODE_ABS:
-		emit_abs(c, inst);
+	    case WM_FRONTFACING:
+		emit_frontfacing(c, inst);
 		break;
 	    case OPCODE_ADD:
 		emit_add(c, inst);
 		break;
-	    case OPCODE_SUB:
-		emit_sub(c, inst);
+	    case OPCODE_ARL:
+		emit_arl(c, inst);
 		break;
 	    case OPCODE_FRC:
 		emit_frc(c, inst);
@@ -2342,6 +2827,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		emit_trunc(c, inst);
 		break;
 	    case OPCODE_MOV:
+	    case OPCODE_SWZ:
 		emit_mov(c, inst);
 		break;
 	    case OPCODE_DP3:
@@ -2374,17 +2860,21 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 	    case OPCODE_LG2:
 		emit_lg2(c, inst);
 		break;
-	    case OPCODE_MAX:	
-		emit_max(c, inst);
-		break;
 	    case OPCODE_MIN:	
-		emit_min(c, inst);
+	    case OPCODE_MAX:	
+		emit_min_max(c, inst);
 		break;
 	    case OPCODE_DDX:
-		emit_ddx(c, inst);
-		break;
 	    case OPCODE_DDY:
-                emit_ddy(c, inst);
+		for (j = 0; j < 4; j++) {
+		    if (inst->DstReg.WriteMask & (1 << j))
+			dst[j] = get_dst_reg(c, inst, j);
+		    else
+			dst[j] = brw_null_reg();
+		}
+		get_argument_regs(c, inst, 0, args[0], WRITEMASK_XYZW);
+		emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
+			  args[0]);
                 break;
 	    case OPCODE_SLT:
 		emit_slt(c, inst);
@@ -2435,18 +2925,21 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		emit_kil(c);
 		break;
 	    case OPCODE_IF:
-		assert(if_insn < MAX_IFSN);
-		if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
+		assert(if_depth < MAX_IF_DEPTH);
+		if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
 		break;
 	    case OPCODE_ELSE:
-		if_inst[if_insn-1]  = brw_ELSE(p, if_inst[if_insn-1]);
+		if_inst[if_depth-1]  = brw_ELSE(p, if_inst[if_depth-1]);
 		break;
 	    case OPCODE_ENDIF:
-		assert(if_insn > 0);
-		brw_ENDIF(p, if_inst[--if_insn]);
+		assert(if_depth > 0);
+		brw_ENDIF(p, if_inst[--if_depth]);
 		break;
 	    case OPCODE_BGNSUB:
+		brw_save_label(p, inst->Comment, p->nr_insn);
+		break;
 	    case OPCODE_ENDSUB:
+		/* no-op */
 		break;
 	    case OPCODE_CAL: 
 		brw_push_insn_state(p);
@@ -2456,8 +2949,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
                 brw_set_access_mode(p, BRW_ALIGN_16);
                 brw_ADD(p, get_addr_reg(stack_index),
                          get_addr_reg(stack_index), brw_imm_d(4));
-                orig_inst = inst->Data;
-                orig_inst->Data = &p->store[p->nr_insn];
+		brw_save_call(&c->func, inst->Comment, p->nr_insn);
                 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
                 brw_pop_insn_state(p);
 		break;
@@ -2474,7 +2966,8 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 
 		break;
 	    case OPCODE_BGNLOOP:
-		loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
+                /* XXX may need to invalidate the current_constant regs */
+		loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
 		break;
 	    case OPCODE_BRK:
 		brw_BREAK(p);
@@ -2485,39 +2978,69 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 		break;
 	    case OPCODE_ENDLOOP: 
-		loop_insn--;
-		inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
-		/* patch all the BREAK instructions from
-		   last BEGINLOOP */
-		while (inst0 > loop_inst[loop_insn]) {
-		    inst0--;
-		    if (inst0->header.opcode == BRW_OPCODE_BREAK) {
-			inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
+               {
+                  struct brw_instruction *inst0, *inst1;
+                  GLuint br = 1;
+
+                  if (BRW_IS_IGDNG(brw))
+                     br = 2;
+ 
+                  loop_depth--;
+                  inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
+                  /* patch all the BREAK/CONT instructions from last BGNLOOP */
+                  while (inst0 > loop_inst[loop_depth]) {
+                     inst0--;
+                     if (inst0->header.opcode == BRW_OPCODE_BREAK) {
+			inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
 			inst0->bits3.if_else.pop_count = 0;
-		    } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
-                        inst0->bits3.if_else.jump_count = inst1 - inst0;
+                     }
+                     else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+                        inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
                         inst0->bits3.if_else.pop_count = 0;
-                    }
-		}
-		break;
+                     }
+                  }
+               }
+               break;
 	    default:
 		_mesa_printf("unsupported IR in fragment shader %d\n",
 			inst->Opcode);
 	}
+
 	if (inst->CondUpdate)
 	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 	else
 	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
     }
     post_wm_emit(c);
-    for (i = 0; i < c->fp->program.Base.NumInstructions; i++)
-	c->fp->program.Base.Instructions[i].Data = NULL;
+
+    if (INTEL_DEBUG & DEBUG_WM) {
+      _mesa_printf("wm-native:\n");
+      for (i = 0; i < p->nr_insn; i++)
+	 brw_disasm(stderr, &p->store[i]);
+      _mesa_printf("\n");
+    }
 }
 
+/**
+ * Do GPU code generation for shaders that use GLSL features such as
+ * flow control.  Other shaders will be compiled with the non-GLSL path.
+ */
 void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
 {
+    if (INTEL_DEBUG & DEBUG_WM) {
+        _mesa_printf("brw_wm_glsl_emit:\n");
+    }
+
+    /* initial instruction translation/simplification */
     brw_wm_pass_fp(c);
+
+    /* actual code generation */
     brw_wm_emit_glsl(brw, c);
-    c->prog_data.total_grf = c->reg_index;
+
+    if (INTEL_DEBUG & DEBUG_WM) {
+        brw_wm_print_program(c, "brw_wm_glsl_emit done");
+    }
+
+    c->prog_data.total_grf = num_grf_used(c);
     c->prog_data.total_scratch = 0;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_iz.c b/src/mesa/drivers/dri/i965/brw_wm_iz.c
index bd60ac9b31..5e399ac62a 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_iz.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_iz.c
@@ -116,8 +116,13 @@ const struct {
  { C, 0, 1, 1, 1 } 
 };
 
+/**
+ * \param line_aa  AA_NEVER, AA_ALWAYS or AA_SOMETIMES
+ * \param lookup  bitmask of IZ_* flags
+ */
 void brw_wm_lookup_iz( GLuint line_aa,
 		       GLuint lookup,
+		       GLboolean ps_uses_depth,
 		       struct brw_wm_prog_key *key )
 {
    GLuint reg = 2;
@@ -127,7 +132,7 @@ void brw_wm_lookup_iz( GLuint line_aa,
    if (lookup & IZ_PS_COMPUTES_DEPTH_BIT)
       key->computes_depth = 1;
 
-   if (wm_iz_table[lookup].sd_present) {
+   if (wm_iz_table[lookup].sd_present || ps_uses_depth) {
       key->source_depth_reg = reg;
       reg += 2;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass0.c b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
index 205a7160d3..6279258339 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass0.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
@@ -51,6 +51,7 @@ static struct brw_wm_value *get_value( struct brw_wm_compile *c)
    return &c->vreg[c->nr_vreg++];
 }
 
+/** return pointer to a newly allocated instruction */
 static struct brw_wm_instruction *get_instruction( struct brw_wm_compile *c )
 {
    assert(c->nr_insns < BRW_WM_MAX_INSN);
@@ -60,6 +61,7 @@ static struct brw_wm_instruction *get_instruction( struct brw_wm_compile *c )
 /***********************************************************************
  */
 
+/** Init the "undef" register */
 static void pass0_init_undef( struct brw_wm_compile *c)
 {
    struct brw_wm_ref *ref = &c->undef_ref;
@@ -69,6 +71,7 @@ static void pass0_init_undef( struct brw_wm_compile *c)
    ref->prevuse = NULL;
 }
 
+/** Set a FP register to a value */
 static void pass0_set_fpreg_value( struct brw_wm_compile *c,
 				   GLuint file,
 				   GLuint idx,
@@ -83,6 +86,7 @@ static void pass0_set_fpreg_value( struct brw_wm_compile *c,
    c->pass0_fp_reg[file][idx][component] = ref;
 }
 
+/** Set a FP register to a ref */
 static void pass0_set_fpreg_ref( struct brw_wm_compile *c,
 				 GLuint file,
 				 GLuint idx,
@@ -115,12 +119,13 @@ static const struct brw_wm_ref *get_param_ref( struct brw_wm_compile *c,
       ref->value = &c->creg[i/16];
       ref->insn = 0;
       ref->prevuse = NULL;
-      
+
       return ref;
    }
 }
 
 
+/** Return a ref to a constant/literal value */
 static const struct brw_wm_ref *get_const_ref( struct brw_wm_compile *c,
 					       const GLfloat *constval )
 {
@@ -142,7 +147,7 @@ static const struct brw_wm_ref *get_const_ref( struct brw_wm_compile *c,
        */
       c->constref[i].constval = *constval;
       c->constref[i].ref = get_param_ref(c, constval);
-   
+
       return c->constref[i].ref;
    }
    else {
@@ -187,7 +192,7 @@ static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
 	 
 	 /* There's something really hokey about parameters parsed in
 	  * arb programs - they all end up in here, whether they be
-	  * state values, paramters or constants.  This duplicates the
+	  * state values, parameters or constants.  This duplicates the
 	  * structure above & also seems to subvert the limits set for
 	  * each type of constant/param.
 	  */ 
@@ -198,7 +203,7 @@ static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
 	     */
 	    ref = get_const_ref(c, &plist->ParameterValues[idx][component]);
 	    break;
-	    
+
 	 case PROGRAM_STATE_VAR:
 	 case PROGRAM_UNIFORM:
 	    /* These may change from run to run:
@@ -229,14 +234,13 @@ static const struct brw_wm_ref *pass0_get_reg( struct brw_wm_compile *c,
 
 
 
-
 /***********************************************************************
  * Straight translation to internal instruction format
  */
 
 static void pass0_set_dst( struct brw_wm_compile *c,
-			   struct brw_wm_instruction *out,		     
-			   const struct prog_instruction *inst,		     
+			   struct brw_wm_instruction *out,
+			   const struct prog_instruction *inst,
 			   GLuint writemask )
 {
    const struct prog_dst_register *dst = &inst->DstReg;
@@ -245,44 +249,14 @@ static void pass0_set_dst( struct brw_wm_compile *c,
    for (i = 0; i < 4; i++) {
       if (writemask & (1<<i)) {
 	 out->dst[i] = get_value(c);
-
 	 pass0_set_fpreg_value(c, dst->File, dst->Index, i, out->dst[i]);
       }
    }
-   
-   out->writemask = writemask;
-}
 
-
-static void pass0_set_dst_scalar( struct brw_wm_compile *c,
-				  struct brw_wm_instruction *out,		     
-				  const struct prog_instruction *inst,		     
-				  GLuint writemask )
-{
-   if (writemask) {
-      const struct prog_dst_register *dst = &inst->DstReg;
-      GLuint i;
-
-      /* Compute only the first (X) value:
-       */
-      out->writemask = WRITEMASK_X;
-      out->dst[0] = get_value(c);
-
-      /* Update our tracking register file for all the components in
-       * writemask:
-       */
-      for (i = 0; i < 4; i++) {
-	 if (writemask & (1<<i)) {
-	    pass0_set_fpreg_value(c, dst->File, dst->Index, i, out->dst[0]);
-	 }
-      }
-   }
-   else
-      out->writemask = 0;
+   out->writemask = writemask;
 }
 
 
-
 static const struct brw_wm_ref *get_fp_src_reg_ref( struct brw_wm_compile *c,
 						    struct prog_src_register src,
 						    GLuint i )
@@ -292,14 +266,13 @@ static const struct brw_wm_ref *get_fp_src_reg_ref( struct brw_wm_compile *c,
    static const GLfloat const_zero = 0.0;
    static const GLfloat const_one = 1.0;
 
-	 
    if (component == SWIZZLE_ZERO) 
       src_ref = get_const_ref(c, &const_zero);
    else if (component == SWIZZLE_ONE) 
       src_ref = get_const_ref(c, &const_one);
    else 
       src_ref = pass0_get_reg(c, src.File, src.Index, component);
-	 
+
    return src_ref;
 }
 
@@ -311,19 +284,19 @@ static struct brw_wm_ref *get_new_ref( struct brw_wm_compile *c,
 {
    const struct brw_wm_ref *ref = get_fp_src_reg_ref(c, src, i);
    struct brw_wm_ref *newref = get_ref(c);
-      
+
    newref->value = ref->value;
    newref->hw_reg = ref->hw_reg;
 
-   if (insn) { 
+   if (insn) {
       newref->insn = insn - c->instruction;
       newref->prevuse = newref->value->lastuse;
       newref->value->lastuse = newref;
    }
 
-   if (src.NegateBase & (1<<i)) 
+   if (src.Negate & (1 << i))
       newref->hw_reg.negate ^= 1;
-	    
+
    if (src.Abs) {
       newref->hw_reg.negate = 0;
       newref->hw_reg.abs = 1;
@@ -333,9 +306,9 @@ static struct brw_wm_ref *get_new_ref( struct brw_wm_compile *c,
 }
 
 
-
-static struct brw_wm_instruction *translate_insn( struct brw_wm_compile *c,
-						  const struct prog_instruction *inst )
+static void
+translate_insn(struct brw_wm_compile *c,
+               const struct prog_instruction *inst)
 {
    struct brw_wm_instruction *out = get_instruction(c);
    GLuint writemask = inst->DstReg.WriteMask;
@@ -348,8 +321,9 @@ static struct brw_wm_instruction *translate_insn( struct brw_wm_compile *c,
    out->saturate = (inst->SaturateMode != SATURATE_OFF);
    out->tex_unit = inst->TexSrcUnit;
    out->tex_idx = inst->TexSrcTarget;
-   out->eot = inst->Sampler & 1;
-   out->target = inst->Sampler>>1;
+   out->tex_shadow = inst->TexShadow;
+   out->eot = inst->Aux & 1;
+   out->target = inst->Aux >> 1;
 
    /* Args:
     */
@@ -361,12 +335,7 @@ static struct brw_wm_instruction *translate_insn( struct brw_wm_compile *c,
 
    /* Dst:
     */
-   if (brw_wm_is_scalar_result(out->opcode)) 
-      pass0_set_dst_scalar(c, out, inst, writemask);
-   else 
-      pass0_set_dst(c, out, inst, writemask);
-
-   return out;
+   pass0_set_dst(c, out, inst, writemask);
 }
 
 
@@ -379,14 +348,22 @@ static void pass0_precalc_mov( struct brw_wm_compile *c,
 {
    const struct prog_dst_register *dst = &inst->DstReg;
    GLuint writemask = inst->DstReg.WriteMask;
+   struct brw_wm_ref *refs[4];
    GLuint i;
 
    /* Get the effect of a MOV by manipulating our register table:
+    * First get all refs, then assign refs.  This ensures that "in-place"
+    * swizzles such as:
+    *   MOV t, t.xxyx
+    * are handled correctly.  Previously, these two steps were done in
+    * one loop and the above case was incorrectly handled.
     */
    for (i = 0; i < 4; i++) {
-      if (writemask & (1<<i)) {	    
-	 pass0_set_fpreg_ref( c, dst->File, dst->Index, i, 
-			      get_new_ref(c, inst->SrcReg[0], i, NULL));
+      refs[i] = get_new_ref(c, inst->SrcReg[0], i, NULL);
+   }
+   for (i = 0; i < 4; i++) {
+      if (writemask & (1 << i)) {	    
+         pass0_set_fpreg_ref( c, dst->File, dst->Index, i, refs[i]);
       }
    }
 }
@@ -418,6 +395,7 @@ static void pass0_init_payload( struct brw_wm_compile *c )
 			     &c->payload.input_interp[i] );      
 }
 
+
 /***********************************************************************
  * PASS 0
  *
@@ -440,7 +418,6 @@ void brw_wm_pass0( struct brw_wm_compile *c )
    for (insn = 0; insn < c->nr_fp_insns; insn++) {
       const struct prog_instruction *inst = &c->prog_instructions[insn];
 
-
       /* Optimize away moves, otherwise emit translated instruction:
        */      
       switch (inst->Opcode) {
@@ -453,8 +430,6 @@ void brw_wm_pass0( struct brw_wm_compile *c )
 	    translate_insn(c, inst);
 	 }
 	 break;
-	 
-
       default:
 	 translate_insn(c, inst);
 	 break;
@@ -465,4 +440,3 @@ void brw_wm_pass0( struct brw_wm_compile *c )
       brw_wm_print_program(c, "pass0");
    }
 }
-
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass1.c b/src/mesa/drivers/dri/i965/brw_wm_pass1.c
index f6f3a38e9e..b449394029 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass1.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass1.c
@@ -58,7 +58,8 @@ static void unlink_ref(struct brw_wm_ref *ref)
 
    if (ref == value->lastuse) {
       value->lastuse = ref->prevuse;
-   } else {
+   }
+   else {
       struct brw_wm_ref *i = value->lastuse;
       while (i->prevuse != ref) i = i->prevuse;
       i->prevuse = ref->prevuse;
@@ -75,8 +76,9 @@ static void track_arg(struct brw_wm_compile *c,
    for (i = 0; i < 4; i++) {
       struct brw_wm_ref *ref = inst->src[arg][i];
       if (ref) {
-	 if (readmask & (1<<i)) 
+	 if (readmask & (1<<i)) {
 	    ref->value->contributes_to_output = 1;
+         }
 	 else {
 	    unlink_ref(ref);
 	    inst->src[arg][i] = NULL;
@@ -88,15 +90,21 @@ static void track_arg(struct brw_wm_compile *c,
 static GLuint get_texcoord_mask( GLuint tex_idx )
 {
    switch (tex_idx) {
-   case TEXTURE_1D_INDEX: return WRITEMASK_X;
-   case TEXTURE_2D_INDEX: return WRITEMASK_XY;
-   case TEXTURE_3D_INDEX: return WRITEMASK_XYZ;
-   case TEXTURE_CUBE_INDEX: return WRITEMASK_XYZ;
-   case TEXTURE_RECT_INDEX: return WRITEMASK_XY;
+   case TEXTURE_1D_INDEX:
+      return WRITEMASK_X;
+   case TEXTURE_2D_INDEX:
+      return WRITEMASK_XY;
+   case TEXTURE_3D_INDEX:
+      return WRITEMASK_XYZ;
+   case TEXTURE_CUBE_INDEX:
+      return WRITEMASK_XYZ;
+   case TEXTURE_RECT_INDEX:
+      return WRITEMASK_XY;
    default: return 0;
    }
 }
 
+
 /* Step two: Basically this is dead code elimination.  
  *
  * Iterate backwards over instructions, noting which values
@@ -151,6 +159,7 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       case OPCODE_FRC:
       case OPCODE_MOV:
       case OPCODE_SWZ:
+      case OPCODE_TRUNC:
 	 read0 = writemask;
 	 break;
 
@@ -169,6 +178,11 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 read1 = writemask;
 	 break;
 
+      case OPCODE_DDX:
+      case OPCODE_DDY:
+	 read0 = writemask;
+	 break;
+
       case OPCODE_MAD:	
       case OPCODE_CMP:
       case OPCODE_LRP:
@@ -202,9 +216,10 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 break;
 
       case OPCODE_TEX:
+      case OPCODE_TXP:
 	 read0 = get_texcoord_mask(inst->tex_idx);
 
-	 if (c->key.shadowtex_mask & (1<<inst->tex_unit))
+         if (inst->tex_shadow)
 	    read0 |= WRITEMASK_Z;
 	 break;
 
@@ -259,7 +274,8 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 	 break;
 
       case OPCODE_DST:
-      case OPCODE_TXP:
+      case WM_FRONTFACING:
+      case OPCODE_KIL_NV:
       default:
 	 break;
       }
@@ -273,6 +289,3 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       brw_wm_print_program(c, "pass1");
    }
 }
-
-
-
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass2.c b/src/mesa/drivers/dri/i965/brw_wm_pass2.c
index 6fca9ad220..6faea018fb 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass2.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass2.c
@@ -69,8 +69,6 @@ static void prealloc_reg(struct brw_wm_compile *c,
  */
 static void init_registers( struct brw_wm_compile *c )
 {
-   struct brw_context *brw = c->func.brw;
-   GLuint inputs = (brw->vs.prog_data->outputs_written & DO_SETUP_BITS);
    GLuint nr_interp_regs = 0;
    GLuint i = 0;
    GLuint j;
@@ -84,18 +82,22 @@ static void init_registers( struct brw_wm_compile *c )
    for (j = 0; j < c->nr_creg; j++) 
       prealloc_reg(c, &c->creg[j], i++);
 
-   for (j = 0; j < FRAG_ATTRIB_MAX; j++) 
-      if (inputs & (1<<j)) {
-	 /* index for vs output and ps input are not the same 
-	    in shader varying */
-	 GLuint index;
-	 if (j > FRAG_ATTRIB_VAR0)
-	     index = j - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
+   for (j = 0; j < FRAG_ATTRIB_MAX; j++) {
+      if (c->key.vp_outputs_written & (1<<j)) {
+	 int fp_index;
+
+	 if (j >= VERT_RESULT_VAR0)
+	    fp_index = j - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
+	 else if (j <= VERT_RESULT_TEX7)
+	    fp_index = j;
 	 else
-	     index = j;
+	    fp_index = -1;
+
 	 nr_interp_regs++;
-	 prealloc_reg(c, &c->payload.input_interp[index], i++);
+	 if (fp_index >= 0)
+	    prealloc_reg(c, &c->payload.input_interp[fp_index], i++);
       }
+   }
 
    assert(nr_interp_regs >= 1);
 
@@ -120,7 +122,7 @@ static void update_register_usage(struct brw_wm_compile *c,
       /* Only search those which can change:
        */
       if (grf->nextuse < thisinsn) {
-	 struct brw_wm_ref *ref = grf->value->lastuse;
+	 const struct brw_wm_ref *ref = grf->value->lastuse;
 
 	 /* Has last use of value been passed?
 	  */
@@ -148,7 +150,7 @@ static void spill_value(struct brw_wm_compile *c,
    /* Allocate a spill slot.  Note that allocations start from 0x40 -
     * the first slot is reserved to mean "undef" in brw_wm_emit.c
     */
-   if (!value->spill_slot) {  
+   if (!value->spill_slot) {
       c->last_scratch += 0x40;	
       value->spill_slot = c->last_scratch;
    }
@@ -189,7 +191,7 @@ static GLuint search_contiguous_regs(struct brw_wm_compile *c,
 	 if (grf[i+j].nextuse < group_nextuse)
 	    group_nextuse = grf[i+j].nextuse;
       }
-	 
+
       if (group_nextuse > furthest) {
 	 furthest = group_nextuse;
 	 reg = i;
@@ -197,7 +199,7 @@ static GLuint search_contiguous_regs(struct brw_wm_compile *c,
    }
 
    assert(furthest != thisinsn);
-   
+
    /* Any non-empty regs will need to be spilled:
     */
    for (j = 0; j < nr; j++) 
@@ -243,7 +245,7 @@ static void alloc_contiguous_dest(struct brw_wm_compile *c,
 
 static void load_args(struct brw_wm_compile *c, 
 		      struct brw_wm_instruction *inst)
-{   
+{
    GLuint thisinsn = inst - c->instruction;
    GLuint i,j;
 
@@ -258,17 +260,17 @@ static void load_args(struct brw_wm_compile *c,
 		* register allocation and mark the ref as requiring a fill.
 		*/
 	       GLuint reg = search_contiguous_regs(c, 1, thisinsn);
-            
+
 	       c->pass2_grf[reg].value = ref->value;
 	       c->pass2_grf[reg].nextuse = thisinsn;
-	    
+
 	       ref->value->resident = &c->pass2_grf[reg];
 
 	       /* Note that a fill is required:
 		*/
 	       ref->unspill_reg = reg*2;
 	    }
-	    
+
 	    /* Adjust the hw_reg to point at the value's current location:
 	     */
 	    assert(ref->value == ref->value->resident->value);
@@ -294,7 +296,7 @@ void brw_wm_pass2( struct brw_wm_compile *c )
 
    for (insn = 0; insn < c->nr_insns; insn++) {
       struct brw_wm_instruction *inst = &c->instruction[insn];
-      
+
       /* Update registers' nextuse values:
        */
       update_register_usage(c, insn);
@@ -322,11 +324,11 @@ void brw_wm_pass2( struct brw_wm_compile *c )
 	 break;
       }
 
-      if (TEST_DST_SPILLS && inst->opcode != WM_PIXELXY)
+      if (TEST_DST_SPILLS && inst->opcode != WM_PIXELXY) {
 	 for (i = 0; i < 4; i++)	
 	    if (inst->dst[i])
 	       spill_value(c, inst->dst[i]);
-
+      }
    }
 
    if (INTEL_DEBUG & DEBUG_WM) {
@@ -339,6 +341,3 @@ void brw_wm_pass2( struct brw_wm_compile *c )
        brw_wm_print_program(c, "pass2/done");
    }
 }
-
-
-
diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index 8c9cb78945..dff466587a 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -95,6 +95,7 @@ struct wm_sampler_key {
    int sampler_count;
 
    struct wm_sampler_entry {
+      GLenum tex_target;
       GLenum wrap_r, wrap_s, wrap_t;
       float maxlod, minlod;
       float lod_bias;
@@ -102,6 +103,10 @@ struct wm_sampler_key {
       GLenum minfilter, magfilter;
       GLenum comparemode, comparefunc;
       dri_bo *sdc_bo;
+
+      /** If target is cubemap, take context setting.
+       */
+      GLboolean seamless_cube_map;
    } sampler[BRW_MAX_TEX_UNIT];
 };
 
@@ -151,7 +156,7 @@ static void brw_update_sampler_state(struct wm_sampler_entry *key,
       sampler->ss0.mag_filter = BRW_MAPFILTER_ANISOTROPIC;
 
       if (key->max_aniso > 2.0) {
-	 sampler->ss3.max_aniso = MAX2((key->max_aniso - 2) / 2,
+	 sampler->ss3.max_aniso = MIN2((key->max_aniso - 2) / 2,
 				       BRW_ANISORATIO_16);
       }
    }
@@ -172,15 +177,29 @@ static void brw_update_sampler_state(struct wm_sampler_entry *key,
    sampler->ss1.s_wrap_mode = translate_wrap_mode(key->wrap_s);
    sampler->ss1.t_wrap_mode = translate_wrap_mode(key->wrap_t);
 
-   /* Fulsim complains if I don't do this.  Hardware doesn't mind:
+   /* Cube-maps on 965 and later must use the same wrap mode for all 3
+    * coordinate dimensions.  Further, only CUBE and CLAMP are valid.
     */
-#if 0
-   if (texObj->Target == GL_TEXTURE_CUBE_MAP_ARB) {
-      sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CUBE;
-      sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CUBE;
-      sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CUBE;
+   if (key->tex_target == GL_TEXTURE_CUBE_MAP) {
+      if (key->seamless_cube_map &&
+	  (key->minfilter != GL_NEAREST || key->magfilter != GL_NEAREST)) {
+	 sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CUBE;
+	 sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CUBE;
+	 sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CUBE;
+      } else {
+	 sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+	 sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+	 sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CLAMP;
+      }
+   } else if (key->tex_target == GL_TEXTURE_1D) {
+      /* There's a bug in 1D texture sampling - it actually pays
+       * attention to the wrap_t value, though it should not.
+       * Override the wrap_t value here to GL_REPEAT to keep
+       * any nonexistent border pixels from floating in.
+       */
+      sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_WRAP;
    }
-#endif
+
 
    /* Set shadow function: 
     */
@@ -215,24 +234,31 @@ static void brw_update_sampler_state(struct wm_sampler_entry *key,
    sampler->ss2.default_color_pointer = sdc_bo->offset >> 5; /* reloc */
 }
 
+
 /** Sets up the cache key for sampler state for all texture units */
 static void
 brw_wm_sampler_populate_key(struct brw_context *brw,
 			    struct wm_sampler_key *key)
 {
+   GLcontext *ctx = &brw->intel.ctx;
    int unit;
 
    memset(key, 0, sizeof(*key));
 
    for (unit = 0; unit < BRW_MAX_TEX_UNIT; unit++) {
-      if (brw->attribs.Texture->Unit[unit]._ReallyEnabled) {
+      if (ctx->Texture.Unit[unit]._ReallyEnabled) {
 	 struct wm_sampler_entry *entry = &key->sampler[unit];
-	 struct gl_texture_unit *texUnit = &brw->attribs.Texture->Unit[unit];
+	 struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
 	 struct gl_texture_object *texObj = texUnit->_Current;
 	 struct intel_texture_object *intelObj = intel_texture_object(texObj);
 	 struct gl_texture_image *firstImage =
 	    texObj->Image[0][intelObj->firstLevel];
 
+         entry->tex_target = texObj->Target;
+
+	 entry->seamless_cube_map = (texObj->Target == GL_TEXTURE_CUBE_MAP)
+	    ? ctx->Texture.CubeMapSeamless : GL_FALSE;
+
 	 entry->wrap_r = texObj->WrapR;
 	 entry->wrap_s = texObj->WrapS;
 	 entry->wrap_t = texObj->WrapT;
@@ -274,6 +300,7 @@ brw_wm_sampler_populate_key(struct brw_context *brw,
  */
 static void upload_wm_samplers( struct brw_context *brw )
 {
+   GLcontext *ctx = &brw->intel.ctx;
    struct wm_sampler_key key;
    int i;
 
@@ -317,7 +344,7 @@ static void upload_wm_samplers( struct brw_context *brw )
 
       /* Emit SDC relocations */
       for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
-	 if (!brw->attribs.Texture->Unit[i]._ReallyEnabled)
+	 if (!ctx->Texture.Unit[i]._ReallyEnabled)
 	    continue;
 
 	 dri_bo_emit_reloc(brw->wm.sampler_bo,
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index 5302405eda..361f91292b 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -60,7 +60,9 @@ struct brw_wm_unit_key {
 static void
 wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 {
+   GLcontext *ctx = &brw->intel.ctx;
    const struct gl_fragment_program *fp = brw->fragment_program;
+   const struct brw_fragment_program *bfp = (struct brw_fragment_program *) fp;
    struct intel_context *intel = &brw->intel;
 
    memset(key, 0, sizeof(*key));
@@ -69,7 +71,9 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
       key->max_threads = 1;
    else {
       /* WM maximum threads is number of EUs times number of threads per EU. */
-      if (BRW_IS_G4X(brw))
+      if (BRW_IS_IGDNG(brw))
+         key->max_threads = 12 * 6;
+      else if (BRW_IS_G4X(brw))
 	 key->max_threads = 10 * 5;
       else
 	 key->max_threads = 8 * 4;
@@ -95,31 +99,43 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
    key->sampler_count = brw->wm.sampler_count;
 
    /* _NEW_POLYGONSTIPPLE */
-   key->polygon_stipple = brw->attribs.Polygon->StippleFlag;
+   key->polygon_stipple = ctx->Polygon.StippleFlag;
 
    /* BRW_NEW_FRAGMENT_PROGRAM */
    key->uses_depth = (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
 
    /* as far as we can tell */
    key->computes_depth =
-      (fp->Base.OutputsWritten & (1 << FRAG_RESULT_DEPR)) != 0;
+      (fp->Base.OutputsWritten & (1 << FRAG_RESULT_DEPTH)) != 0;
+   /* BRW_NEW_DEPTH_BUFFER
+    * Override for NULL depthbuffer case, required by the Pixel Shader Computed
+    * Depth field.
+    */
+   if (brw->state.depth_region == NULL)
+      key->computes_depth = 0;
 
    /* _NEW_COLOR */
-   key->uses_kill = fp->UsesKill || brw->attribs.Color->AlphaEnabled;
-   key->is_glsl = brw_wm_is_glsl(fp);
+   key->uses_kill = fp->UsesKill || ctx->Color.AlphaEnabled;
+   key->is_glsl = bfp->isGLSL;
+
+   /* temporary sanity check assertion */
+   ASSERT(bfp->isGLSL == brw_wm_is_glsl(fp));
 
-   /* XXX: This needs a flag to indicate when it changes. */
+   /* _NEW_DEPTH */
    key->stats_wm = intel->stats_wm;
 
    /* _NEW_LINE */
-   key->line_stipple = brw->attribs.Line->StippleFlag;
+   key->line_stipple = ctx->Line.StippleFlag;
 
    /* _NEW_POLYGON */
-   key->offset_enable = brw->attribs.Polygon->OffsetFill;
-   key->offset_units = brw->attribs.Polygon->OffsetUnits;
-   key->offset_factor = brw->attribs.Polygon->OffsetFactor;
+   key->offset_enable = ctx->Polygon.OffsetFill;
+   key->offset_units = ctx->Polygon.OffsetUnits;
+   key->offset_factor = ctx->Polygon.OffsetFactor;
 }
 
+/**
+ * Set up wm hardware state.  See page 225 of Volume 2
+ */
 static dri_bo *
 wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
 			dri_bo **reloc_bufs)
@@ -133,11 +149,15 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
    wm.thread0.kernel_start_pointer = brw->wm.prog_bo->offset >> 6; /* reloc */
    wm.thread1.depth_coef_urb_read_offset = 1;
    wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
-   wm.thread1.binding_table_entry_count = key->nr_surfaces;
+
+   if (BRW_IS_IGDNG(brw))
+      wm.thread1.binding_table_entry_count = 0; /* hardware requirement */
+   else
+      wm.thread1.binding_table_entry_count = key->nr_surfaces;
 
    if (key->total_scratch != 0) {
       wm.thread2.scratch_space_base_pointer =
-	 brw->wm.scratch_buffer->offset >> 10; /* reloc */
+	 brw->wm.scratch_bo->offset >> 10; /* reloc */
       wm.thread2.per_thread_scratch_space = key->total_scratch / 1024 - 1;
    } else {
       wm.thread2.scratch_space_base_pointer = 0;
@@ -146,11 +166,15 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
 
    wm.thread3.dispatch_grf_start_reg = key->dispatch_grf_start_reg;
    wm.thread3.urb_entry_read_length = key->urb_entry_read_length;
+   wm.thread3.urb_entry_read_offset = 0;
    wm.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
    wm.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
-   wm.thread3.urb_entry_read_offset = 0;
 
-   wm.wm4.sampler_count = (key->sampler_count + 1) / 4;
+   if (BRW_IS_IGDNG(brw)) 
+      wm.wm4.sampler_count = 0; /* hardware requirement */
+   else
+      wm.wm4.sampler_count = (key->sampler_count + 1) / 4;
+
    if (brw->wm.sampler_bo != NULL) {
       /* reloc */
       wm.wm4.sampler_state_pointer = brw->wm.sampler_bo->offset >> 5;
@@ -215,7 +239,7 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
 			0, 0,
 			wm.thread2.per_thread_scratch_space,
 			offsetof(struct brw_wm_unit_state, thread2),
-			brw->wm.scratch_buffer);
+			brw->wm.scratch_bo);
    }
 
    /* Emit sampler state relocation */
@@ -246,20 +270,20 @@ static void upload_wm_unit( struct brw_context *brw )
    if (key.total_scratch) {
       GLuint total = key.total_scratch * key.max_threads;
 
-      if (brw->wm.scratch_buffer && total > brw->wm.scratch_buffer->size) {
-	 dri_bo_unreference(brw->wm.scratch_buffer);
-	 brw->wm.scratch_buffer = NULL;
+      if (brw->wm.scratch_bo && total > brw->wm.scratch_bo->size) {
+	 dri_bo_unreference(brw->wm.scratch_bo);
+	 brw->wm.scratch_bo = NULL;
       }
-      if (brw->wm.scratch_buffer == NULL) {
-	 brw->wm.scratch_buffer = dri_bo_alloc(intel->bufmgr,
-					       "wm scratch",
-					       total,
-					       4096);
+      if (brw->wm.scratch_bo == NULL) {
+	 brw->wm.scratch_bo = dri_bo_alloc(intel->bufmgr,
+                                           "wm scratch",
+                                           total,
+                                           4096);
       }
    }
 
    reloc_bufs[0] = brw->wm.prog_bo;
-   reloc_bufs[1] = brw->wm.scratch_buffer;
+   reloc_bufs[1] = brw->wm.scratch_bo;
    reloc_bufs[2] = brw->wm.sampler_bo;
 
    dri_bo_unreference(brw->wm.state_bo);
@@ -277,11 +301,13 @@ const struct brw_tracked_state brw_wm_unit = {
       .mesa = (_NEW_POLYGON | 
 	       _NEW_POLYGONSTIPPLE | 
 	       _NEW_LINE | 
-	       _NEW_COLOR),
+	       _NEW_COLOR |
+	       _NEW_DEPTH),
 
       .brw = (BRW_NEW_FRAGMENT_PROGRAM | 
 	      BRW_NEW_CURBE_OFFSETS |
-	      BRW_NEW_NR_SURFACES),
+	      BRW_NEW_DEPTH_BUFFER |
+	      BRW_NEW_NR_WM_SURFACES),
 
       .cache = (CACHE_NEW_WM_PROG |
 		CACHE_NEW_SAMPLER)
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 06e71e6d69..51539ac1e7 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -33,11 +33,12 @@
 #include "main/mtypes.h"
 #include "main/texformat.h"
 #include "main/texstore.h"
+#include "shader/prog_parameter.h"
 
 #include "intel_mipmap_tree.h"
 #include "intel_batchbuffer.h"
 #include "intel_tex.h"
-
+#include "intel_fbo.h"
 
 #include "brw_context.h"
 #include "brw_state.h"
@@ -69,7 +70,8 @@ static GLuint translate_tex_target( GLenum target )
 }
 
 
-static GLuint translate_tex_format( GLuint mesa_format, GLenum depth_mode )
+static GLuint translate_tex_format( GLuint mesa_format, GLenum internal_format,
+				    GLenum depth_mode )
 {
    switch( mesa_format ) {
    case MESA_FORMAT_L8:
@@ -89,10 +91,16 @@ static GLuint translate_tex_format( GLuint mesa_format, GLenum depth_mode )
       return BRW_SURFACEFORMAT_R8G8B8_UNORM;      
 
    case MESA_FORMAT_ARGB8888:
-      return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
+      if (internal_format == GL_RGB)
+	 return BRW_SURFACEFORMAT_B8G8R8X8_UNORM;
+      else
+	 return BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
 
    case MESA_FORMAT_RGBA8888_REV:
-      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
+      if (internal_format == GL_RGB)
+	 return BRW_SURFACEFORMAT_R8G8B8X8_UNORM;
+      else
+	 return BRW_SURFACEFORMAT_R8G8B8A8_UNORM;
 
    case MESA_FORMAT_RGB565:
       return BRW_SURFACEFORMAT_B5G6R5_UNORM;
@@ -133,13 +141,34 @@ static GLuint translate_tex_format( GLuint mesa_format, GLenum depth_mode )
    case MESA_FORMAT_RGBA_DXT5:
        return BRW_SURFACEFORMAT_BC3_UNORM;
 
-   case MESA_FORMAT_SRGBA8:
-      return BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB;
+   case MESA_FORMAT_SARGB8:
+      return BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB;
+
+   case MESA_FORMAT_SLA8:
+      return BRW_SURFACEFORMAT_L8A8_UNORM_SRGB;
+
+   case MESA_FORMAT_SL8:
+      return BRW_SURFACEFORMAT_L8_UNORM_SRGB;
+
    case MESA_FORMAT_SRGB_DXT1:
       return BRW_SURFACEFORMAT_BC1_UNORM_SRGB;
 
    case MESA_FORMAT_S8_Z24:
-      return BRW_SURFACEFORMAT_I24X8_UNORM;
+      /* XXX: these different surface formats don't seem to
+       * make any difference for shadow sampler/compares.
+       */
+      if (depth_mode == GL_INTENSITY) 
+         return BRW_SURFACEFORMAT_I24X8_UNORM;
+      else if (depth_mode == GL_ALPHA)
+         return BRW_SURFACEFORMAT_A24X8_UNORM;
+      else
+         return BRW_SURFACEFORMAT_L24X8_UNORM;
+
+   case MESA_FORMAT_DUDV8:
+      return BRW_SURFACEFORMAT_R8G8_SNORM;
+
+   case MESA_FORMAT_SIGNED_RGBA8888_REV:
+      return BRW_SURFACEFORMAT_R8G8B8A8_SNORM;
 
    default:
       assert(0);
@@ -147,17 +176,6 @@ static GLuint translate_tex_format( GLuint mesa_format, GLenum depth_mode )
    }
 }
 
-struct brw_wm_surface_key {
-   GLenum target, depthmode;
-   dri_bo *bo;
-   GLint format;
-   GLint first_level, last_level;
-   GLint width, height, depth;
-   GLint pitch, cpp;
-   uint32_t tiling;
-   GLuint offset;
-};
-
 static void
 brw_set_surface_tiling(struct brw_surface_state *surf, uint32_t tiling)
 {
@@ -179,7 +197,7 @@ brw_set_surface_tiling(struct brw_surface_state *surf, uint32_t tiling)
 
 static dri_bo *
 brw_create_texture_surface( struct brw_context *brw,
-			    struct brw_wm_surface_key *key )
+			    struct brw_surface_key *key )
 {
    struct brw_surface_state surf;
    dri_bo *bo;
@@ -188,9 +206,11 @@ brw_create_texture_surface( struct brw_context *brw,
 
    surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
    surf.ss0.surface_type = translate_tex_target(key->target);
-
-   if (key->bo) 
-      surf.ss0.surface_format = translate_tex_format(key->format, key->depthmode);
+   if (key->bo) {
+      surf.ss0.surface_format = translate_tex_format(key->format,
+						     key->internal_format,
+						     key->depthmode);
+   }
    else {
       switch (key->depth) {
       case 32:
@@ -232,7 +252,7 @@ brw_create_texture_surface( struct brw_context *brw,
       surf.ss0.cube_neg_z = 1;
    }
 
-   bo = brw_upload_cache(&brw->cache, BRW_SS_SURFACE,
+   bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
 			 key, sizeof(*key),
 			 &key->bo, key->bo ? 1 : 0,
 			 &surf, sizeof(surf),
@@ -253,10 +273,11 @@ static void
 brw_update_texture_surface( GLcontext *ctx, GLuint unit )
 {
    struct brw_context *brw = brw_context(ctx);
-   struct gl_texture_object *tObj = brw->attribs.Texture->Unit[unit]._Current;
+   struct gl_texture_object *tObj = ctx->Texture.Unit[unit]._Current;
    struct intel_texture_object *intelObj = intel_texture_object(tObj);
    struct gl_texture_image *firstImage = tObj->Image[0][intelObj->firstLevel];
-   struct brw_wm_surface_key key;
+   struct brw_surface_key key;
+   const GLuint surf = SURF_INDEX_TEXTURE(unit);
 
    memset(&key, 0, sizeof(key));
 
@@ -267,6 +288,7 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
       key.offset = intelObj->textureOffset;
    } else {
       key.format = firstImage->TexFormat->MesaFormat;
+      key.internal_format = firstImage->InternalFormat;
       key.pitch = intelObj->mt->pitch;
       key.depth = firstImage->Depth;
       key.bo = intelObj->mt->region->buffer;
@@ -282,33 +304,221 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
    key.cpp = intelObj->mt->cpp;
    key.tiling = intelObj->mt->region->tiling;
 
-   dri_bo_unreference(brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS]);
-   brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
-							       &key, sizeof(key),
-							       &key.bo, key.bo ? 1 : 0,
-							       NULL);
-   if (brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS] == NULL) {
-      brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS] = brw_create_texture_surface(brw, &key);
+   dri_bo_unreference(brw->wm.surf_bo[surf]);
+   brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
+                                            BRW_SS_SURFACE,
+                                            &key, sizeof(key),
+                                            &key.bo, key.bo ? 1 : 0,
+                                            NULL);
+   if (brw->wm.surf_bo[surf] == NULL) {
+      brw->wm.surf_bo[surf] = brw_create_texture_surface(brw, &key);
+   }
+}
+
+
+
+/**
+ * Create the constant buffer surface.  Vertex/fragment shader constants will be
+ * read from this buffer with Data Port Read instructions/messages.  Returns the bo that brw_upload_cache() hands back for the new surface state.
+ */
+dri_bo *
+brw_create_constant_surface( struct brw_context *brw,
+                             struct brw_surface_key *key )
+{
+   const GLint w = key->width - 1; /* (width - 1), spread over ss2.width, ss2.height and ss3.depth below */
+   struct brw_surface_state surf;
+   dri_bo *bo;
+
+   memset(&surf, 0, sizeof(surf));
+
+   surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW;
+   surf.ss0.surface_type = BRW_SURFACE_BUFFER;
+   surf.ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
+
+   assert(key->bo); /* a bo is expected; the else arm below is only reachable when asserts are compiled out */
+   if (key->bo)
+      surf.ss1.base_addr = key->bo->offset; /* reloc, emitted after the upload below */
+   else
+      surf.ss1.base_addr = key->offset;
+
+   surf.ss2.width = w & 0x7f;            /* bits 6:0 of size or width */
+   surf.ss2.height = (w >> 7) & 0x1fff;  /* bits 19:7 of size or width */
+   surf.ss3.depth = (w >> 20) & 0x7f;    /* bits 26:20 of size or width */
+   surf.ss3.pitch = (key->pitch * key->cpp) - 1; /* ignored?? */
+   brw_set_surface_tiling(&surf, key->tiling); /* tiling now allowed */
+ 
+   bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE, /* stored in the surface cache under *key */
+			 key, sizeof(*key),
+			 &key->bo, key->bo ? 1 : 0,
+			 &surf, sizeof(surf),
+			 NULL, NULL);
+
+   if (key->bo) {
+      /* Emit relocation to surface contents */
+      dri_bo_emit_reloc(bo,
+			I915_GEM_DOMAIN_SAMPLER, 0,
+			0,
+			offsetof(struct brw_surface_state, ss1), /* the field holding base_addr, set above */
+			key->bo);
+   }
+
+   return bo; /* the cached surface-state bo, not key->bo */
+}
+
+/* Creates a new WM constant buffer reflecting the current fragment program's
+ * constants, if needed by the fragment program.
+ *
+ * Otherwise, constants go through the CURBEs using the brw_constant_buffer
+ * state atom.  Returns NULL in that case, else a newly allocated bo filled from params->ParameterValues.
+ */
+static drm_intel_bo *
+brw_wm_update_constant_buffer(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   struct brw_fragment_program *fp =
+      (struct brw_fragment_program *) brw->fragment_program;
+   const struct gl_program_parameter_list *params = fp->program.Base.Parameters;
+   const int size = params->NumParameters * 4 * sizeof(GLfloat); /* four GLfloats per parameter */
+   drm_intel_bo *const_buffer;
+
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   if (!fp->use_const_buffer)
+      return NULL;
+
+   const_buffer = drm_intel_bo_alloc(intel->bufmgr, "fp_const_buffer",
+				     size, 64); /* 64: presumably the alignment argument -- confirm against drm_intel_bo_alloc() */
+
+   /* _NEW_PROGRAM_CONSTANTS */
+   dri_bo_subdata(const_buffer, 0, size, params->ParameterValues); /* NOTE(review): allocation result is used without a NULL check */
+
+   return const_buffer;
+}
+
+/**
+ * Update the surface state for a WM constant buffer: rebuild fp->const_buffer, then look up or create brw->wm.surf_bo[surf] for it.
+ * The constant buffer will be (re)allocated here if needed.  BRW_NEW_WM_SURFACES is flagged only when a surface is set up (not on the early return).
+ */
+static void
+brw_update_wm_constant_surface( GLcontext *ctx,
+                                GLuint surf)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_surface_key key;
+   struct brw_fragment_program *fp =
+      (struct brw_fragment_program *) brw->fragment_program;
+   const struct gl_program_parameter_list *params =
+      fp->program.Base.Parameters;
+
+   /* If we're in this state update atom, we need to update WM constants, so
+    * free the old buffer and create a new one for the new contents.
+    * NOTE(review): prepare_wm_constant_surface() does the same right before calling this -- confirm the double rebuild is intended. */
+   dri_bo_unreference(fp->const_buffer);
+   fp->const_buffer = brw_wm_update_constant_buffer(brw);
+
+   /* If there's no constant buffer, then no surface BO is needed to point at
+    * it.
+    */
+   if (fp->const_buffer == 0) {
+      drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
+      brw->wm.surf_bo[surf] = NULL;
+      return;
+   }
+
+   memset(&key, 0, sizeof(key)); /* zero first: the whole struct, sizeof(key) bytes, is passed as the cache key below */
+
+   key.format = MESA_FORMAT_RGBA_FLOAT32;
+   key.internal_format = GL_RGBA;
+   key.bo = fp->const_buffer;
+   key.depthmode = GL_NONE;
+   key.pitch = params->NumParameters;
+   key.width = params->NumParameters; /* one entry per program parameter */
+   key.height = 1;
+   key.depth = 1;
+   key.cpp = 16; /* presumably bytes per entry (4 x 32-bit float, matching RGBA_FLOAT32) -- confirm */
+
+   /*
+   printf("%s:\n", __FUNCTION__);
+   printf("  width %d  height %d  depth %d  cpp %d  pitch %d\n",
+          key.width, key.height, key.depth, key.cpp, key.pitch);
+   */
+
+   dri_bo_unreference(brw->wm.surf_bo[surf]); /* release the old surface before replacing the slot */
+   brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
+                                            BRW_SS_SURFACE,
+                                            &key, sizeof(key),
+                                            &key.bo, key.bo ? 1 : 0,
+                                            NULL);
+   if (brw->wm.surf_bo[surf] == NULL) { /* cache miss: build and upload a new surface state */
+      brw->wm.surf_bo[surf] = brw_create_constant_surface(brw, &key);
+   }
+   brw->state.dirty.brw |= BRW_NEW_WM_SURFACES; /* listed in the brw_wm_surfaces atom's .brw dirty bits */
+}
+
+/**
+ * Updates surface / buffer for fragment shader constant buffer, if
+ * one is required.
+ *
+ * This consumes the state updates for the constant buffer, and produces
+ * BRW_NEW_WM_SURFACES to get picked up by prepare_wm_surfaces (the
+ * brw_wm_surfaces atom) for inclusion in the binding table.
+ */
+static void prepare_wm_constant_surface(struct brw_context *brw )
+{
+   GLcontext *ctx = &brw->intel.ctx;
+   struct brw_fragment_program *fp =
+      (struct brw_fragment_program *) brw->fragment_program;
+   GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER; /* slot in brw->wm.surf_bo[] used for this surface */
+
+   drm_intel_bo_unreference(fp->const_buffer); /* drop the previous buffer before replacing it */
+   fp->const_buffer = brw_wm_update_constant_buffer(brw); /* NULL when fp->use_const_buffer is unset */
+
+   /* If there's no constant buffer, then no surface BO is needed to point at
+    * it.
+    */
+   if (fp->const_buffer == 0) {
+      if (brw->wm.surf_bo[surf] != NULL) { /* only flag a change if a surface was actually bound */
+	 drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
+	 brw->wm.surf_bo[surf] = NULL;
+	 brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
+      }
+      return;
    }
+
+   brw_update_wm_constant_surface(ctx, surf); /* NOTE(review): this rebuilds fp->const_buffer a second time -- confirm intent */
 }
 
+const struct brw_tracked_state brw_wm_constant_surface = { /* state atom for the fragment-program constant buffer surface */
+   .dirty = {
+      .mesa = (_NEW_PROGRAM_CONSTANTS), /* consumed where params->ParameterValues is uploaded */
+      .brw = (BRW_NEW_FRAGMENT_PROGRAM), /* consumed by the fp->use_const_buffer check */
+      .cache = 0
+   },
+   .prepare = prepare_wm_constant_surface,
+};
+
+
 /**
  * Sets up a surface state structure to point at the given region.
  * While it is only used for the front/back buffer currently, it should be
  * usable for further buffers when doing ARB_draw_buffer support.
  */
 static void
-brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
-			  unsigned int unit, GLboolean cached)
+brw_update_renderbuffer_surface(struct brw_context *brw,
+				struct gl_renderbuffer *rb,
+				unsigned int unit)
 {
+   GLcontext *ctx = &brw->intel.ctx;
    dri_bo *region_bo = NULL;
+   struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+   struct intel_region *region = irb ? irb->region : NULL;
    struct {
       unsigned int surface_type;
       unsigned int surface_format;
-      unsigned int width, height, cpp;
+      unsigned int width, height, pitch, cpp;
       GLubyte color_mask[4];
       GLboolean color_blend;
       uint32_t tiling;
+      uint32_t draw_offset;
    } key;
 
    memset(&key, 0, sizeof(key));
@@ -317,34 +527,54 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
       region_bo = region->buffer;
 
       key.surface_type = BRW_SURFACE_2D;
-      if (region->cpp == 4)
+      switch (irb->texformat->MesaFormat) {
+      case MESA_FORMAT_ARGB8888:
 	 key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
-      else
+	 break;
+      case MESA_FORMAT_RGB565:
 	 key.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM;
+	 break;
+      case MESA_FORMAT_ARGB1555:
+	 key.surface_format = BRW_SURFACEFORMAT_B5G5R5A1_UNORM;
+	 break;
+      case MESA_FORMAT_ARGB4444:
+	 key.surface_format = BRW_SURFACEFORMAT_B4G4R4A4_UNORM;
+	 break;
+      default:
+	 _mesa_problem(ctx, "Bad renderbuffer format: %d\n",
+		       irb->texformat->MesaFormat);
+      }
       key.tiling = region->tiling;
-      key.width = region->pitch; /* XXX: not really! */
-      key.height = region->height;
+      if (brw->intel.intelScreen->driScrnPriv->dri2.enabled) {
+	 key.width = rb->Width;
+	 key.height = rb->Height;
+      } else {
+	 key.width = region->width;
+	 key.height = region->height;
+      }
+      key.pitch = region->pitch;
       key.cpp = region->cpp;
+      key.draw_offset = region->draw_offset; /* cur 3d or cube face offset */
    } else {
       key.surface_type = BRW_SURFACE_NULL;
       key.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM;
-      key.tiling = 0;
+      key.tiling = I915_TILING_X;
       key.width = 1;
       key.height = 1;
       key.cpp = 4;
+      key.draw_offset = 0;
    }
-   memcpy(key.color_mask, brw->attribs.Color->ColorMask,
+   memcpy(key.color_mask, ctx->Color.ColorMask,
 	  sizeof(key.color_mask));
-   key.color_blend = (!brw->attribs.Color->_LogicOpEnabled &&
-		      brw->attribs.Color->BlendEnabled);
+   key.color_blend = (!ctx->Color._LogicOpEnabled &&
+		      ctx->Color.BlendEnabled);
 
    dri_bo_unreference(brw->wm.surf_bo[unit]);
-   brw->wm.surf_bo[unit] = NULL;
-   if (cached) 
-       brw->wm.surf_bo[unit] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
-	       &key, sizeof(key),
-	       &region_bo, 1,
-	       NULL);
+   brw->wm.surf_bo[unit] = brw_search_cache(&brw->surface_cache,
+					    BRW_SS_SURFACE,
+					    &key, sizeof(key),
+					    &region_bo, 1,
+					    NULL);
 
    if (brw->wm.surf_bo[unit] == NULL) {
       struct brw_surface_state surf;
@@ -353,13 +583,34 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
 
       surf.ss0.surface_format = key.surface_format;
       surf.ss0.surface_type = key.surface_type;
+      if (key.tiling == I915_TILING_NONE) {
+	 surf.ss1.base_addr = key.draw_offset;
+      } else {
+	 uint32_t tile_offset = key.draw_offset % 4096;
+
+	 surf.ss1.base_addr = key.draw_offset - tile_offset;
+
+	 assert(BRW_IS_G4X(brw) || tile_offset == 0);
+	 if (BRW_IS_G4X(brw)) {
+	    if (key.tiling == I915_TILING_X) {
+	       /* Note that the low bits of these fields are missing, so
+		* there's the possibility of getting in trouble.
+		*/
+	       surf.ss5.x_offset = (tile_offset % 512) / key.cpp / 4;
+	       surf.ss5.y_offset = tile_offset / 512 / 2;
+	    } else {
+	       surf.ss5.x_offset = (tile_offset % 128) / key.cpp / 4;
+	       surf.ss5.y_offset = tile_offset / 128 / 2;
+	    }
+	 }
+      }
       if (region_bo != NULL)
-	 surf.ss1.base_addr = region_bo->offset; /* reloc */
+	 surf.ss1.base_addr += region_bo->offset; /* reloc */
 
       surf.ss2.width = key.width - 1;
       surf.ss2.height = key.height - 1;
       brw_set_surface_tiling(&surf, key.tiling);
-      surf.ss3.pitch = (key.width * key.cpp) - 1;
+      surf.ss3.pitch = (key.pitch * key.cpp) - 1;
 
       /* _NEW_COLOR */
       surf.ss0.color_blend = key.color_blend;
@@ -369,8 +620,9 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
       surf.ss0.writedisable_alpha = !key.color_mask[3];
 
       /* Key size will never match key size for textures, so we're safe. */
-      brw->wm.surf_bo[unit] = brw_upload_cache(&brw->cache, BRW_SS_SURFACE,
-					      &key, sizeof(key),
+      brw->wm.surf_bo[unit] = brw_upload_cache(&brw->surface_cache,
+                                               BRW_SS_SURFACE,
+                                               &key, sizeof(key),
 					       &region_bo, 1,
 					       &surf, sizeof(surf),
 					       NULL, NULL);
@@ -379,13 +631,12 @@ brw_update_region_surface(struct brw_context *brw, struct intel_region *region,
 	  * them both.  We might be able to figure out from other state
 	  * a more restrictive relocation to emit.
 	  */
-	 dri_bo_emit_reloc(brw->wm.surf_bo[unit],
-			   I915_GEM_DOMAIN_RENDER |
-			   I915_GEM_DOMAIN_SAMPLER,
-			   I915_GEM_DOMAIN_RENDER,
-			   0,
-			   offsetof(struct brw_surface_state, ss1),
-			   region_bo);
+	 drm_intel_bo_emit_reloc(brw->wm.surf_bo[unit],
+				 offsetof(struct brw_surface_state, ss1),
+				 region_bo,
+				 surf.ss1.base_addr - region_bo->offset,
+				 I915_GEM_DOMAIN_RENDER,
+				 I915_GEM_DOMAIN_RENDER);
       }
    }
 }
@@ -400,7 +651,9 @@ brw_wm_get_binding_table(struct brw_context *brw)
 {
    dri_bo *bind_bo;
 
-   bind_bo = brw_search_cache(&brw->cache, BRW_SS_SURF_BIND,
+   assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF);
+
+   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
 			      NULL, 0,
 			      brw->wm.surf_bo, brw->wm.nr_surfaces,
 			      NULL);
@@ -416,7 +669,7 @@ brw_wm_get_binding_table(struct brw_context *brw)
          else
             data[i] = 0;
 
-      bind_bo = brw_upload_cache( &brw->cache, BRW_SS_SURF_BIND,
+      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
 				  NULL, 0,
 				  brw->wm.surf_bo, brw->wm.nr_surfaces,
 				  data, data_size,
@@ -446,51 +699,62 @@ static void prepare_wm_surfaces(struct brw_context *brw )
    GLuint i;
    int old_nr_surfaces;
 
-   if (brw->state.nr_draw_regions  > 1) {
-      for (i = 0; i < brw->state.nr_draw_regions; i++) {
-         brw_update_region_surface(brw, brw->state.draw_regions[i], i,
-				   GL_FALSE);
+   /* _NEW_BUFFERS */
+   /* Update surfaces for drawing buffers */
+   if (ctx->DrawBuffer->_NumColorDrawBuffers >= 1) {
+      for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
+         brw_update_renderbuffer_surface(brw,
+					 ctx->DrawBuffer->_ColorDrawBuffers[i],
+					 i);
       }
-   }else {
-      brw_update_region_surface(brw, brw->state.draw_regions[0], 0, GL_TRUE);
+   } else {
+      brw_update_renderbuffer_surface(brw, NULL, 0);
    }
 
    old_nr_surfaces = brw->wm.nr_surfaces;
    brw->wm.nr_surfaces = MAX_DRAW_BUFFERS;
 
+   if (brw->wm.surf_bo[SURF_INDEX_FRAG_CONST_BUFFER] != NULL)
+       brw->wm.nr_surfaces = SURF_INDEX_FRAG_CONST_BUFFER + 1;
+
+   /* Update surfaces for textures */
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
-      struct gl_texture_unit *texUnit = &brw->attribs.Texture->Unit[i];
+      const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[i];
+      const GLuint surf = SURF_INDEX_TEXTURE(i);
 
       /* _NEW_TEXTURE, BRW_NEW_TEXDATA */
-      if(texUnit->_ReallyEnabled) {
+      if (texUnit->_ReallyEnabled) {
          if (texUnit->_Current == intel->frame_buffer_texobj) {
-            dri_bo_unreference(brw->wm.surf_bo[i+MAX_DRAW_BUFFERS]);
-            brw->wm.surf_bo[i+MAX_DRAW_BUFFERS] = brw->wm.surf_bo[0];
-            dri_bo_reference(brw->wm.surf_bo[i+MAX_DRAW_BUFFERS]);
-            brw->wm.nr_surfaces = i + MAX_DRAW_BUFFERS + 1;
+            /* render to texture */
+            dri_bo_unreference(brw->wm.surf_bo[surf]);
+            brw->wm.surf_bo[surf] = brw->wm.surf_bo[0];
+            dri_bo_reference(brw->wm.surf_bo[surf]);
+            brw->wm.nr_surfaces = surf + 1;
          } else {
+            /* regular texture */
             brw_update_texture_surface(ctx, i);
-            brw->wm.nr_surfaces = i + MAX_DRAW_BUFFERS + 1;
+            brw->wm.nr_surfaces = surf + 1;
          }
       } else {
-         dri_bo_unreference(brw->wm.surf_bo[i+MAX_DRAW_BUFFERS]);
-         brw->wm.surf_bo[i+MAX_DRAW_BUFFERS] = NULL;
+         dri_bo_unreference(brw->wm.surf_bo[surf]);
+         brw->wm.surf_bo[surf] = NULL;
       }
-
    }
 
    dri_bo_unreference(brw->wm.bind_bo);
    brw->wm.bind_bo = brw_wm_get_binding_table(brw);
 
    if (brw->wm.nr_surfaces != old_nr_surfaces)
-      brw->state.dirty.brw |= BRW_NEW_NR_SURFACES;
+      brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES;
 }
 
-
 const struct brw_tracked_state brw_wm_surfaces = {
    .dirty = {
-      .mesa = _NEW_COLOR | _NEW_TEXTURE | _NEW_BUFFERS,
-      .brw = BRW_NEW_CONTEXT,
+      .mesa = (_NEW_COLOR |
+               _NEW_TEXTURE |
+               _NEW_BUFFERS),
+      .brw = (BRW_NEW_CONTEXT |
+	      BRW_NEW_WM_SURFACES),
       .cache = 0
    },
    .prepare = prepare_wm_surfaces,
diff --git a/src/mesa/drivers/dri/i965/intel_clear.c b/src/mesa/drivers/dri/i965/intel_clear.c
new file mode 120000
index 0000000000..9a2a742a0d
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/intel_clear.c
@@ -0,0 +1 @@
+../intel/intel_clear.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i965/intel_depthstencil.c b/src/mesa/drivers/dri/i965/intel_depthstencil.c
deleted file mode 120000
index 4ac4ae690a..0000000000
--- a/src/mesa/drivers/dri/i965/intel_depthstencil.c
+++ /dev/null
@@ -1 +0,0 @@
-../intel/intel_depthstencil.c
-\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
new file mode 120000
index 0000000000..a2f3e8cd20
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -0,0 +1 @@
+../intel/intel_extensions.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i965/intel_generatemipmap.c b/src/mesa/drivers/dri/i965/intel_generatemipmap.c
new file mode 120000
index 0000000000..4c6b37ada0
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/intel_generatemipmap.c
@@ -0,0 +1 @@
+../intel/intel_generatemipmap.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
new file mode 120000
index 0000000000..cc4589f4d4
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
@@ -0,0 +1 @@
+../intel/intel_pixel_read.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i965/intel_state.c b/src/mesa/drivers/dri/i965/intel_state.c
index 67ef5f78c1..519672fc35 100644..120000
--- a/src/mesa/drivers/dri/i965/intel_state.c
+++ b/src/mesa/drivers/dri/i965/intel_state.c
@@ -1,225 +1 @@
-/**************************************************************************
- * 
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#include "main/glheader.h"
-#include "main/context.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "main/colormac.h"
-#include "main/dd.h"
-
-#include "intel_screen.h"
-#include "intel_context.h"
-#include "intel_regions.h"
-#include "swrast/swrast.h"
-
-int intel_translate_shadow_compare_func( GLenum func )
-{
-   switch(func) {
-   case GL_NEVER: 
-       return COMPAREFUNC_ALWAYS; 
-   case GL_LESS: 
-       return COMPAREFUNC_LEQUAL; 
-   case GL_LEQUAL: 
-       return COMPAREFUNC_LESS;
-   case GL_GREATER: 
-       return COMPAREFUNC_GEQUAL; 
-   case GL_GEQUAL: 
-      return COMPAREFUNC_GREATER; 
-   case GL_NOTEQUAL: 
-      return COMPAREFUNC_EQUAL; 
-   case GL_EQUAL: 
-      return COMPAREFUNC_NOTEQUAL; 
-   case GL_ALWAYS: 
-       return COMPAREFUNC_NEVER; 
-   }
-
-   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, func);
-   return COMPAREFUNC_NEVER; 
-}
-
-int intel_translate_compare_func( GLenum func )
-{
-   switch(func) {
-   case GL_NEVER: 
-      return COMPAREFUNC_NEVER; 
-   case GL_LESS: 
-      return COMPAREFUNC_LESS; 
-   case GL_LEQUAL: 
-      return COMPAREFUNC_LEQUAL; 
-   case GL_GREATER: 
-      return COMPAREFUNC_GREATER; 
-   case GL_GEQUAL: 
-      return COMPAREFUNC_GEQUAL; 
-   case GL_NOTEQUAL: 
-      return COMPAREFUNC_NOTEQUAL; 
-   case GL_EQUAL: 
-      return COMPAREFUNC_EQUAL; 
-   case GL_ALWAYS: 
-      return COMPAREFUNC_ALWAYS; 
-   }
-
-   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, func);
-   return COMPAREFUNC_ALWAYS; 
-}
-
-int intel_translate_stencil_op( GLenum op )
-{
-   switch(op) {
-   case GL_KEEP: 
-      return STENCILOP_KEEP; 
-   case GL_ZERO: 
-      return STENCILOP_ZERO; 
-   case GL_REPLACE: 
-      return STENCILOP_REPLACE; 
-   case GL_INCR: 
-      return STENCILOP_INCRSAT;
-   case GL_DECR: 
-      return STENCILOP_DECRSAT;
-   case GL_INCR_WRAP:
-      return STENCILOP_INCR; 
-   case GL_DECR_WRAP:
-      return STENCILOP_DECR; 
-   case GL_INVERT: 
-      return STENCILOP_INVERT; 
-   default: 
-      return STENCILOP_ZERO;
-   }
-}
-
-int intel_translate_blend_factor( GLenum factor )
-{
-   switch(factor) {
-   case GL_ZERO: 
-      return BLENDFACT_ZERO; 
-   case GL_SRC_ALPHA: 
-      return BLENDFACT_SRC_ALPHA; 
-   case GL_ONE: 
-      return BLENDFACT_ONE; 
-   case GL_SRC_COLOR: 
-      return BLENDFACT_SRC_COLR; 
-   case GL_ONE_MINUS_SRC_COLOR: 
-      return BLENDFACT_INV_SRC_COLR; 
-   case GL_DST_COLOR: 
-      return BLENDFACT_DST_COLR; 
-   case GL_ONE_MINUS_DST_COLOR: 
-      return BLENDFACT_INV_DST_COLR; 
-   case GL_ONE_MINUS_SRC_ALPHA:
-      return BLENDFACT_INV_SRC_ALPHA; 
-   case GL_DST_ALPHA: 
-      return BLENDFACT_DST_ALPHA; 
-   case GL_ONE_MINUS_DST_ALPHA:
-      return BLENDFACT_INV_DST_ALPHA; 
-   case GL_SRC_ALPHA_SATURATE: 
-      return BLENDFACT_SRC_ALPHA_SATURATE;
-   case GL_CONSTANT_COLOR:
-      return BLENDFACT_CONST_COLOR; 
-   case GL_ONE_MINUS_CONSTANT_COLOR:
-      return BLENDFACT_INV_CONST_COLOR;
-   case GL_CONSTANT_ALPHA:
-      return BLENDFACT_CONST_ALPHA; 
-   case GL_ONE_MINUS_CONSTANT_ALPHA:
-      return BLENDFACT_INV_CONST_ALPHA;
-   }
-   
-   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, factor);
-   return BLENDFACT_ZERO;
-}
-
-int intel_translate_logic_op( GLenum opcode )
-{
-   switch(opcode) {
-   case GL_CLEAR: 
-      return LOGICOP_CLEAR; 
-   case GL_AND: 
-      return LOGICOP_AND; 
-   case GL_AND_REVERSE: 
-      return LOGICOP_AND_RVRSE; 
-   case GL_COPY: 
-      return LOGICOP_COPY; 
-   case GL_COPY_INVERTED: 
-      return LOGICOP_COPY_INV; 
-   case GL_AND_INVERTED: 
-      return LOGICOP_AND_INV; 
-   case GL_NOOP: 
-      return LOGICOP_NOOP; 
-   case GL_XOR: 
-      return LOGICOP_XOR; 
-   case GL_OR: 
-      return LOGICOP_OR; 
-   case GL_OR_INVERTED: 
-      return LOGICOP_OR_INV; 
-   case GL_NOR: 
-      return LOGICOP_NOR; 
-   case GL_EQUIV: 
-      return LOGICOP_EQUIV; 
-   case GL_INVERT: 
-      return LOGICOP_INV; 
-   case GL_OR_REVERSE: 
-      return LOGICOP_OR_RVRSE; 
-   case GL_NAND: 
-      return LOGICOP_NAND; 
-   case GL_SET: 
-      return LOGICOP_SET; 
-   default:
-      return LOGICOP_SET;
-   }
-}
-
-
-static void intelClearColor(GLcontext *ctx, const GLfloat color[4])
-{
-   struct intel_context *intel = intel_context(ctx);
-
-   UNCLAMPED_FLOAT_TO_RGBA_CHAN(intel->clear_chan, color);
-
-   intel->ClearColor8888 = INTEL_PACKCOLOR8888(intel->clear_chan[0],
-					       intel->clear_chan[1],
-					       intel->clear_chan[2],
-					       intel->clear_chan[3]);
-   intel->ClearColor565 = INTEL_PACKCOLOR565(intel->clear_chan[0],
-					     intel->clear_chan[1],
-					     intel->clear_chan[2]);
-}
-
-
-
-/* Fallback to swrast for select and feedback.
- */
-static void intelRenderMode( GLcontext *ctx, GLenum mode )
-{
-   struct intel_context *intel = intel_context(ctx);
-   FALLBACK( intel, INTEL_FALLBACK_RENDERMODE, (mode != GL_RENDER) );
-}
-
-
-void intelInitStateFuncs( struct dd_function_table *functions )
-{
-   functions->RenderMode = intelRenderMode;
-   functions->ClearColor = intelClearColor;
-}
+../intel/intel_state.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i965/intel_swapbuffers.c b/src/mesa/drivers/dri/i965/intel_swapbuffers.c
new file mode 120000
index 0000000000..148d5215aa
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/intel_swapbuffers.c
@@ -0,0 +1 @@
+../intel/intel_swapbuffers.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i965/intel_syncobj.c b/src/mesa/drivers/dri/i965/intel_syncobj.c
new file mode 120000
index 0000000000..0b2e56ab24
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/intel_syncobj.c
@@ -0,0 +1 @@
+../intel/intel_syncobj.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
index 9d9937289a..6aa36d10b1 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
@@ -195,7 +195,12 @@ _intel_batchbuffer_flush(struct intel_batchbuffer *batch, const char *file,
 {
    struct intel_context *intel = batch->intel;
    GLuint used = batch->ptr - batch->map;
-   GLboolean was_locked = intel->locked;
+
+   /* Record (and reference) the current batch buffer if none is recorded. */
+   if (intel->first_post_swapbuffers_batch == NULL) {
+      intel->first_post_swapbuffers_batch = intel->batch->buf;
+      drm_intel_bo_reference(intel->first_post_swapbuffers_batch);
+   }
 
    if (used == 0) {
       batch->cliprect_mode = IGNORE_CLIPRECTS;
@@ -207,7 +212,7 @@ _intel_batchbuffer_flush(struct intel_batchbuffer *batch, const char *file,
 	      used);
 
    /* Emit a flush if the bufmgr doesn't do it for us. */
-   if (!intel->ttm) {
+   if (intel->always_flush_cache || !intel->ttm) {
       *(GLuint *) (batch->ptr) = intel->vtbl.flush_cmd();
       batch->ptr += 4;
       used = batch->ptr - batch->map;
@@ -243,13 +248,9 @@ _intel_batchbuffer_flush(struct intel_batchbuffer *batch, const char *file,
    /* TODO: Just pass the relocation list and dma buffer up to the
     * kernel.
     */
-   if (!was_locked)
-      LOCK_HARDWARE(intel);
-
+   LOCK_HARDWARE(intel);
    do_flush_locked(batch, used, GL_FALSE);
-
-   if (!was_locked)
-      UNLOCK_HARDWARE(intel);
+   UNLOCK_HARDWARE(intel);
 
    if (INTEL_DEBUG & DEBUG_SYNC) {
       fprintf(stderr, "waiting for idle\n");
diff --git a/src/mesa/drivers/dri/intel/intel_blit.c b/src/mesa/drivers/dri/intel/intel_blit.c
index 208f90c0ab..43141c509c 100644
--- a/src/mesa/drivers/dri/intel/intel_blit.c
+++ b/src/mesa/drivers/dri/intel/intel_blit.c
@@ -26,12 +26,11 @@
  **************************************************************************/
 
 
-#include <stdio.h>
-#include <errno.h>
-
 #include "main/mtypes.h"
 #include "main/context.h"
 #include "main/enums.h"
+#include "main/texformat.h"
+#include "main/colormac.h"
 
 #include "intel_blit.h"
 #include "intel_buffers.h"
@@ -98,14 +97,16 @@ intelCopyBuffer(const __DRIdrawablePrivate * dPriv,
       ASSERT(src->cpp == dst->cpp);
 
       if (cpp == 2) {
-	 BR13 = (0xCC << 16) | (1 << 24);
+	 BR13 = (0xCC << 16) | BR13_565;
 	 CMD = XY_SRC_COPY_BLT_CMD;
       }
       else {
-	 BR13 = (0xCC << 16) | (1 << 24) | (1 << 25);
+	 BR13 = (0xCC << 16) | BR13_8888;
 	 CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
       }
 
+      assert(src->tiling != I915_TILING_Y);
+      assert(dst->tiling != I915_TILING_Y);
 #ifndef I915
       if (src->tiling != I915_TILING_NONE) {
 	 CMD |= XY_SRC_TILED;
@@ -173,64 +174,6 @@ intelCopyBuffer(const __DRIdrawablePrivate * dPriv,
    UNLOCK_HARDWARE(intel);
 }
 
-
-
-
-void
-intelEmitFillBlit(struct intel_context *intel,
-		  GLuint cpp,
-		  GLshort dst_pitch,
-		  dri_bo *dst_buffer,
-		  GLuint dst_offset,
-		  uint32_t dst_tiling,
-		  GLshort x, GLshort y,
-		  GLshort w, GLshort h,
-		  GLuint color)
-{
-   GLuint BR13, CMD;
-   BATCH_LOCALS;
-
-   dst_pitch *= cpp;
-
-   switch (cpp) {
-   case 1:
-   case 2:
-   case 3:
-      BR13 = (0xF0 << 16) | (1 << 24);
-      CMD = XY_COLOR_BLT_CMD;
-      break;
-   case 4:
-      BR13 = (0xF0 << 16) | (1 << 24) | (1 << 25);
-      CMD = XY_COLOR_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
-      break;
-   default:
-      return;
-   }
-#ifndef I915
-   if (dst_tiling != I915_TILING_NONE) {
-      CMD |= XY_DST_TILED;
-      dst_pitch /= 4;
-   }
-#endif
-
-   DBG("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
-       __FUNCTION__, dst_buffer, dst_pitch, dst_offset, x, y, w, h);
-
-   assert(w > 0);
-   assert(h > 0);
-
-   BEGIN_BATCH(6, NO_LOOP_CLIPRECTS);
-   OUT_BATCH(CMD);
-   OUT_BATCH(BR13 | dst_pitch);
-   OUT_BATCH((y << 16) | x);
-   OUT_BATCH(((y + h) << 16) | (x + w));
-   OUT_RELOC(dst_buffer,
-	     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-	     dst_offset);
-   OUT_BATCH(color);
-   ADVANCE_BATCH();
-}
-
 static GLuint translate_raster_op(GLenum logicop)
 {
    switch(logicop) {
@@ -257,7 +200,7 @@ static GLuint translate_raster_op(GLenum logicop)
 
 /* Copy BitBlt
  */
-void
+GLboolean
 intelEmitCopyBlit(struct intel_context *intel,
 		  GLuint cpp,
 		  GLshort src_pitch,
@@ -279,6 +222,19 @@ intelEmitCopyBlit(struct intel_context *intel,
    dri_bo *aper_array[3];
    BATCH_LOCALS;
 
+   if (dst_tiling != I915_TILING_NONE) {
+      if (dst_offset & 4095)
+	 return GL_FALSE;
+      if (dst_tiling == I915_TILING_Y)
+	 return GL_FALSE;
+   }
+   if (src_tiling != I915_TILING_NONE) {
+      if (src_offset & 4095)
+	 return GL_FALSE;
+      if (src_tiling == I915_TILING_Y)
+	 return GL_FALSE;
+   }
+
    /* do space/cliprects check before going any further */
    do {
        aper_array[0] = intel->batch->buf;
@@ -293,12 +249,7 @@ intelEmitCopyBlit(struct intel_context *intel,
    } while (pass < 2);
 
    if (pass >= 2) {
-       GLboolean locked = GL_FALSE;       
-       if (!intel->locked) {
-           LOCK_HARDWARE(intel);
-           locked = GL_TRUE;
-       }
-
+       LOCK_HARDWARE(intel);
        dri_bo_map(dst_buffer, GL_TRUE);
        dri_bo_map(src_buffer, GL_FALSE);
        _mesa_copy_rect((GLubyte *)dst_buffer->virtual + dst_offset,
@@ -312,11 +263,9 @@ intelEmitCopyBlit(struct intel_context *intel,
        
        dri_bo_unmap(src_buffer);
        dri_bo_unmap(dst_buffer);
-       
-       if (locked)
-           UNLOCK_HARDWARE(intel);
+       UNLOCK_HARDWARE(intel);
 
-       return;
+       return GL_TRUE;
    }
 
    intel_batchbuffer_require_space(intel->batch, 8 * 4, NO_LOOP_CLIPRECTS);
@@ -332,17 +281,18 @@ intelEmitCopyBlit(struct intel_context *intel,
 
    switch (cpp) {
    case 1:
+      CMD = XY_SRC_COPY_BLT_CMD;
+      break;
    case 2:
-   case 3:
-      BR13 |= (1 << 24);
+      BR13 |= BR13_565;
       CMD = XY_SRC_COPY_BLT_CMD;
       break;
    case 4:
-      BR13 |= (1 << 24) | (1 << 25);
+      BR13 |= BR13_8888;
       CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
       break;
    default:
-      return;
+      return GL_FALSE;
    }
 
 #ifndef I915
@@ -357,7 +307,7 @@ intelEmitCopyBlit(struct intel_context *intel,
 #endif
 
    if (dst_y2 <= dst_y || dst_x2 <= dst_x) {
-      return;
+      return GL_TRUE;
    }
 
    assert(dst_x < dst_x2);
@@ -379,6 +329,8 @@ intelEmitCopyBlit(struct intel_context *intel,
    ADVANCE_BATCH();
 
    intel_batchbuffer_emit_mi_flush(intel->batch);
+
+   return GL_TRUE;
 }
 
 
@@ -481,10 +433,9 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
             const GLbitfield bufBit = 1 << buf;
             if ((clearMask & bufBit) && !(bufBit & skipBuffers)) {
                /* OK, clear this renderbuffer */
-               struct intel_region *irb_region =
-		  intel_get_rb_region(fb, buf);
+	       struct intel_renderbuffer *irb = intel_get_renderbuffer(fb, buf);
                dri_bo *write_buffer =
-                  intel_region_buffer(intel, irb_region,
+                  intel_region_buffer(intel, irb->region,
                                       all ? INTEL_WRITE_FULL :
                                       INTEL_WRITE_PART);
 
@@ -492,15 +443,13 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
                GLint pitch, cpp;
                GLuint BR13, CMD;
 
-               ASSERT(irb_region);
-
-               pitch = irb_region->pitch;
-               cpp = irb_region->cpp;
+               pitch = irb->region->pitch;
+               cpp = irb->region->cpp;
 
                DBG("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
                    __FUNCTION__,
-                   irb_region->buffer, (pitch * cpp),
-                   irb_region->draw_offset,
+                   irb->region->buffer, (pitch * cpp),
+                   irb->region->draw_offset,
                    b.x1, b.y1, b.x2 - b.x1, b.y2 - b.y1);
 
 	       BR13 = 0xf0 << 16;
@@ -508,7 +457,7 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
 
                /* Setup the blit command */
                if (cpp == 4) {
-                  BR13 |= (1 << 24) | (1 << 25);
+                  BR13 |= BR13_8888;
                   if (buf == BUFFER_DEPTH || buf == BUFFER_STENCIL) {
                      if (clearMask & BUFFER_BIT_DEPTH)
                         CMD |= XY_BLT_WRITE_RGB;
@@ -521,12 +470,14 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
                   }
                }
                else {
-                  ASSERT(cpp == 2 || cpp == 0);
-                  BR13 |= (1 << 24);
+                  ASSERT(cpp == 2);
+                  BR13 |= BR13_565;
                }
 
+	       assert(irb->region->tiling != I915_TILING_Y);
+
 #ifndef I915
-	       if (irb_region->tiling != I915_TILING_NONE) {
+	       if (irb->region->tiling != I915_TILING_NONE) {
 		  CMD |= XY_DST_TILED;
 		  pitch /= 4;
 	       }
@@ -537,9 +488,36 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
                   clearVal = clear_depth;
                }
                else {
-                  clearVal = (cpp == 4)
-                     ? intel->ClearColor8888 : intel->ClearColor565;
-               }
+		  uint8_t clear[4];
+		  GLclampf *color = ctx->Color.ClearColor;
+
+		  CLAMPED_FLOAT_TO_UBYTE(clear[0], color[0]);
+		  CLAMPED_FLOAT_TO_UBYTE(clear[1], color[1]);
+		  CLAMPED_FLOAT_TO_UBYTE(clear[2], color[2]);
+		  CLAMPED_FLOAT_TO_UBYTE(clear[3], color[3]);
+
+		  switch (irb->texformat->MesaFormat) {
+		  case MESA_FORMAT_ARGB8888:
+		     clearVal = intel->ClearColor8888;
+		     break;
+		  case MESA_FORMAT_RGB565:
+		     clearVal = intel->ClearColor565;
+		     break;
+		  case MESA_FORMAT_ARGB4444:
+		     clearVal = PACK_COLOR_4444(clear[3], clear[0],
+						clear[1], clear[2]);
+		     break;
+		  case MESA_FORMAT_ARGB1555:
+		     clearVal = PACK_COLOR_1555(clear[3], clear[0],
+						clear[1], clear[2]);
+		     break;
+		  default:
+		     _mesa_problem(ctx, "Unexpected renderbuffer format: %d\n",
+				   irb->texformat->MesaFormat);
+		     clearVal = 0;
+		  }
+	       }
+
                /*
                   _mesa_debug(ctx, "hardware blit clear buf %d rb id %d\n",
                   buf, irb->Base.Name);
@@ -555,20 +533,19 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
                OUT_BATCH((b.y2 << 16) | b.x2);
                OUT_RELOC(write_buffer,
 			 I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                         irb_region->draw_offset);
+                         irb->region->draw_offset);
                OUT_BATCH(clearVal);
                ADVANCE_BATCH();
                clearMask &= ~bufBit;    /* turn off bit, for faster loop exit */
             }
          }
       }
-      intel_batchbuffer_emit_mi_flush(intel->batch);
    }
 
    UNLOCK_HARDWARE(intel);
 }
 
-void
+GLboolean
 intelEmitImmediateColorExpandBlit(struct intel_context *intel,
 				  GLuint cpp,
 				  GLubyte *src_bits, GLuint src_size,
@@ -584,11 +561,19 @@ intelEmitImmediateColorExpandBlit(struct intel_context *intel,
    int dwords = ALIGN(src_size, 8) / 4;
    uint32_t opcode, br13, blit_cmd;
 
+   if (dst_tiling != I915_TILING_NONE) {
+      if (dst_offset & 4095)
+	 return GL_FALSE;
+      if (dst_tiling == I915_TILING_Y)
+	 return GL_FALSE;
+   }
+
    assert( logic_op - GL_CLEAR >= 0 );
    assert( logic_op - GL_CLEAR < 0x10 );
+   assert(dst_pitch > 0);
 
    if (w < 0 || h < 0)
-      return;
+      return GL_TRUE;
 
    dst_pitch *= cpp;
 
@@ -645,4 +630,46 @@ intelEmitImmediateColorExpandBlit(struct intel_context *intel,
 			   REFERENCES_CLIPRECTS );
 
    intel_batchbuffer_emit_mi_flush(intel->batch);
+
+   return GL_TRUE;
+}
+
+/* Copy size bytes between BOs.  We don't have a memmove-type blit like some
+ * other hardware, so we do a rectangular blit covering as many full rows as
+ * fit, then a 1-scanline blit for the remainder.  size == 0 emits nothing.
+ */
+void
+intel_emit_linear_blit(struct intel_context *intel,
+		       drm_intel_bo *dst_bo, unsigned int dst_offset,
+		       drm_intel_bo *src_bo, unsigned int src_offset,
+		       unsigned int size)
+{
+   GLuint pitch, height;
+
+   /* pitch would be 0 below and size / pitch would divide by zero. */
+   if (size == 0)
+      return;
+
+   /* The pitch is a signed value. */
+   pitch = MIN2(size, (1 << 15) - 1);
+   height = size / pitch;
+   intelEmitCopyBlit(intel, 1,
+		     pitch, src_bo, src_offset, I915_TILING_NONE,
+		     pitch, dst_bo, dst_offset, I915_TILING_NONE,
+		     0, 0, 0, 0, /* src x/y, dst x/y */
+		     pitch, height, /* w, h */
+		     GL_COPY);
+
+   src_offset += pitch * height;
+   dst_offset += pitch * height;
+   size -= pitch * height;
+   assert (size < (1 << 15));
+   if (size != 0) {
+      intelEmitCopyBlit(intel, 1,
+			size, src_bo, src_offset, I915_TILING_NONE,
+			size, dst_bo, dst_offset, I915_TILING_NONE,
+			0, 0, 0, 0, /* src x/y, dst x/y */
+			size, 1, /* w, h */
+			GL_COPY);
+   }
 }
diff --git a/src/mesa/drivers/dri/intel/intel_blit.h b/src/mesa/drivers/dri/intel/intel_blit.h
index 52065b13ed..240cb7cd1b 100644
--- a/src/mesa/drivers/dri/intel/intel_blit.h
+++ b/src/mesa/drivers/dri/intel/intel_blit.h
@@ -35,7 +35,8 @@ extern void intelCopyBuffer(const __DRIdrawablePrivate * dpriv,
 
 extern void intelClearWithBlit(GLcontext * ctx, GLbitfield mask);
 
-extern void intelEmitCopyBlit(struct intel_context *intel,
+GLboolean
+intelEmitCopyBlit(struct intel_context *intel,
                               GLuint cpp,
                               GLshort src_pitch,
                               dri_bo *src_buffer,
@@ -50,16 +51,7 @@ extern void intelEmitCopyBlit(struct intel_context *intel,
                               GLshort w, GLshort h,
 			      GLenum logicop );
 
-extern void intelEmitFillBlit(struct intel_context *intel,
-                              GLuint cpp,
-                              GLshort dst_pitch,
-                              dri_bo *dst_buffer,
-                              GLuint dst_offset,
-			      uint32_t dst_tiling,
-                              GLshort x, GLshort y,
-                              GLshort w, GLshort h, GLuint color);
-
-void
+GLboolean
 intelEmitImmediateColorExpandBlit(struct intel_context *intel,
 				  GLuint cpp,
 				  GLubyte *src_bits, GLuint src_size,
@@ -71,5 +63,11 @@ intelEmitImmediateColorExpandBlit(struct intel_context *intel,
 				  GLshort x, GLshort y,
 				  GLshort w, GLshort h,
 				  GLenum logic_op);
+void intel_emit_linear_blit(struct intel_context *intel,
+			    drm_intel_bo *dst_bo,
+			    unsigned int dst_offset,
+			    drm_intel_bo *src_bo,
+			    unsigned int src_offset,
+			    unsigned int size);
 
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_buffer_objects.c b/src/mesa/drivers/dri/intel/intel_buffer_objects.c
index 60d7bb3770..c55c5c426e 100644
--- a/src/mesa/drivers/dri/intel/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/intel/intel_buffer_objects.c
@@ -28,16 +28,18 @@
 
 #include "main/imports.h"
 #include "main/mtypes.h"
+#include "main/macros.h"
 #include "main/bufferobj.h"
 
 #include "intel_context.h"
+#include "intel_blit.h"
 #include "intel_buffer_objects.h"
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
 
-static GLboolean intel_bufferobj_unmap(GLcontext * ctx,
-				       GLenum target,
-				       struct gl_buffer_object *obj);
+static GLboolean
+intel_bufferobj_unmap(GLcontext * ctx,
+                      GLenum target, struct gl_buffer_object *obj);
 
 /** Allocates a new dri_bo to store the data for the buffer object. */
 static void
@@ -105,11 +107,13 @@ intel_bufferobj_free(GLcontext * ctx, struct gl_buffer_object *obj)
    assert(intel_obj);
 
    /* Buffer objects are automatically unmapped when deleting according
-    * to the spec.
+    * to the spec, but Mesa doesn't do UnmapBuffer for us at context destroy
+    * (though it does if you call glDeleteBuffers)
     */
    if (obj->Pointer)
       intel_bufferobj_unmap(ctx, 0, obj);
 
+   _mesa_free(intel_obj->sys_buffer);
    if (intel_obj->region) {
       intel_bufferobj_release_region(intel, intel_obj);
    }
@@ -126,9 +130,10 @@ intel_bufferobj_free(GLcontext * ctx, struct gl_buffer_object *obj)
  * Allocate space for and store data in a buffer object.  Any data that was
  * previously stored in the buffer object is lost.  If data is NULL,
  * memory will be allocated, but no copy will occur.
- * Called via glBufferDataARB().
+ * Called via ctx->Driver.BufferData().
+ * \return GL_TRUE for success, GL_FALSE if out of memory
  */
-static void
+static GLboolean
 intel_bufferobj_data(GLcontext * ctx,
                      GLenum target,
                      GLsizeiptrARB size,
@@ -141,11 +146,7 @@ intel_bufferobj_data(GLcontext * ctx,
    intel_obj->Base.Size = size;
    intel_obj->Base.Usage = usage;
 
-   /* Buffer objects are automatically unmapped when creating new data buffers
-    * according to the spec.
-    */
-   if (obj->Pointer)
-      intel_bufferobj_unmap(ctx, 0, obj);
+   assert(!obj->Pointer); /* Mesa should have unmapped it */
 
    if (intel_obj->region)
       intel_bufferobj_release_region(intel, intel_obj);
@@ -154,12 +155,32 @@ intel_bufferobj_data(GLcontext * ctx,
       dri_bo_unreference(intel_obj->buffer);
       intel_obj->buffer = NULL;
    }
+   _mesa_free(intel_obj->sys_buffer);
+   intel_obj->sys_buffer = NULL;
+
    if (size != 0) {
+#ifdef I915
+      /* On pre-965, stick VBOs in system memory, as we're always doing swtnl
+       * with their contents anyway.
+       */
+      if (target == GL_ARRAY_BUFFER || target == GL_ELEMENT_ARRAY_BUFFER) {
+	 intel_obj->sys_buffer = _mesa_malloc(size);
+	 if (intel_obj->sys_buffer != NULL) {
+	    if (data != NULL)
+	       memcpy(intel_obj->sys_buffer, data, size);
+	    return GL_TRUE;
+	 }
+      }
+#endif
       intel_bufferobj_alloc_buffer(intel, intel_obj);
+      if (!intel_obj->buffer)
+         return GL_FALSE;
 
       if (data != NULL)
 	 dri_bo_subdata(intel_obj->buffer, 0, size, data);
    }
+
+   return GL_TRUE;
 }
 
 
@@ -184,7 +205,10 @@ intel_bufferobj_subdata(GLcontext * ctx,
    if (intel_obj->region)
       intel_bufferobj_cow(intel, intel_obj);
 
-   dri_bo_subdata(intel_obj->buffer, offset, size, data);
+   if (intel_obj->sys_buffer)
+      memcpy((char *)intel_obj->sys_buffer + offset, data, size);
+   else
+      dri_bo_subdata(intel_obj->buffer, offset, size, data);
 }
 
 
@@ -216,11 +240,21 @@ intel_bufferobj_map(GLcontext * ctx,
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
+   GLboolean read_only = (access == GL_READ_ONLY_ARB);
+   GLboolean write_only = (access == GL_WRITE_ONLY_ARB);
 
-   /* XXX: Translate access to flags arg below:
-    */
    assert(intel_obj);
 
+   if (intel_obj->sys_buffer) {
+      obj->Pointer = intel_obj->sys_buffer;
+      return obj->Pointer;
+   }
+
+   /* Flush any existing batchbuffer that might have written to this
+    * buffer.
+    */
+   intelFlush(ctx);
+
    if (intel_obj->region)
       intel_bufferobj_cow(intel, intel_obj);
 
@@ -229,27 +263,214 @@ intel_bufferobj_map(GLcontext * ctx,
       return NULL;
    }
 
-   dri_bo_map(intel_obj->buffer, GL_TRUE);
+   if (write_only && intel->intelScreen->kernel_exec_fencing) {
+      drm_intel_gem_bo_map_gtt(intel_obj->buffer);
+      intel_obj->mapped_gtt = GL_TRUE;
+   } else {
+      drm_intel_bo_map(intel_obj->buffer, !read_only);
+      intel_obj->mapped_gtt = GL_FALSE;
+   }
+
    obj->Pointer = intel_obj->buffer->virtual;
+   obj->Length = obj->Size;
+   obj->Offset = 0;
+
+   return obj->Pointer;
+}
+
+/**
+ * Called via ctx->Driver.MapBufferRange().
+ *
+ * The goal of this extension is to allow apps to accumulate their rendering
+ * at the same time as they accumulate their buffer object.  Without it,
+ * you'd end up blocking on execution of rendering every time you mapped
+ * the buffer to put new data in.
+ *
+ * We support it in 3 ways: if unsynchronized, skip the batchbuffer flush
+ * before mapping, which can save blocking in many cases.  If synchronized,
+ * the whole buffer may be invalidated and the BO is busy, replace the BO.
+ * If the subrange may be invalidated and the BO is busy, hand out temporary
+ * storage and blit it into the real BO at flush or unmap time.
+ *
+ * \return the mapped pointer (also stored in obj->Pointer), or NULL.
+ */
+static void *
+intel_bufferobj_map_range(GLcontext * ctx,
+			  GLenum target, GLintptr offset, GLsizeiptr length,
+			  GLbitfield access, struct gl_buffer_object *obj)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
+
+   assert(intel_obj);
+
+   /* _mesa_MapBufferRange (GL entrypoint) sets these, but the vbo module also
+    * internally uses our functions directly.
+    */
+   obj->Offset = offset;
+   obj->Length = length;
+   obj->AccessFlags = access;
+
+   if (intel_obj->sys_buffer) {
+      obj->Pointer = intel_obj->sys_buffer + offset;
+      return obj->Pointer;
+   }
+
+   if (intel_obj->region)
+      intel_bufferobj_cow(intel, intel_obj);
+
+   /* If the mapping is synchronized with other GL operations, flush
+    * the batchbuffer so that GEM knows about the buffer access for later
+    * syncing.
+    */
+   if (!(access & GL_MAP_UNSYNCHRONIZED_BIT))
+      intelFlush(ctx);
+
+   if (intel_obj->buffer == NULL) {
+      obj->Pointer = NULL;
+      return NULL;
+   }
+
+   /* If the user doesn't care about existing buffer contents and mapping
+    * would cause us to block, then throw out the old buffer.
+    */
+   if (!(access & GL_MAP_UNSYNCHRONIZED_BIT) &&
+       (access & GL_MAP_INVALIDATE_BUFFER_BIT) &&
+       drm_intel_bo_busy(intel_obj->buffer)) {
+      drm_intel_bo_unreference(intel_obj->buffer);
+      intel_obj->buffer = dri_bo_alloc(intel->bufmgr, "bufferobj",
+				       intel_obj->Base.Size, 64);
+   }
+
+   /* If the user is mapping a range of an active buffer object but
+    * doesn't require the current contents of that range, hand out
+    * temporary storage instead: malloc'ed memory that is copied out at
+    * FlushMappedBufferRange time, or a new BO that is blitted at unmap.
+    */
+   if ((access & GL_MAP_INVALIDATE_RANGE_BIT) &&
+       drm_intel_bo_busy(intel_obj->buffer)) {
+      if (access & GL_MAP_FLUSH_EXPLICIT_BIT) {
+	 intel_obj->range_map_buffer = _mesa_malloc(length);
+	 obj->Pointer = intel_obj->range_map_buffer;
+      } else {
+	 intel_obj->range_map_bo = drm_intel_bo_alloc(intel->bufmgr,
+						      "range map",
+						      length, 64);
+	 if (!(access & GL_MAP_READ_BIT) &&
+	     intel->intelScreen->kernel_exec_fencing) {
+	    drm_intel_gem_bo_map_gtt(intel_obj->range_map_bo);
+	    intel_obj->mapped_gtt = GL_TRUE;
+	 } else {
+	    drm_intel_bo_map(intel_obj->range_map_bo,
+			     (access & GL_MAP_WRITE_BIT) != 0);
+	    intel_obj->mapped_gtt = GL_FALSE;
+	 }
+	 obj->Pointer = intel_obj->range_map_bo->virtual;
+      }
+      return obj->Pointer;
+   }
+
+   if (!(access & GL_MAP_READ_BIT) &&
+       intel->intelScreen->kernel_exec_fencing) {
+      drm_intel_gem_bo_map_gtt(intel_obj->buffer);
+      intel_obj->mapped_gtt = GL_TRUE;
+   } else {
+      drm_intel_bo_map(intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0);
+      intel_obj->mapped_gtt = GL_FALSE;
+   }
+
+   obj->Pointer = intel_obj->buffer->virtual + offset;
    return obj->Pointer;
 }
 
+/* Called via ctx->Driver.FlushMappedBufferRange().  Blits length bytes at
+ * offset (relative to the mapped range) from the malloc'ed range-map memory
+ * into the real BO.  A BO would avoid using cache space for the temporary
+ * data, but the app may keep writing to the pointer after this call, and
+ * re-mapping after the blit would defeat the point.
+ */
+static void
+intel_bufferobj_flush_mapped_range(GLcontext *ctx, GLenum target,
+				   GLintptr offset, GLsizeiptr length,
+				   struct gl_buffer_object *obj)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
+   drm_intel_bo *temp_bo;
+
+   /* Nothing to do unless a temporary system buffer has data to flush. */
+   if (intel_obj->range_map_buffer == NULL || length == 0)
+      return;
+
+   temp_bo = drm_intel_bo_alloc(intel->bufmgr, "range map flush", length, 64);
+
+   /* offset indexes the temporary buffer directly; the destination adds
+    * it to the start of the mapped range (obj->Offset). */
+   drm_intel_bo_subdata(temp_bo, 0, length,
+			(char *) intel_obj->range_map_buffer + offset);
+
+   intel_emit_linear_blit(intel, intel_obj->buffer, obj->Offset + offset,
+			  temp_bo, 0, length);
+
+   drm_intel_bo_unreference(temp_bo);
+}
+
 
 /**
- * Called via glMapBufferARB().
+ * Called via ctx->Driver.UnmapBuffer(); releases whichever mapping is live.
  */
 static GLboolean
 intel_bufferobj_unmap(GLcontext * ctx,
                       GLenum target, struct gl_buffer_object *obj)
 {
+   struct intel_context *intel = intel_context(ctx);
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
 
    assert(intel_obj);
-   if (intel_obj->buffer != NULL) {
-      assert(obj->Pointer);
-      dri_bo_unmap(intel_obj->buffer);
-      obj->Pointer = NULL;
+   assert(obj->Pointer);
+   if (intel_obj->sys_buffer != NULL) {
+      /* System memory stays allocated; only obj->Pointer is cleared below. */
+   } else if (intel_obj->range_map_buffer != NULL) {
+      /* Any blits emitted by FlushMappedBufferRange target buffers that
+       * will (likely) be used in rendering operations in other cache
+       * domains in this batch, so emit a flush.  Once again, we wish for a
+       * domain tracker in libdrm to cover usage inside of a batchbuffer.
+       */
+      intel_batchbuffer_emit_mi_flush(intel->batch);
+      free(intel_obj->range_map_buffer);
+      intel_obj->range_map_buffer = NULL;
+   } else if (intel_obj->range_map_bo != NULL) {
+      if (intel_obj->mapped_gtt) {
+	 drm_intel_gem_bo_unmap_gtt(intel_obj->range_map_bo);
+      } else {
+	 drm_intel_bo_unmap(intel_obj->range_map_bo);
+      }
+
+      intel_emit_linear_blit(intel,
+			     intel_obj->buffer, obj->Offset,
+			     intel_obj->range_map_bo, 0,
+			     obj->Length);
+
+      /* Since we've emitted some blits to buffers that will (likely) be used
+       * in rendering operations in other cache domains in this batch, emit a
+       * flush.  Once again, we wish for a domain tracker in libdrm to cover
+       * usage inside of a batchbuffer.
+       */
+      intel_batchbuffer_emit_mi_flush(intel->batch);
+
+      drm_intel_bo_unreference(intel_obj->range_map_bo);
+      intel_obj->range_map_bo = NULL;
+   } else if (intel_obj->buffer != NULL) {
+      if (intel_obj->mapped_gtt) {
+	 drm_intel_gem_bo_unmap_gtt(intel_obj->buffer);
+      } else {
+	 drm_intel_bo_unmap(intel_obj->buffer);
+      }
    }
+   obj->Pointer = NULL;
+   obj->Offset = 0;
+   obj->Length = 0;
+
    return GL_TRUE;
 }
 
@@ -266,19 +487,95 @@ intel_bufferobj_buffer(struct intel_context *intel,
       }
    }
 
+   if (intel_obj->buffer == NULL) {
+      void *sys_buffer = intel_obj->sys_buffer;
+
+      /* only one of buffer and sys_buffer could be non-NULL */
+      intel_bufferobj_alloc_buffer(intel, intel_obj);
+      intel_obj->sys_buffer = NULL;
+
+      intel_bufferobj_subdata(&intel->ctx,
+			      GL_ARRAY_BUFFER_ARB,
+			      0,
+			      intel_obj->Base.Size,
+			      sys_buffer,
+			      &intel_obj->Base);
+      _mesa_free(sys_buffer);
+      intel_obj->sys_buffer = NULL;
+   }
+
    return intel_obj->buffer;
 }
 
+/**
+ * Called via ctx->Driver.CopyBufferSubData(): copy size bytes from src at
+ * read_offset to dst at write_offset.  If either object lives in system
+ * memory the copy is a mapped memcpy; otherwise the two BOs are blitted.
+ */
+static void
+intel_bufferobj_copy_subdata(GLcontext *ctx,
+			     struct gl_buffer_object *src,
+			     struct gl_buffer_object *dst,
+			     GLintptr read_offset, GLintptr write_offset,
+			     GLsizeiptr size)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_buffer_object *intel_src = intel_buffer_object(src);
+   struct intel_buffer_object *intel_dst = intel_buffer_object(dst);
+   drm_intel_bo *src_bo, *dst_bo;
+
+   if (size == 0)
+      return;
+
+   /* If we're in system memory, just map and memcpy. */
+   if (intel_src->sys_buffer || intel_dst->sys_buffer) {
+      /* The same buffer may be used, but note that regions copied may
+       * not overlap.
+       */
+      if (src == dst) {
+	 char *ptr = intel_bufferobj_map(ctx, GL_COPY_WRITE_BUFFER,
+					 GL_READ_WRITE, dst);
+	 memcpy(ptr + write_offset, ptr + read_offset, size);
+	 intel_bufferobj_unmap(ctx, GL_COPY_WRITE_BUFFER, dst);
+      } else {
+	 const char *src_ptr = intel_bufferobj_map(ctx, GL_COPY_READ_BUFFER,
+						   GL_READ_ONLY, src);
+	 char *dst_ptr = intel_bufferobj_map(ctx, GL_COPY_WRITE_BUFFER,
+					     GL_WRITE_ONLY, dst);
+
+	 memcpy(dst_ptr + write_offset, src_ptr + read_offset, size);
+
+	 intel_bufferobj_unmap(ctx, GL_COPY_READ_BUFFER, src);
+	 intel_bufferobj_unmap(ctx, GL_COPY_WRITE_BUFFER, dst);
+      }
+      /* The copy is done; falling through would blit it a second time. */
+      return;
+   }
+
+   /* Otherwise, we have real BOs, so blit them. */
+   dst_bo = intel_bufferobj_buffer(intel, intel_dst, INTEL_WRITE_PART);
+   src_bo = intel_bufferobj_buffer(intel, intel_src, INTEL_READ);
+
+   intel_emit_linear_blit(intel, dst_bo, write_offset,
+			  src_bo, read_offset, size);
+
+   /* The blit target will (likely) be used by rendering in other cache
+    * domains in this batch, so emit a flush.
+    */
+   intel_batchbuffer_emit_mi_flush(intel->batch);
+}
+
 void
-intel_bufferobj_init(struct intel_context *intel)
+intelInitBufferObjectFuncs(struct dd_function_table *functions)
 {
-   GLcontext *ctx = &intel->ctx;
-
-   ctx->Driver.NewBufferObject = intel_bufferobj_alloc;
-   ctx->Driver.DeleteBuffer = intel_bufferobj_free;
-   ctx->Driver.BufferData = intel_bufferobj_data;
-   ctx->Driver.BufferSubData = intel_bufferobj_subdata;
-   ctx->Driver.GetBufferSubData = intel_bufferobj_get_subdata;
-   ctx->Driver.MapBuffer = intel_bufferobj_map;
-   ctx->Driver.UnmapBuffer = intel_bufferobj_unmap;
+   functions->NewBufferObject = intel_bufferobj_alloc;
+   functions->DeleteBuffer = intel_bufferobj_free;
+   functions->BufferData = intel_bufferobj_data;
+   functions->BufferSubData = intel_bufferobj_subdata;
+   functions->GetBufferSubData = intel_bufferobj_get_subdata;
+   functions->MapBuffer = intel_bufferobj_map;
+   functions->MapBufferRange = intel_bufferobj_map_range;
+   functions->FlushMappedBufferRange = intel_bufferobj_flush_mapped_range;
+   functions->UnmapBuffer = intel_bufferobj_unmap;
+   functions->CopyBufferSubData = intel_bufferobj_copy_subdata;
 }
diff --git a/src/mesa/drivers/dri/intel/intel_buffer_objects.h b/src/mesa/drivers/dri/intel/intel_buffer_objects.h
index bf6dbd58f2..bf3e08a320 100644
--- a/src/mesa/drivers/dri/intel/intel_buffer_objects.h
+++ b/src/mesa/drivers/dri/intel/intel_buffer_objects.h
@@ -42,10 +42,19 @@ struct intel_buffer_object
 {
    struct gl_buffer_object Base;
    dri_bo *buffer;     /* the low-level buffer manager's buffer handle */
+   /** System memory buffer data, if not using a BO to store the data. */
+   void *sys_buffer;
 
    struct intel_region *region; /* Is there a zero-copy texture
                                    associated with this (pixel)
                                    buffer object? */
+
+   drm_intel_bo *range_map_bo;    /* temporary BO handed out by MapBufferRange */
+   void *range_map_buffer;        /* malloc'ed temporary for explicit-flush maps */
+   unsigned int range_map_offset; /* NOTE(review): unused in this patch? confirm */
+   GLsizei range_map_size;        /* NOTE(review): unused in this patch? confirm */
+
+   GLboolean mapped_gtt;          /* live map used drm_intel_gem_bo_map_gtt() */
 };
 
 
@@ -57,7 +66,7 @@ dri_bo *intel_bufferobj_buffer(struct intel_context *intel,
 
 /* Hook the bufferobject implementation into mesa: 
  */
-void intel_bufferobj_init(struct intel_context *intel);
+void intelInitBufferObjectFuncs(struct dd_function_table *functions);
 
 
 
@@ -69,10 +78,7 @@ void intel_bufferobj_init(struct intel_context *intel);
 static INLINE struct intel_buffer_object *
 intel_buffer_object(struct gl_buffer_object *obj)
 {
-   if (obj->Name)
-      return (struct intel_buffer_object *) obj;
-   else
-      return NULL;
+   return (struct intel_buffer_object *) obj;
 }
 
 /* Helpers for zerocopy image uploads.  See also intel_regions.h:
diff --git a/src/mesa/drivers/dri/intel/intel_buffers.c b/src/mesa/drivers/dri/intel/intel_buffers.c
index 0fd2f16a8f..e7357e78c5 100644
--- a/src/mesa/drivers/dri/intel/intel_buffers.c
+++ b/src/mesa/drivers/dri/intel/intel_buffers.c
@@ -25,25 +25,14 @@
  * 
  **************************************************************************/
 
-#include "intel_screen.h"
 #include "intel_context.h"
-#include "intel_blit.h"
 #include "intel_buffers.h"
-#include "intel_chipset.h"
-#include "intel_depthstencil.h"
 #include "intel_fbo.h"
 #include "intel_regions.h"
 #include "intel_batchbuffer.h"
-#include "intel_reg.h"
-#include "main/context.h"
 #include "main/framebuffer.h"
-#include "swrast/swrast.h"
-#include "utils.h"
 #include "drirenderbuffer.h"
-#include "vblank.h"
-#include "i915_drm.h"
 
-#define FILE_DEBUG_FLAG DEBUG_BLIT
 
 /**
  * XXX move this into a new dri/common/cliprects.c file.
@@ -114,7 +103,6 @@ intel_get_cliprects(struct intel_context *intel,
 		    int *x_off, int *y_off)
 {
    __DRIdrawablePrivate *dPriv = intel->driDrawable;
-   struct intel_framebuffer *intel_fb = dPriv->driverPrivate;
 
    if (intel->constant_cliprect) {
       /* FBO or DRI2 rendering, which can just use the fb's size. */
@@ -143,399 +131,6 @@ intel_get_cliprects(struct intel_context *intel,
    }
 }
 
-/**
- * This will be called whenever the currently bound window is moved/resized.
- * XXX: actually, it seems to NOT be called when the window is only moved (BP).
- */
-void
-intelWindowMoved(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-   __DRIdrawablePrivate *dPriv = intel->driDrawable;
-   struct intel_framebuffer *intel_fb = dPriv->driverPrivate;
-
-   if (!intel->intelScreen->driScrnPriv->dri2.enabled &&
-       intel->intelScreen->driScrnPriv->ddx_version.minor >= 7) {
-      volatile drm_i915_sarea_t *sarea = intel->sarea;
-      drm_clip_rect_t drw_rect = { .x1 = dPriv->x, .x2 = dPriv->x + dPriv->w,
-				   .y1 = dPriv->y, .y2 = dPriv->y + dPriv->h };
-      drm_clip_rect_t planeA_rect = { .x1 = sarea->planeA_x, .y1 = sarea->planeA_y,
-				     .x2 = sarea->planeA_x + sarea->planeA_w,
-				     .y2 = sarea->planeA_y + sarea->planeA_h };
-      drm_clip_rect_t planeB_rect = { .x1 = sarea->planeB_x, .y1 = sarea->planeB_y,
-				     .x2 = sarea->planeB_x + sarea->planeB_w,
-				     .y2 = sarea->planeB_y + sarea->planeB_h };
-      GLint areaA = driIntersectArea( drw_rect, planeA_rect );
-      GLint areaB = driIntersectArea( drw_rect, planeB_rect );
-      GLuint flags = dPriv->vblFlags;
-
-      /* Update vblank info
-       */
-      if (areaB > areaA || (areaA == areaB && areaB > 0)) {
-	 flags = dPriv->vblFlags | VBLANK_FLAG_SECONDARY;
-      } else {
-	 flags = dPriv->vblFlags & ~VBLANK_FLAG_SECONDARY;
-      }
-
-      /* Check to see if we changed pipes */
-      if (flags != dPriv->vblFlags && dPriv->vblFlags &&
-	  !(dPriv->vblFlags & VBLANK_FLAG_NO_IRQ)) {
-	 int64_t count;
-	 drmVBlank vbl;
-	 int i;
-
-	 /*
-	  * Deal with page flipping
-	  */
-	 vbl.request.type = DRM_VBLANK_ABSOLUTE;
-
-	 if ( dPriv->vblFlags & VBLANK_FLAG_SECONDARY ) {
-	    vbl.request.type |= DRM_VBLANK_SECONDARY;
-	 }
-
-	 for (i = 0; i < 2; i++) {
-	    if (!intel_fb->color_rb[i] ||
-		(intel_fb->vbl_waited - intel_fb->color_rb[i]->vbl_pending) <=
-		(1<<23))
-	       continue;
-
-	    vbl.request.sequence = intel_fb->color_rb[i]->vbl_pending;
-	    drmWaitVBlank(intel->driFd, &vbl);
-	 }
-
-	 /*
-	  * Update msc_base from old pipe
-	  */
-	 driDrawableGetMSC32(dPriv->driScreenPriv, dPriv, &count);
-	 dPriv->msc_base = count;
-	 /*
-	  * Then get new vblank_base and vblSeq values
-	  */
-	 dPriv->vblFlags = flags;
-	 driGetCurrentVBlank(dPriv);
-	 dPriv->vblank_base = dPriv->vblSeq;
-
-	 intel_fb->vbl_waited = dPriv->vblSeq;
-
-	 for (i = 0; i < 2; i++) {
-	    if (intel_fb->color_rb[i])
-	       intel_fb->color_rb[i]->vbl_pending = intel_fb->vbl_waited;
-	 }
-      }
-   } else {
-      dPriv->vblFlags &= ~VBLANK_FLAG_SECONDARY;
-   }
-
-   /* Update Mesa's notion of window size */
-   driUpdateFramebufferSize(ctx, dPriv);
-   intel_fb->Base.Initialized = GL_TRUE; /* XXX remove someday */
-
-   /* Update hardware scissor */
-   if (ctx->Driver.Scissor != NULL) {
-      ctx->Driver.Scissor(ctx, ctx->Scissor.X, ctx->Scissor.Y,
-			  ctx->Scissor.Width, ctx->Scissor.Height);
-   }
-
-   /* Re-calculate viewport related state */
-   if (ctx->Driver.DepthRange != NULL)
-      ctx->Driver.DepthRange( ctx, ctx->Viewport.Near, ctx->Viewport.Far );
-}
-
-
-
-/* A true meta version of this would be very simple and additionally
- * machine independent.  Maybe we'll get there one day.
- */
-static void
-intelClearWithTris(struct intel_context *intel, GLbitfield mask)
-{
-   GLcontext *ctx = &intel->ctx;
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   GLuint buf;
-
-   intel->vtbl.install_meta_state(intel);
-
-   /* Back and stencil cliprects are the same.  Try and do both
-    * buffers at once:
-    */
-   if (mask & (BUFFER_BIT_BACK_LEFT | BUFFER_BIT_STENCIL | BUFFER_BIT_DEPTH)) {
-      struct intel_region *backRegion =
-	 intel_get_rb_region(fb, BUFFER_BACK_LEFT);
-      struct intel_region *depthRegion =
-	 intel_get_rb_region(fb, BUFFER_DEPTH);
-
-      intel->vtbl.meta_draw_region(intel, backRegion, depthRegion);
-
-      if (mask & BUFFER_BIT_BACK_LEFT)
-	 intel->vtbl.meta_color_mask(intel, GL_TRUE);
-      else
-	 intel->vtbl.meta_color_mask(intel, GL_FALSE);
-
-      if (mask & BUFFER_BIT_STENCIL)
-	 intel->vtbl.meta_stencil_replace(intel,
-					  intel->ctx.Stencil.WriteMask[0],
-					  intel->ctx.Stencil.Clear);
-      else
-	 intel->vtbl.meta_no_stencil_write(intel);
-
-      if (mask & BUFFER_BIT_DEPTH)
-	 intel->vtbl.meta_depth_replace(intel);
-      else
-	 intel->vtbl.meta_no_depth_write(intel);
-
-      intel->vtbl.meta_draw_quad(intel,
-				 fb->_Xmin,
-				 fb->_Xmax,
-				 fb->_Ymin,
-				 fb->_Ymax,
-				 intel->ctx.Depth.Clear,
-				 intel->ClearColor8888,
-				 0, 0, 0, 0);   /* texcoords */
-
-      mask &= ~(BUFFER_BIT_BACK_LEFT | BUFFER_BIT_STENCIL | BUFFER_BIT_DEPTH);
-   }
-
-   /* clear the remaining (color) renderbuffers */
-   for (buf = 0; buf < BUFFER_COUNT && mask; buf++) {
-      const GLuint bufBit = 1 << buf;
-      if (mask & bufBit) {
-	 struct intel_renderbuffer *irbColor =
-	    intel_renderbuffer(fb->Attachment[buf].Renderbuffer);
-
-	 ASSERT(irbColor);
-
-	 intel->vtbl.meta_no_depth_write(intel);
-	 intel->vtbl.meta_no_stencil_write(intel);
-	 intel->vtbl.meta_color_mask(intel, GL_TRUE);
-	 intel->vtbl.meta_draw_region(intel, irbColor->region, NULL);
-
-	 intel->vtbl.meta_draw_quad(intel,
-				    fb->_Xmin,
-				    fb->_Xmax,
-				    fb->_Ymin,
-				    fb->_Ymax,
-				    0, intel->ClearColor8888,
-				    0, 0, 0, 0);   /* texcoords */
-
-	 mask &= ~bufBit;
-      }
-   }
-
-   intel->vtbl.leave_meta_state(intel);
-}
-
-static const char *buffer_names[] = {
-   [BUFFER_FRONT_LEFT] = "front",
-   [BUFFER_BACK_LEFT] = "back",
-   [BUFFER_FRONT_RIGHT] = "front right",
-   [BUFFER_BACK_RIGHT] = "back right",
-   [BUFFER_AUX0] = "aux0",
-   [BUFFER_AUX1] = "aux1",
-   [BUFFER_AUX2] = "aux2",
-   [BUFFER_AUX3] = "aux3",
-   [BUFFER_DEPTH] = "depth",
-   [BUFFER_STENCIL] = "stencil",
-   [BUFFER_ACCUM] = "accum",
-   [BUFFER_COLOR0] = "color0",
-   [BUFFER_COLOR1] = "color1",
-   [BUFFER_COLOR2] = "color2",
-   [BUFFER_COLOR3] = "color3",
-   [BUFFER_COLOR4] = "color4",
-   [BUFFER_COLOR5] = "color5",
-   [BUFFER_COLOR6] = "color6",
-   [BUFFER_COLOR7] = "color7",
-};
-
-/**
- * Called by ctx->Driver.Clear.
- */
-static void
-intelClear(GLcontext *ctx, GLbitfield mask)
-{
-   struct intel_context *intel = intel_context(ctx);
-   const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask);
-   GLbitfield tri_mask = 0;
-   GLbitfield blit_mask = 0;
-   GLbitfield swrast_mask = 0;
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   GLuint i;
-
-   if (0)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   /* HW color buffers (front, back, aux, generic FBO, etc) */
-   if (colorMask == ~0) {
-      /* clear all R,G,B,A */
-      /* XXX FBO: need to check if colorbuffers are software RBOs! */
-      blit_mask |= (mask & BUFFER_BITS_COLOR);
-   }
-   else {
-      /* glColorMask in effect */
-      tri_mask |= (mask & BUFFER_BITS_COLOR);
-   }
-
-   /* HW stencil */
-   if (mask & BUFFER_BIT_STENCIL) {
-      const struct intel_region *stencilRegion
-         = intel_get_rb_region(fb, BUFFER_STENCIL);
-      if (stencilRegion) {
-         /* have hw stencil */
-         if (IS_965(intel->intelScreen->deviceID) ||
-	     (ctx->Stencil.WriteMask[0] & 0xff) != 0xff) {
-	    /* We have to use the 3D engine if we're clearing a partial mask
-	     * of the stencil buffer, or if we're on a 965 which has a tiled
-	     * depth/stencil buffer in a layout we can't blit to.
-	     */
-            tri_mask |= BUFFER_BIT_STENCIL;
-         }
-         else {
-            /* clearing all stencil bits, use blitting */
-            blit_mask |= BUFFER_BIT_STENCIL;
-         }
-      }
-   }
-
-   /* HW depth */
-   if (mask & BUFFER_BIT_DEPTH) {
-      /* clear depth with whatever method is used for stencil (see above) */
-      if (IS_965(intel->intelScreen->deviceID) ||
-	  tri_mask & BUFFER_BIT_STENCIL)
-         tri_mask |= BUFFER_BIT_DEPTH;
-      else
-         blit_mask |= BUFFER_BIT_DEPTH;
-   }
-
-   /* SW fallback clearing */
-   swrast_mask = mask & ~tri_mask & ~blit_mask;
-
-   for (i = 0; i < BUFFER_COUNT; i++) {
-      GLuint bufBit = 1 << i;
-      if ((blit_mask | tri_mask) & bufBit) {
-         if (!fb->Attachment[i].Renderbuffer->ClassID) {
-            blit_mask &= ~bufBit;
-            tri_mask &= ~bufBit;
-            swrast_mask |= bufBit;
-         }
-      }
-   }
-
-   if (blit_mask) {
-      if (INTEL_DEBUG & DEBUG_BLIT) {
-	 DBG("blit clear:");
-	 for (i = 0; i < BUFFER_COUNT; i++) {
-	    if (blit_mask & (1 << i))
-	       DBG(" %s", buffer_names[i]);
-	 }
-	 DBG("\n");
-      }
-      intelClearWithBlit(ctx, blit_mask);
-   }
-
-   if (tri_mask) {
-      if (INTEL_DEBUG & DEBUG_BLIT) {
-	 DBG("tri clear:");
-	 for (i = 0; i < BUFFER_COUNT; i++) {
-	    if (tri_mask & (1 << i))
-	       DBG(" %s", buffer_names[i]);
-	 }
-	 DBG("\n");
-      }
-      intelClearWithTris(intel, tri_mask);
-   }
-
-   if (swrast_mask) {
-      if (INTEL_DEBUG & DEBUG_BLIT) {
-	 DBG("swrast clear:");
-	 for (i = 0; i < BUFFER_COUNT; i++) {
-	    if (swrast_mask & (1 << i))
-	       DBG(" %s", buffer_names[i]);
-	 }
-	 DBG("\n");
-      }
-      _swrast_Clear(ctx, swrast_mask);
-   }
-}
-
-void
-intelSwapBuffers(__DRIdrawablePrivate * dPriv)
-{
-   __DRIscreenPrivate *psp = dPriv->driScreenPriv;
-
-   if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-      GET_CURRENT_CONTEXT(ctx);
-      struct intel_context *intel;
-
-      if (ctx == NULL)
-	 return;
-
-      intel = intel_context(ctx);
-
-      if (ctx->Visual.doubleBufferMode) {
-	 GLboolean missed_target;
-	 struct intel_framebuffer *intel_fb = dPriv->driverPrivate;
-	 int64_t ust;
-         
-	 _mesa_notifySwapBuffers(ctx);  /* flush pending rendering comands */
-
-	/*
-	 * The old swapping ioctl was incredibly racy, just wait for vblank
-	 * and do the swap ourselves.
-	 */
-	 driWaitForVBlank(dPriv, &missed_target);
-
-	 /*
-	  * Update each buffer's vbl_pending so we don't get too out of
-	  * sync
-	  */
-	 intel_get_renderbuffer(&intel_fb->Base,
-		   		BUFFER_BACK_LEFT)->vbl_pending = dPriv->vblSeq;
-         intel_get_renderbuffer(&intel_fb->Base,
-		   		BUFFER_FRONT_LEFT)->vbl_pending = dPriv->vblSeq;
-
-	 intelCopyBuffer(dPriv, NULL);
-
-	 intel_fb->swap_count++;
-	 (*psp->systemTime->getUST) (&ust);
-	 if (missed_target) {
-	    intel_fb->swap_missed_count++;
-	    intel_fb->swap_missed_ust = ust - intel_fb->swap_ust;
-	 }
-
-	 intel_fb->swap_ust = ust;
-      }
-      drmCommandNone(intel->driFd, DRM_I915_GEM_THROTTLE);
-
-   }
-   else {
-      /* XXX this shouldn't be an error but we can't handle it for now */
-      fprintf(stderr, "%s: drawable has no context!\n", __FUNCTION__);
-   }
-}
-
-void
-intelCopySubBuffer(__DRIdrawablePrivate * dPriv, int x, int y, int w, int h)
-{
-   if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-      struct intel_context *intel =
-         (struct intel_context *) dPriv->driContextPriv->driverPrivate;
-      GLcontext *ctx = &intel->ctx;
-
-      if (ctx->Visual.doubleBufferMode) {
-         drm_clip_rect_t rect;
-         rect.x1 = x + dPriv->x;
-         rect.y1 = (dPriv->h - y - h) + dPriv->y;
-         rect.x2 = rect.x1 + w;
-         rect.y2 = rect.y1 + h;
-         _mesa_notifySwapBuffers(ctx);  /* flush pending rendering comands */
-         intelCopyBuffer(dPriv, &rect);
-      }
-   }
-   else {
-      /* XXX this shouldn't be an error but we can't handle it for now */
-      fprintf(stderr, "%s: drawable has no context!\n", __FUNCTION__);
-   }
-}
-
 
 /**
  * Update the hardware state for drawing into a window or framebuffer object.
@@ -559,10 +154,10 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
       return;
    }
 
-   /* Do this here, note core Mesa, since this function is called from
+   /* Do this here, not core Mesa, since this function is called from
     * many places within the driver.
     */
-   if (ctx->NewState & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
+   if (ctx->NewState & _NEW_BUFFERS) {
       /* this updates the DrawBuffer->_NumColorDrawBuffers fields, etc */
       _mesa_update_framebuffer(ctx);
       /* this updates the DrawBuffer's Width/Height if it's a FBO */
@@ -577,9 +172,6 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
       return;
    }
 
-   if (fb->Name)
-      intel_validate_paired_depth_stencil(ctx, fb);
-
    /*
     * How many color buffers are we drawing into?
     */
@@ -587,7 +179,8 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
       /* writing to 0  */
       colorRegions[0] = NULL;
       intel->constant_cliprect = GL_TRUE;
-   } else if (fb->_NumColorDrawBuffers > 1) {
+   }
+   else if (fb->_NumColorDrawBuffers > 1) {
        int i;
        struct intel_renderbuffer *irb;
 
@@ -609,6 +202,8 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
 	       intel_batchbuffer_flush(intel->batch);
 	    intel->front_cliprects = GL_TRUE;
 	    colorRegions[0] = intel_get_rb_region(fb, BUFFER_FRONT_LEFT);
+
+	    intel->front_buffer_dirty = GL_TRUE;
 	 }
 	 else {
 	    if (!intel->constant_cliprect && intel->front_cliprects)
@@ -626,14 +221,6 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
       }
    }
 
-   /* Update culling direction which changes depending on the
-    * orientation of the buffer:
-    */
-   if (ctx->Driver.FrontFace)
-      ctx->Driver.FrontFace(ctx, ctx->Polygon.FrontFace);
-   else
-      ctx->NewState |= _NEW_POLYGON;
-
    if (!colorRegions[0]) {
       FALLBACK(intel, INTEL_FALLBACK_DRAW_BUFFER, GL_TRUE);
    }
@@ -665,59 +252,50 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
    /***
     *** Stencil buffer
     *** This can only be hardware accelerated if we're using a
-    *** combined DEPTH_STENCIL buffer (for now anyway).
+    *** combined DEPTH_STENCIL buffer.
     ***/
    if (fb->_StencilBuffer && fb->_StencilBuffer->Wrapped) {
       irbStencil = intel_renderbuffer(fb->_StencilBuffer->Wrapped);
       if (irbStencil && irbStencil->region) {
          ASSERT(irbStencil->Base._ActualFormat == GL_DEPTH24_STENCIL8_EXT);
          FALLBACK(intel, INTEL_FALLBACK_STENCIL_BUFFER, GL_FALSE);
-         /* need to re-compute stencil hw state */
-	 if (ctx->Driver.Enable != NULL)
-	    ctx->Driver.Enable(ctx, GL_STENCIL_TEST, ctx->Stencil.Enabled);
-	 else
-	    ctx->NewState |= _NEW_STENCIL;
-         if (!depthRegion)
-            depthRegion = irbStencil->region;
       }
       else {
          FALLBACK(intel, INTEL_FALLBACK_STENCIL_BUFFER, GL_TRUE);
       }
    }
    else {
-      /* XXX FBO: instead of FALSE, pass ctx->Stencil.Enabled ??? */
+      /* XXX FBO: instead of FALSE, pass ctx->Stencil._Enabled ??? */
       FALLBACK(intel, INTEL_FALLBACK_STENCIL_BUFFER, GL_FALSE);
-      /* need to re-compute stencil hw state */
-      if (ctx->Driver.Enable != NULL)
-	 ctx->Driver.Enable(ctx, GL_STENCIL_TEST, ctx->Stencil.Enabled);
-      else
-	 ctx->NewState |= _NEW_STENCIL;
    }
 
    /*
-    * Update depth test state
+    * Update depth and stencil test state
     */
    if (ctx->Driver.Enable) {
-      if (ctx->Depth.Test && fb->Visual.depthBits > 0) {
-	 ctx->Driver.Enable(ctx, GL_DEPTH_TEST, GL_TRUE);
-      } else {
-	 ctx->Driver.Enable(ctx, GL_DEPTH_TEST, GL_FALSE);
-      }
-   } else {
-      ctx->NewState |= _NEW_DEPTH;
+      ctx->Driver.Enable(ctx, GL_DEPTH_TEST,
+                         (ctx->Depth.Test && fb->Visual.depthBits > 0));
+      ctx->Driver.Enable(ctx, GL_STENCIL_TEST,
+                         (ctx->Stencil.Enabled && fb->Visual.stencilBits > 0));
+   }
+   else {
+      /* Mesa's Stencil._Enabled field is updated when
+       * _NEW_BUFFERS | _NEW_STENCIL, but i965 code assumes that the value
+       * only changes with _NEW_STENCIL (which seems sensible).  So flag it
+       * here since this is the _NEW_BUFFERS path.
+       */
+      ctx->NewState |= (_NEW_DEPTH | _NEW_STENCIL);
    }
 
    intel->vtbl.set_draw_region(intel, colorRegions, depthRegion, 
-	fb->_NumColorDrawBuffers);
+                               fb->_NumColorDrawBuffers);
 
    /* update viewport since it depends on window size */
-   if (ctx->Driver.Viewport) {
-      ctx->Driver.Viewport(ctx, ctx->Viewport.X, ctx->Viewport.Y,
-			   ctx->Viewport.Width, ctx->Viewport.Height);
-   } else {
-      ctx->NewState |= _NEW_VIEWPORT;
-   }
-
+#ifdef I915
+   intelCalcViewport(ctx);
+#else
+   ctx->NewState |= _NEW_VIEWPORT;
+#endif
    /* Set state we know depends on drawable parameters:
     */
    if (ctx->Driver.Scissor)
@@ -729,12 +307,37 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
       ctx->Driver.DepthRange(ctx,
 			     ctx->Viewport.Near,
 			     ctx->Viewport.Far);
+
+   /* Update culling direction which changes depending on the
+    * orientation of the buffer:
+    */
+   if (ctx->Driver.FrontFace)
+      ctx->Driver.FrontFace(ctx, ctx->Polygon.FrontFace);
+   else
+      ctx->NewState |= _NEW_POLYGON;
 }
 
 
 static void
 intelDrawBuffer(GLcontext * ctx, GLenum mode)
 {
+   if ((ctx->DrawBuffer != NULL) && (ctx->DrawBuffer->Name == 0)) {
+      struct intel_context *const intel = intel_context(ctx);
+      const GLboolean was_front_buffer_rendering =
+	intel->is_front_buffer_rendering;
+
+      intel->is_front_buffer_rendering = (mode == GL_FRONT_LEFT)
+	|| (mode == GL_FRONT);
+
+      /* If we weren't front-buffer rendering before but we are now, make sure
+       * that the front-buffer has actually been allocated.
+       */
+      if (!was_front_buffer_rendering && intel->is_front_buffer_rendering) {
+	 intel_update_renderbuffers(intel->driContext,
+				    intel->driContext->driDrawablePriv);
+      }
+   }
+
    intel_draw_buffer(ctx, ctx->DrawBuffer);
 }
 
@@ -742,6 +345,23 @@ intelDrawBuffer(GLcontext * ctx, GLenum mode)
 static void
 intelReadBuffer(GLcontext * ctx, GLenum mode)
 {
+   if ((ctx->DrawBuffer != NULL) && (ctx->DrawBuffer->Name == 0)) {
+      struct intel_context *const intel = intel_context(ctx);
+      const GLboolean was_front_buffer_reading =
+	intel->is_front_buffer_reading;
+
+      intel->is_front_buffer_reading = (mode == GL_FRONT_LEFT)
+	|| (mode == GL_FRONT);
+
+      /* If we weren't front-buffer reading before but we are now, make sure
+       * that the front-buffer has actually been allocated.
+       */
+      if (!was_front_buffer_reading && intel->is_front_buffer_reading) {
+	 intel_update_renderbuffers(intel->driContext,
+				    intel->driContext->driDrawablePriv);
+      }
+   }
+
    if (ctx->ReadBuffer == ctx->DrawBuffer) {
       /* This will update FBO completeness status.
        * A framebuffer will be incomplete if the GL_READ_BUFFER setting
@@ -759,7 +379,6 @@ intelReadBuffer(GLcontext * ctx, GLenum mode)
 void
 intelInitBufferFuncs(struct dd_function_table *functions)
 {
-   functions->Clear = intelClear;
    functions->DrawBuffer = intelDrawBuffer;
    functions->ReadBuffer = intelReadBuffer;
 }
diff --git a/src/mesa/drivers/dri/intel/intel_buffers.h b/src/mesa/drivers/dri/intel/intel_buffers.h
index 0be1cee091..6069d38e9e 100644
--- a/src/mesa/drivers/dri/intel/intel_buffers.h
+++ b/src/mesa/drivers/dri/intel/intel_buffers.h
@@ -45,10 +45,6 @@ extern struct intel_region *intel_readbuf_region(struct intel_context *intel);
 
 extern struct intel_region *intel_drawbuf_region(struct intel_context *intel);
 
-extern void intelSwapBuffers(__DRIdrawablePrivate * dPriv);
-
-extern void intelWindowMoved(struct intel_context *intel);
-
 extern void intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb);
 
 extern void intelInitBufferFuncs(struct dd_function_table *functions);
@@ -57,5 +53,8 @@ void intel_get_cliprects(struct intel_context *intel,
 			 struct drm_clip_rect **cliprects,
 			 unsigned int *num_cliprects,
 			 int *x_off, int *y_off);
+#ifdef I915
+void intelCalcViewport(GLcontext * ctx);
+#endif
 
 #endif /* INTEL_BUFFERS_H */
diff --git a/src/mesa/drivers/dri/intel/intel_chipset.h b/src/mesa/drivers/dri/intel/intel_chipset.h
index d1b4941601..3dc8653a73 100644
--- a/src/mesa/drivers/dri/intel/intel_chipset.h
+++ b/src/mesa/drivers/dri/intel/intel_chipset.h
@@ -46,6 +46,13 @@
 #define PCI_CHIP_G33_G			0x29C2
 #define PCI_CHIP_Q33_G			0x29D2
 
+#define PCI_CHIP_IGD_GM			0xA011
+#define PCI_CHIP_IGD_G			0xA001
+
+#define IS_IGDGM(devid)	(devid == PCI_CHIP_IGD_GM)
+#define IS_IGDG(devid)	(devid == PCI_CHIP_IGD_G)
+#define IS_IGD(devid) (IS_IGDG(devid) || IS_IGDGM(devid))
+
 #define PCI_CHIP_I965_G			0x29A2
 #define PCI_CHIP_I965_Q			0x2992
 #define PCI_CHIP_I965_G_1		0x2982
@@ -59,6 +66,10 @@
 #define PCI_CHIP_Q45_G                  0x2E12
 #define PCI_CHIP_G45_G                  0x2E22
 #define PCI_CHIP_G41_G                  0x2E32
+#define PCI_CHIP_B43_G                  0x2E42
+
+#define PCI_CHIP_ILD_G                  0x0042
+#define PCI_CHIP_ILM_G                  0x0046
 
 #define IS_MOBILE(devid)	(devid == PCI_CHIP_I855_GM || \
 				 devid == PCI_CHIP_I915_GM || \
@@ -66,15 +77,22 @@
 				 devid == PCI_CHIP_I945_GME || \
 				 devid == PCI_CHIP_I965_GM || \
 				 devid == PCI_CHIP_I965_GME || \
-				 devid == PCI_CHIP_GM45_GM)
+				 devid == PCI_CHIP_GM45_GM || \
+				 IS_IGD(devid) || \
+				 devid == PCI_CHIP_ILM_G)
 
 #define IS_G45(devid)           (devid == PCI_CHIP_IGD_E_G || \
                                  devid == PCI_CHIP_Q45_G || \
                                  devid == PCI_CHIP_G45_G || \
-                                 devid == PCI_CHIP_G41_G)
+                                 devid == PCI_CHIP_G41_G || \
+                                 devid == PCI_CHIP_B43_G)
 #define IS_GM45(devid)          (devid == PCI_CHIP_GM45_GM)
 #define IS_G4X(devid)		(IS_G45(devid) || IS_GM45(devid))
 
+#define IS_ILD(devid)           (devid == PCI_CHIP_ILD_G)
+#define IS_ILM(devid)           (devid == PCI_CHIP_ILM_G)
+#define IS_IGDNG(devid)           (IS_ILD(devid) || IS_ILM(devid))
+
 #define IS_915(devid)		(devid == PCI_CHIP_I915_G || \
 				 devid == PCI_CHIP_E7221_G || \
 				 devid == PCI_CHIP_I915_GM)
@@ -84,7 +102,7 @@
 				 devid == PCI_CHIP_I945_GME || \
 				 devid == PCI_CHIP_G33_G || \
 				 devid == PCI_CHIP_Q33_G || \
-				 devid == PCI_CHIP_Q35_G)
+				 devid == PCI_CHIP_Q35_G || IS_IGD(devid))
 
 #define IS_965(devid)		(devid == PCI_CHIP_I965_G || \
 				 devid == PCI_CHIP_I965_Q || \
@@ -92,7 +110,8 @@
 				 devid == PCI_CHIP_I965_GM || \
 				 devid == PCI_CHIP_I965_GME || \
 				 devid == PCI_CHIP_I946_GZ || \
-				 IS_G4X(devid))
+				 IS_G4X(devid) || \
+				 IS_IGDNG(devid))
 
 #define IS_9XX(devid)		(IS_915(devid) || \
 				 IS_945(devid) || \
diff --git a/src/mesa/drivers/dri/intel/intel_clear.c b/src/mesa/drivers/dri/intel/intel_clear.c
new file mode 100644
index 0000000000..bce23724b3
--- /dev/null
+++ b/src/mesa/drivers/dri/intel/intel_clear.c
@@ -0,0 +1,194 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2009 Intel Corporation.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "main/glheader.h"
+#include "main/mtypes.h"
+#include "swrast/swrast.h"
+#include "drivers/common/meta.h"
+
+#include "intel_context.h"
+#include "intel_blit.h"
+#include "intel_chipset.h"
+#include "intel_clear.h"
+#include "intel_fbo.h"
+#include "intel_pixel.h"
+#include "intel_regions.h"
+
+#define FILE_DEBUG_FLAG DEBUG_BLIT
+/* Names printed by intelClear()'s debug output, indexed by BUFFER_* value. */
+static const char *buffer_names[] = {
+   [BUFFER_FRONT_LEFT] = "front",
+   [BUFFER_BACK_LEFT] = "back",
+   [BUFFER_FRONT_RIGHT] = "front right",
+   [BUFFER_BACK_RIGHT] = "back right",
+   [BUFFER_DEPTH] = "depth",
+   [BUFFER_STENCIL] = "stencil",
+   [BUFFER_ACCUM] = "accum",
+   [BUFFER_AUX0] = "aux0",
+   [BUFFER_COLOR0] = "color0",
+   [BUFFER_COLOR1] = "color1",
+   [BUFFER_COLOR2] = "color2",
+   [BUFFER_COLOR3] = "color3",
+   [BUFFER_COLOR4] = "color4",
+   [BUFFER_COLOR5] = "color5",
+   [BUFFER_COLOR6] = "color6",
+   [BUFFER_COLOR7] = "color7",
+};
+
+/**
+ * Called by ctx->Driver.Clear.
+ *
+ * Sorts the buffers named in mask into blit, triangle (meta) and swrast
+ * passes, then runs each non-empty pass in that order.
+ */
+static void
+intelClear(GLcontext *ctx, GLbitfield mask)
+{
+   struct intel_context *intel = intel_context(ctx);
+   const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask);
+   GLbitfield tri_mask = 0;
+   GLbitfield blit_mask = 0;
+   GLbitfield swrast_mask = 0;
+   struct gl_framebuffer *fb = ctx->DrawBuffer;
+   GLuint i;
+
+   /* HW color buffers (front, back, aux, generic FBO, etc) */
+   if (colorMask == ~0) {
+      /* clear all R,G,B,A */
+      /* XXX FBO: need to check if colorbuffers are software RBOs! */
+      blit_mask |= (mask & BUFFER_BITS_COLOR);
+   }
+   else {
+      /* glColorMask in effect */
+      tri_mask |= (mask & (BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_BACK_LEFT));
+   }
+
+   /* HW stencil */
+   if (mask & BUFFER_BIT_STENCIL) {
+      const struct intel_region *stencilRegion
+         = intel_get_rb_region(fb, BUFFER_STENCIL);
+      if (stencilRegion) {
+         /* have hw stencil */
+         if (stencilRegion->tiling == I915_TILING_Y ||
+             (ctx->Stencil.WriteMask[0] & 0xff) != 0xff) {
+            /* We have to use the 3D engine if we're clearing a partial mask
+             * of the stencil buffer, or if the stencil region is Y-tiled,
+             * a layout we can't blit to.
+             */
+            tri_mask |= BUFFER_BIT_STENCIL;
+         }
+         else {
+            /* clearing all stencil bits, use blitting */
+            blit_mask |= BUFFER_BIT_STENCIL;
+         }
+      }
+   }
+
+   /* HW depth: same method as stencil (see above); no region => swrast */
+   if (mask & BUFFER_BIT_DEPTH) {
+      const struct intel_region *irb = intel_get_rb_region(fb, BUFFER_DEPTH);
+      if (irb) {
+         if (irb->tiling == I915_TILING_Y || (tri_mask & BUFFER_BIT_STENCIL))
+            tri_mask |= BUFFER_BIT_DEPTH;
+         else
+            blit_mask |= BUFFER_BIT_DEPTH;
+      }
+   }
+
+   /* If we're doing a tri pass for depth/stencil, include a likely color
+    * buffer with it.
+    */
+   if (mask & (BUFFER_BIT_DEPTH | BUFFER_BIT_STENCIL)) {
+      int color_bit = _mesa_ffs(mask & BUFFER_BITS_COLOR);
+      if (color_bit != 0) {
+         tri_mask |= blit_mask & (1 << (color_bit - 1));
+         blit_mask &= ~(1 << (color_bit - 1));
+      }
+   }
+
+   /* SW fallback clearing */
+   swrast_mask = mask & ~tri_mask & ~blit_mask;
+
+   {
+      /* look for non-Intel renderbuffers (clear them with swrast) */
+      GLbitfield blit_or_tri = blit_mask | tri_mask;
+      while (blit_or_tri) {
+         GLuint buf = _mesa_ffs(blit_or_tri) - 1;
+         GLbitfield bufBit = 1 << buf;
+         if (!fb->Attachment[buf].Renderbuffer->ClassID) {
+            blit_mask &= ~bufBit;
+            tri_mask &= ~bufBit;
+            swrast_mask |= bufBit;
+         }
+         blit_or_tri ^= bufBit;
+      }
+   }
+
+   if (blit_mask) {
+      if (INTEL_DEBUG & DEBUG_BLIT) {
+         DBG("blit clear:");
+         for (i = 0; i < BUFFER_COUNT; i++) {
+            if (blit_mask & (1 << i))
+               DBG(" %s", buffer_names[i]);
+         }
+         DBG("\n");
+      }
+      intelClearWithBlit(ctx, blit_mask);
+   }
+
+   if (tri_mask) {
+      if (INTEL_DEBUG & DEBUG_BLIT) {
+         DBG("tri clear:");
+         for (i = 0; i < BUFFER_COUNT; i++) {
+            if (tri_mask & (1 << i))
+               DBG(" %s", buffer_names[i]);
+         }
+         DBG("\n");
+      }
+      _mesa_meta_clear(&intel->ctx, tri_mask);
+   }
+
+   if (swrast_mask) {
+      if (INTEL_DEBUG & DEBUG_BLIT) {
+         DBG("swrast clear:");
+         for (i = 0; i < BUFFER_COUNT; i++) {
+            if (swrast_mask & (1 << i))
+               DBG(" %s", buffer_names[i]);
+         }
+         DBG("\n");
+      }
+      _swrast_Clear(ctx, swrast_mask);
+   }
+}
+
+/** Install intelClear as the Clear hook of the given driver function table. */
+void
+intelInitClearFuncs(struct dd_function_table *functions)
+{
+   functions->Clear = intelClear;
+}
diff --git a/src/mesa/drivers/dri/intel/intel_clear.h b/src/mesa/drivers/dri/intel/intel_clear.h
new file mode 100644
index 0000000000..7fd6b310a9
--- /dev/null
+++ b/src/mesa/drivers/dri/intel/intel_clear.h
@@ -0,0 +1,38 @@
+
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef INTEL_CLEAR_H
+#define INTEL_CLEAR_H
+
+struct dd_function_table;
+
+extern void
+intelInitClearFuncs(struct dd_function_table *functions);
+
+
+#endif /* INTEL_CLEAR_H */
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index c4a24d7397..7a2e7617d0 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -27,9 +27,10 @@
 
 
 #include "main/glheader.h"
+#include "main/colortab.h"
 #include "main/context.h"
-#include "main/matrix.h"
-#include "main/simple_list.h"
+#include "main/convolve.h"
+#include "main/arrayobj.h"
 #include "main/extensions.h"
 #include "main/framebuffer.h"
 #include "main/imports.h"
@@ -38,13 +39,8 @@
 #include "swrast/swrast.h"
 #include "swrast_setup/swrast_setup.h"
 #include "tnl/tnl.h"
-
-#include "tnl/t_pipeline.h"
-#include "tnl/t_vertex.h"
-
 #include "drivers/common/driverfuncs.h"
-
-#include "intel_screen.h"
+#include "drivers/common/meta.h"
 
 #include "i830_dri.h"
 
@@ -52,52 +48,35 @@
 #include "intel_buffers.h"
 #include "intel_tex.h"
 #include "intel_batchbuffer.h"
-#include "intel_blit.h"
+#include "intel_clear.h"
+#include "intel_extensions.h"
 #include "intel_pixel.h"
 #include "intel_regions.h"
 #include "intel_buffer_objects.h"
 #include "intel_fbo.h"
+#include "intel_eglimage.h"
 #include "intel_decode.h"
 #include "intel_bufmgr.h"
+#include "intel_screen.h"
+#include "intel_swapbuffers.h"
 
 #include "drirenderbuffer.h"
 #include "vblank.h"
 #include "utils.h"
 #include "xmlpool.h"            /* for symbolic values of enum-type options */
+
+
 #ifndef INTEL_DEBUG
 int INTEL_DEBUG = (0);
 #endif
 
-#define need_GL_ARB_multisample
-#define need_GL_ARB_occlusion_query
-#define need_GL_ARB_point_parameters
-#define need_GL_ARB_shader_objects
-#define need_GL_ARB_texture_compression
-#define need_GL_ARB_vertex_buffer_object
-#define need_GL_ARB_vertex_program
-#define need_GL_ARB_vertex_shader
-#define need_GL_ARB_window_pos
-#define need_GL_EXT_blend_color
-#define need_GL_EXT_blend_equation_separate
-#define need_GL_EXT_blend_func_separate
-#define need_GL_EXT_blend_minmax
-#define need_GL_EXT_cull_vertex
-#define need_GL_EXT_fog_coord
-#define need_GL_EXT_framebuffer_object
-#define need_GL_EXT_multi_draw_arrays
-#define need_GL_EXT_point_parameters
-#define need_GL_EXT_secondary_color
-#define need_GL_ATI_separate_stencil
-#define need_GL_NV_point_sprite
-#define need_GL_NV_vertex_program
-#define need_GL_VERSION_2_0
-#define need_GL_VERSION_2_1
-
-#include "extension_helper.h"
-
-#define DRIVER_DATE                     "20090114"
+
+#define DRIVER_DATE                     "20090712 2009Q2 RC3"
 #define DRIVER_DATE_GEM                 "GEM " DRIVER_DATE
 
+
+static void intel_flush(GLcontext *ctx, GLboolean needs_mi_flush);
+
 static const GLubyte *
 intelGetString(GLcontext * ctx, GLenum name)
 {
@@ -151,6 +130,10 @@ intelGetString(GLcontext * ctx, GLenum name)
       case PCI_CHIP_Q33_G:
 	 chipset = "Intel(R) Q33";
 	 break;
+      case PCI_CHIP_IGD_GM:
+      case PCI_CHIP_IGD_G:
+	 chipset = "Intel(R) IGD";
+	 break;
       case PCI_CHIP_I965_Q:
 	 chipset = "Intel(R) 965Q";
 	 break;
@@ -182,6 +165,15 @@ intelGetString(GLcontext * ctx, GLenum name)
       case PCI_CHIP_G41_G:
          chipset = "Intel(R) G41";
          break;
+      case PCI_CHIP_B43_G:
+         chipset = "Intel(R) B43";
+         break;
+      case PCI_CHIP_ILD_G:
+         chipset = "Intel(R) IGDNG_D";
+         break;
+      case PCI_CHIP_ILM_G:
+         chipset = "Intel(R) IGDNG_M";
+         break;
       default:
          chipset = "Unknown Intel Chipset";
          break;
@@ -197,6 +189,24 @@ intelGetString(GLcontext * ctx, GLenum name)
    }
 }
 
+static unsigned
+intel_bits_per_pixel(const struct intel_renderbuffer *rb)
+{
+   switch (rb->Base._ActualFormat) {
+   case GL_RGB5:
+   case GL_DEPTH_COMPONENT16:
+      return 16;
+   case GL_RGB8:
+   case GL_RGBA8:
+   case GL_DEPTH_COMPONENT24:
+   case GL_DEPTH24_STENCIL8_EXT:
+   case GL_STENCIL_INDEX8_EXT:
+      return 32;
+   default:
+      return 0;
+   }
+}
+
 void
 intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 {
@@ -204,7 +214,7 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
    struct intel_renderbuffer *rb;
    struct intel_region *region, *depth_region;
    struct intel_context *intel = context->driverPrivate;
-   __DRIbuffer *buffers;
+   __DRIbuffer *buffers = NULL;
    __DRIscreen *screen;
    int i, count;
    unsigned int attachments[10];
@@ -216,22 +226,65 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 
    screen = intel->intelScreen->driScrnPriv;
 
-   i = 0;
-   if (intel_fb->color_rb[0])
-      attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
-   if (intel_fb->color_rb[1])
-      attachments[i++] = __DRI_BUFFER_BACK_LEFT;
-   if (intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH))
-      attachments[i++] = __DRI_BUFFER_DEPTH;
-   if (intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL))
-      attachments[i++] = __DRI_BUFFER_STENCIL;
-
-   buffers = (*screen->dri2.loader->getBuffers)(drawable,
-						&drawable->w,
-						&drawable->h,
-						attachments, i,
-						&count,
-						drawable->loaderPrivate);
+   if (screen->dri2.loader
+       && (screen->dri2.loader->base.version > 2)
+       && (screen->dri2.loader->getBuffersWithFormat != NULL)) {
+      struct intel_renderbuffer *depth_rb;
+      struct intel_renderbuffer *stencil_rb;
+
+      i = 0;
+      if ((intel->is_front_buffer_rendering ||
+	   intel->is_front_buffer_reading ||
+	   !intel_fb->color_rb[1])
+	   && intel_fb->color_rb[0]) {
+	 attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
+	 attachments[i++] = intel_bits_per_pixel(intel_fb->color_rb[0]);
+      }
+
+      if (intel_fb->color_rb[1]) {
+	 attachments[i++] = __DRI_BUFFER_BACK_LEFT;
+	 attachments[i++] = intel_bits_per_pixel(intel_fb->color_rb[1]);
+      }
+
+      depth_rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
+      stencil_rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
+
+      if ((depth_rb != NULL) && (stencil_rb != NULL)) {
+	 attachments[i++] = __DRI_BUFFER_DEPTH_STENCIL;
+	 attachments[i++] = intel_bits_per_pixel(depth_rb);
+      } else if (depth_rb != NULL) {
+	 attachments[i++] = __DRI_BUFFER_DEPTH;
+	 attachments[i++] = intel_bits_per_pixel(depth_rb);
+      } else if (stencil_rb != NULL) {
+	 attachments[i++] = __DRI_BUFFER_STENCIL;
+	 attachments[i++] = intel_bits_per_pixel(stencil_rb);
+      }
+
+      buffers =
+	 (*screen->dri2.loader->getBuffersWithFormat)(drawable,
+						      &drawable->w,
+						      &drawable->h,
+						      attachments, i / 2,
+						      &count,
+						      drawable->loaderPrivate);
+   } else if (screen->dri2.loader) {
+      i = 0;
+      if (intel_fb->color_rb[0])
+	 attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
+      if (intel_fb->color_rb[1])
+	 attachments[i++] = __DRI_BUFFER_BACK_LEFT;
+      if (intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH))
+	 attachments[i++] = __DRI_BUFFER_DEPTH;
+      if (intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL))
+	 attachments[i++] = __DRI_BUFFER_STENCIL;
+
+      buffers = (*screen->dri2.loader->getBuffers)(drawable,
+						   &drawable->w,
+						   &drawable->h,
+						   attachments, i,
+						   &count,
+						   drawable->loaderPrivate);
+   }
 
    if (buffers == NULL)
       return;
@@ -259,6 +312,11 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 	   region_name = "dri2 front buffer";
 	   break;
 
+       case __DRI_BUFFER_FAKE_FRONT_LEFT:
+	   rb = intel_fb->color_rb[0];
+	   region_name = "dri2 fake front buffer";
+	   break;
+
        case __DRI_BUFFER_BACK_LEFT:
 	   rb = intel_fb->color_rb[1];
 	   region_name = "dri2 back buffer";
@@ -269,6 +327,11 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 	   region_name = "dri2 depth buffer";
 	   break;
 
+       case __DRI_BUFFER_DEPTH_STENCIL:
+	   rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
+	   region_name = "dri2 depth / stencil buffer";
+	   break;
+
        case __DRI_BUFFER_STENCIL:
 	   rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
 	   region_name = "dri2 stencil buffer";
@@ -282,6 +345,9 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 	   return;
        }
 
+       if (rb == NULL)
+	  continue;
+
        if (rb->region) {
 	  dri_bo_flink(rb->region->buffer, &name);
 	  if (name == buffers[i].name)
@@ -312,6 +378,23 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
 
        intel_renderbuffer_set_region(rb, region);
        intel_region_release(&region);
+
+       if (buffers[i].attachment == __DRI_BUFFER_DEPTH_STENCIL) {
+	  rb = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
+	  if (rb != NULL) {
+	     struct intel_region *stencil_region = NULL;
+
+	     if (rb->region) {
+		dri_bo_flink(rb->region->buffer, &name);
+		if (name == buffers[i].name)
+		   continue;
+	     }
+
+	     intel_region_reference(&stencil_region, region);
+	     intel_renderbuffer_set_region(rb, stencil_region);
+	     intel_region_release(&stencil_region);
+	  }
+       }
    }
 
    driUpdateFramebufferSize(&intel->ctx, drawable);
@@ -328,9 +411,20 @@ intel_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei w, GLsizei h)
     if (!driContext->driScreenPriv->dri2.enabled)
 	return;
 
-    intel_update_renderbuffers(driContext, driContext->driDrawablePriv);
-    if (driContext->driDrawablePriv != driContext->driReadablePriv)
-	intel_update_renderbuffers(driContext, driContext->driReadablePriv);
+    if (!intel->meta.internal_viewport_call && ctx->DrawBuffer->Name == 0) {
+       /* If we're rendering to the fake front buffer, make sure all the pending
+	* drawing has landed on the real front buffer.  Otherwise when we
+	* eventually get to DRI2GetBuffersWithFormat the stale real front
+	* buffer contents will get copied to the new fake front buffer.
+	*/
+       if (intel->is_front_buffer_rendering) {
+	  intel_flush(ctx, GL_FALSE);
+       }
+
+       intel_update_renderbuffers(driContext, driContext->driDrawablePriv);
+       if (driContext->driDrawablePriv != driContext->driReadablePriv)
+	  intel_update_renderbuffers(driContext, driContext->driReadablePriv);
+    }
 
     old_viewport = ctx->Driver.Viewport;
     ctx->Driver.Viewport = NULL;
@@ -340,112 +434,6 @@ intel_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei w, GLsizei h)
     ctx->Driver.Viewport = old_viewport;
 }
 
-/**
- * Extension strings exported by the intel driver.
- *
- * Extensions supported by all chips supported by i830_dri, i915_dri, or
- * i965_dri.
- */
-static const struct dri_extension card_extensions[] = {
-   { "GL_ARB_multisample",                GL_ARB_multisample_functions },
-   { "GL_ARB_multitexture",               NULL },
-   { "GL_ARB_point_parameters",           GL_ARB_point_parameters_functions },
-   { "GL_ARB_texture_border_clamp",       NULL },
-   { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
-   { "GL_ARB_texture_cube_map",           NULL },
-   { "GL_ARB_texture_env_add",            NULL },
-   { "GL_ARB_texture_env_combine",        NULL },
-   { "GL_ARB_texture_env_crossbar",       NULL },
-   { "GL_ARB_texture_env_dot3",           NULL },
-   { "GL_ARB_texture_mirrored_repeat",    NULL },
-   { "GL_ARB_texture_rectangle",          NULL },
-   { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
-   { "GL_ARB_vertex_program",             GL_ARB_vertex_program_functions },
-   { "GL_ARB_window_pos",                 GL_ARB_window_pos_functions },
-   { "GL_EXT_blend_color",                GL_EXT_blend_color_functions },
-   { "GL_EXT_blend_equation_separate",    GL_EXT_blend_equation_separate_functions },
-   { "GL_EXT_blend_func_separate",        GL_EXT_blend_func_separate_functions },
-   { "GL_EXT_blend_minmax",               GL_EXT_blend_minmax_functions },
-   { "GL_EXT_blend_logic_op",             NULL },
-   { "GL_EXT_blend_subtract",             NULL },
-   { "GL_EXT_cull_vertex",                GL_EXT_cull_vertex_functions },
-   { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
-   { "GL_EXT_multi_draw_arrays",          GL_EXT_multi_draw_arrays_functions },
-   { "GL_EXT_packed_depth_stencil",       NULL },
-   { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
-   { "GL_EXT_stencil_wrap",               NULL },
-   { "GL_EXT_texture_edge_clamp",         NULL },
-   { "GL_EXT_texture_env_combine",        NULL },
-   { "GL_EXT_texture_env_dot3",           NULL },
-   { "GL_EXT_texture_filter_anisotropic", NULL },
-   { "GL_EXT_texture_lod_bias",           NULL },
-   { "GL_3DFX_texture_compression_FXT1",  NULL },
-   { "GL_APPLE_client_storage",           NULL },
-   { "GL_MESA_pack_invert",               NULL },
-   { "GL_MESA_ycbcr_texture",             NULL },
-   { "GL_NV_blend_square",                NULL },
-   { "GL_NV_point_sprite",                GL_NV_point_sprite_functions },
-   { "GL_NV_vertex_program",              GL_NV_vertex_program_functions },
-   { "GL_NV_vertex_program1_1",           NULL },
-   { "GL_SGIS_generate_mipmap",           NULL },
-   { NULL, NULL }
-};
-
-static const struct dri_extension brw_extensions[] = {
-   { "GL_ARB_depth_texture",              NULL },
-   { "GL_ARB_draw_buffers",               NULL },
-   { "GL_ARB_fragment_program",           NULL },
-   { "GL_ARB_fragment_program_shadow",    NULL },
-   { "GL_ARB_fragment_shader",            NULL },
-   { "GL_ARB_occlusion_query",            GL_ARB_occlusion_query_functions },
-   { "GL_ARB_point_sprite", 		  NULL },
-   { "GL_ARB_shader_objects",             GL_ARB_shader_objects_functions },
-   { "GL_ARB_shading_language_100",       GL_VERSION_2_0_functions },
-#if 0
-   /* Support for GLSL 1.20 is currently broken in core Mesa.
-    */
-   { "GL_ARB_shading_language_120",       GL_VERSION_2_1_functions },
-#endif
-   { "GL_ARB_shadow",                     NULL },
-   { "GL_ARB_texture_non_power_of_two",   NULL },
-   { "GL_ARB_vertex_shader",              GL_ARB_vertex_shader_functions },
-   { "GL_EXT_shadow_funcs",               NULL },
-   { "GL_EXT_texture_sRGB",		  NULL },
-   { "GL_ATI_separate_stencil",           GL_ATI_separate_stencil_functions },
-   { "GL_ATI_texture_env_combine3",       NULL },
-   { NULL,                                NULL }
-};
-
-static const struct dri_extension arb_oq_extensions[] = {
-   { NULL, NULL }
-};
-
-static const struct dri_extension ttm_extensions[] = {
-   { "GL_ARB_pixel_buffer_object",        NULL },
-   { "GL_EXT_framebuffer_object",         GL_EXT_framebuffer_object_functions },
-   { NULL, NULL }
-};
-
-/**
- * Initializes potential list of extensions if ctx == NULL, or actually enables
- * extensions for a context.
- */
-void intelInitExtensions(GLcontext *ctx, GLboolean enable_imaging)
-{
-   struct intel_context *intel = ctx?intel_context(ctx):NULL;
-
-   /* Disable imaging extension until convolution is working in teximage paths.
-    */
-   enable_imaging = GL_FALSE;
-
-   driInitExtensions(ctx, card_extensions, enable_imaging);
-
-   if (intel == NULL || intel->ttm)
-      driInitExtensions(ctx, ttm_extensions, GL_FALSE);
-
-   if (intel == NULL || IS_965(intel->intelScreen->deviceID))
-      driInitExtensions(ctx, brw_extensions, GL_FALSE);
-}
 
 static const struct dri_debug_control debug_control[] = {
    { "tex",   DEBUG_TEXTURE},
@@ -496,9 +484,8 @@ intelInvalidateState(GLcontext * ctx, GLuint new_state)
       intel->vtbl.invalidate_state( intel, new_state );
 }
 
-
-void
-intelFlush(GLcontext * ctx)
+static void
+intel_flush(GLcontext *ctx, GLboolean needs_mi_flush)
 {
    struct intel_context *intel = intel_context(ctx);
 
@@ -512,10 +499,64 @@ intelFlush(GLcontext * ctx)
     * lands onscreen in a timely manner, even if the X Server doesn't trigger
     * a flush for us.
     */
-   intel_batchbuffer_emit_mi_flush(intel->batch);
+   if (needs_mi_flush)
+      intel_batchbuffer_emit_mi_flush(intel->batch);
 
    if (intel->batch->map != intel->batch->ptr)
       intel_batchbuffer_flush(intel->batch);
+
+   if ((ctx->DrawBuffer->Name == 0) && intel->front_buffer_dirty) {
+      __DRIscreen *const screen = intel->intelScreen->driScrnPriv;
+
+      if (screen->dri2.loader &&
+          (screen->dri2.loader->base.version >= 2)
+	  && (screen->dri2.loader->flushFrontBuffer != NULL)) {
+	 (*screen->dri2.loader->flushFrontBuffer)(intel->driDrawable,
+						  intel->driDrawable->loaderPrivate);
+
+	 /* Only clear the dirty bit if front-buffer rendering is no longer
+	  * enabled.  This is done so that the dirty bit can only be set in
+	  * glDrawBuffer.  Otherwise the dirty bit would have to be set at
+	  * each of N places that do rendering.  This has worse performance,
+	  * but it is much easier to get correct.
+	  */
+	 if (!intel->is_front_buffer_rendering) {
+	    intel->front_buffer_dirty = GL_FALSE;
+	 }
+      }
+   }
+}
+
+void
+intelFlush(GLcontext * ctx)
+{
+   intel_flush(ctx, GL_FALSE);
+}
+
+static void
+intel_glFlush(GLcontext *ctx)
+{
+   struct intel_context *intel = intel_context(ctx);
+
+   intel_flush(ctx, GL_TRUE);
+
+   /* We're using glFlush as an indicator that a frame is done, which is
+    * what DRI2 does before calling SwapBuffers (and means we should catch
+    * people doing front-buffer rendering, as well).
+    *
+    * Wait for the swapbuffers before the one we just emitted, so we don't
+    * get too many swaps outstanding for apps that are GPU-heavy but not
+    * CPU-heavy.
+    *
+    * Unfortunately, we don't have a handle to the batch containing the swap,
+    * and getting our hands on that doesn't seem worth it, so we just use the
+    * first batch we emitted after the last swap.
+    */
+   if (intel->first_post_swapbuffers_batch != NULL) {
+      drm_intel_bo_wait_rendering(intel->first_post_swapbuffers_batch);
+      drm_intel_bo_unreference(intel->first_post_swapbuffers_batch);
+      intel->first_post_swapbuffers_batch = NULL;
+   }
 }
 
 void
@@ -531,7 +572,7 @@ intelFinish(GLcontext * ctx)
 
        irb = intel_renderbuffer(fb->_ColorDrawBuffers[i]);
 
-       if (irb->region)
+       if (irb && irb->region)
 	  dri_bo_wait_rendering(irb->region->buffer);
    }
    if (fb->_DepthBuffer) {
@@ -544,20 +585,26 @@ intelInitDriverFunctions(struct dd_function_table *functions)
 {
    _mesa_init_driver_functions(functions);
 
-   functions->Flush = intelFlush;
+   functions->Flush = intel_glFlush;
    functions->Finish = intelFinish;
    functions->GetString = intelGetString;
    functions->UpdateState = intelInvalidateState;
 
-   functions->CopyColorTable = _swrast_CopyColorTable;
-   functions->CopyColorSubTable = _swrast_CopyColorSubTable;
-   functions->CopyConvolutionFilter1D = _swrast_CopyConvolutionFilter1D;
-   functions->CopyConvolutionFilter2D = _swrast_CopyConvolutionFilter2D;
+   _MESA_INIT_COLORTABLE_FUNCTIONS(functions, _swrast_);
+   _MESA_INIT_CONVOLVE_FUNCTIONS(functions, _swrast_);
 
    intelInitTextureFuncs(functions);
+   intelInitTextureImageFuncs(functions);
+   intelInitTextureSubImageFuncs(functions);
+   intelInitTextureCopyImageFuncs(functions);
    intelInitStateFuncs(functions);
+   intelInitClearFuncs(functions);
    intelInitBufferFuncs(functions);
    intelInitPixelFuncs(functions);
+   intelInitBufferObjectFuncs(functions);
+   intel_init_syncobj_functions(functions);
+
+   intelInitEGLImageFuncs(functions);
 }
 
 
@@ -614,16 +661,20 @@ intelInitContext(struct intel_context *intel,
       }
    }
 
-   ctx->Const.MaxTextureMaxAnisotropy = 2.0;
-
    /* This doesn't yet catch all non-conformant rendering, but it's a
     * start.
     */
    if (getenv("INTEL_STRICT_CONFORMANCE")) {
-      intel->strict_conformance = 1;
+      unsigned int value = atoi(getenv("INTEL_STRICT_CONFORMANCE"));
+      if (value > 0) {
+         intel->conformance_mode = value;
+      }
+      else {
+         intel->conformance_mode = 1;
+      }
    }
 
-   if (intel->strict_conformance) {
+   if (intel->conformance_mode > 0) {
       ctx->Const.MinLineWidth = 1.0;
       ctx->Const.MinLineWidthAA = 1.0;
       ctx->Const.MaxLineWidth = 1.0;
@@ -649,7 +700,15 @@ intelInitContext(struct intel_context *intel,
     */
    _mesa_init_point(ctx);
 
+   meta_init_metaops(ctx, &intel->meta);
    ctx->Const.MaxColorAttachments = 4;  /* XXX FBO: review this */
+   if (IS_965(intelScreen->deviceID)) {
+      if (MAX_WIDTH > 8192)
+	 ctx->Const.MaxRenderbufferSize = 8192;
+   } else {
+      if (MAX_WIDTH > 2048)
+	 ctx->Const.MaxRenderbufferSize = 2048;
+   }
 
    /* Initialize the software rasterizer and helper modules. */
    _swrast_CreateContext(ctx);
@@ -661,6 +720,8 @@ intelInitContext(struct intel_context *intel,
    _swrast_allow_pixel_fog(ctx, GL_FALSE);
    _swrast_allow_vertex_fog(ctx, GL_TRUE);
 
+   _mesa_meta_init(ctx);
+
    intel->hw_stencil = mesaVis->stencilBits && mesaVis->depthBits == 24;
    intel->hw_stipple = 1;
 
@@ -691,8 +752,6 @@ intelInitContext(struct intel_context *intel,
 
    intel->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
 
-   _math_matrix_ctr(&intel->ViewportMatrix);
-
    if (IS_965(intelScreen->deviceID) && !intel->intelScreen->irq_active) {
       _mesa_printf("IRQs not active.  Exiting\n");
       exit(1);
@@ -709,7 +768,6 @@ intelInitContext(struct intel_context *intel,
 
    intel->batch = intel_batchbuffer_alloc(intel);
 
-   intel_bufferobj_init(intel);
    intel_fbo_init(intel);
 
    if (intel->ctx.Mesa_DXTn) {
@@ -719,6 +777,15 @@ intelInitContext(struct intel_context *intel,
    else if (driQueryOptionb(&intel->optionCache, "force_s3tc_enable")) {
       _mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
    }
+   intel->use_texture_tiling = driQueryOptionb(&intel->optionCache,
+					       "texture_tiling");
+   if (intel->use_texture_tiling &&
+       !intel->intelScreen->kernel_exec_fencing) {
+      fprintf(stderr, "No kernel support for execution fencing, "
+	      "disabling texture tiling\n");
+      intel->use_texture_tiling = GL_FALSE;
+   }
+   intel->use_early_z = driQueryOptionb(&intel->optionCache, "early_z");
 
    intel->prim.primitive = ~0;
 
@@ -728,6 +795,16 @@ intelInitContext(struct intel_context *intel,
       intel->no_rast = 1;
    }
 
+   if (driQueryOptionb(&intel->optionCache, "always_flush_batch")) {
+      fprintf(stderr, "flushing batchbuffer before/after each draw call\n");
+      intel->always_flush_batch = 1;
+   }
+
+   if (driQueryOptionb(&intel->optionCache, "always_flush_cache")) {
+      fprintf(stderr, "flushing GPU caches before/after each draw call\n");
+      intel->always_flush_cache = 1;
+   }
+
    /* Disable all hardware rendering (skip emitting batches and fences/waits
     * to the kernel)
     */
@@ -748,6 +825,10 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
 
       INTEL_FIREVERTICES(intel);
 
+      _mesa_meta_free(&intel->ctx);
+
+      meta_destroy_metaops(&intel->meta);
+
       intel->vtbl.destroy(intel);
 
       release_texture_heaps = (intel->ctx.Shared->RefCount == 1);
@@ -765,15 +846,68 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
       intel->prim.vb = NULL;
       dri_bo_unreference(intel->prim.vb_bo);
       intel->prim.vb_bo = NULL;
+      dri_bo_unreference(intel->first_post_swapbuffers_batch);
+      intel->first_post_swapbuffers_batch = NULL;
 
       if (release_texture_heaps) {
-         /* This share group is about to go away, free our private
-          * texture object data.
+         /* Nothing is currently done here to free texture heaps;
+          * but we're not using the texture heap utilities, so I
+          * rather think we shouldn't.  I've taken a look, and can't
+          * find any private texture data hanging around anywhere, but
+          * I'm not yet certain there isn't any at all...
           */
-         if (INTEL_DEBUG & DEBUG_TEXTURE)
+         /* if (INTEL_DEBUG & DEBUG_TEXTURE)
             fprintf(stderr, "do something to free texture heaps\n");
+          */
       }
 
+      /* XXX In intelMakeCurrent() below, the context's static regions are 
+       * referenced inside the frame buffer; it's listed as a hack,
+       * with a comment of "XXX FBO temporary fix-ups!", but
+       * as long as it's there, we should release the regions here.
+       * The do/while loop around the block is used to allow the
+       * "continue" statements inside the block to exit the block,
+       * to avoid many layers of "if" constructs.
+       */
+      do {
+         __DRIdrawablePrivate * driDrawPriv = intel->driDrawable;
+         struct intel_framebuffer *intel_fb;
+         struct intel_renderbuffer *irbDepth, *irbStencil;
+         if (!driDrawPriv) {
+            /* We're already detached from the drawable; exit this block. */
+            continue;
+         }
+         intel_fb = (struct intel_framebuffer *) driDrawPriv->driverPrivate;
+         if (!intel_fb) {
+            /* The frame buffer is already gone; exit this block. */
+            continue;
+         }
+         irbDepth = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
+         irbStencil = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
+
+         /* If the regions of the frame buffer still match the regions
+          * of the context, release them.  If they've changed somehow,
+          * leave them alone.
+          */
+         if (intel_fb->color_rb[0] && intel_fb->color_rb[0]->region == intel->front_region) {
+	    intel_renderbuffer_set_region(intel_fb->color_rb[0], NULL);
+         }
+         if (intel_fb->color_rb[1] && intel_fb->color_rb[1]->region == intel->back_region) {
+	    intel_renderbuffer_set_region(intel_fb->color_rb[1], NULL);
+         }
+
+         if (irbDepth && irbDepth->region == intel->depth_region) {
+	    intel_renderbuffer_set_region(irbDepth, NULL);
+         }
+         /* Usually, the stencil buffer is the same as the depth buffer;
+          * but they're handled separately in MakeCurrent, so we'll
+          * handle them separately here.
+          */
+         if (irbStencil && irbStencil->region == intel->depth_region) {
+	    intel_renderbuffer_set_region(irbStencil, NULL);
+         }
+      } while (0);
+
       intel_region_release(&intel->front_region);
       intel_region_release(&intel->back_region);
       intel_region_release(&intel->depth_region);
@@ -782,6 +916,9 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
 
       /* free the Mesa context */
       _mesa_free_context_data(&intel->ctx);
+
+      FREE(intel);
+      driContextPriv->driverPrivate = NULL;
    }
 }
 
@@ -810,7 +947,10 @@ intelMakeCurrent(__DRIcontextPrivate * driContextPriv,
           if (driDrawPriv != driReadPriv)
               intel_update_renderbuffers(driContextPriv, driReadPriv);
       } else {
-          /* XXX FBO temporary fix-ups! */
+          /* XXX FBO temporary fix-ups!  These are released in 
+           * intelDestroyContext(), above.  Changes here should be
+           * reflected there.
+           */
           /* if the renderbuffers don't have regions, init them from the context */
          struct intel_renderbuffer *irbDepth
             = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
@@ -858,6 +998,11 @@ intelMakeCurrent(__DRIcontextPrivate * driContextPriv,
 		  ? driGetDefaultVBlankFlags(&intel->optionCache)
 		 : VBLANK_FLAG_NO_IRQ;
 
+	       /* Prevent error printf if one crtc is disabled, this will
+		* be properly calculated in intelWindowMoved() next.
+		*/
+		driDrawPriv->vblFlags = intelFixupVblank(intel, driDrawPriv);
+
 	       (*psp->systemTime->getUST) (&intel_fb->swap_ust);
 	       driDrawableInitVBlank(driDrawPriv);
 	       intel_fb->vbl_waited = driDrawPriv->vblSeq;
@@ -890,7 +1035,6 @@ intelContendedLock(struct intel_context *intel, GLuint flags)
    int me = intel->hHWContext;
 
    drmGetLock(intel->driFd, intel->hHWContext, flags);
-   intel->locked = 1;
 
    if (INTEL_DEBUG & DEBUG_LOCK)
       _mesa_printf("%s - got contended lock\n", __progname);
@@ -947,9 +1091,12 @@ void LOCK_HARDWARE( struct intel_context *intel )
     struct intel_framebuffer *intel_fb = NULL;
     struct intel_renderbuffer *intel_rb = NULL;
 
-    _glthread_LOCK_MUTEX(lockMutex);
-    assert(!intel->locked);
-    intel->locked = 1;
+    intel->locked++;
+    if (intel->locked >= 2)
+       return;
+
+    if (!sPriv->dri2.enabled)
+       _glthread_LOCK_MUTEX(lockMutex);
 
     if (intel->driDrawable) {
        intel_fb = intel->driDrawable->driverPrivate;
@@ -996,13 +1143,16 @@ void UNLOCK_HARDWARE( struct intel_context *intel )
 {
     __DRIscreen *sPriv = intel->driScreen;
 
-   intel->vtbl.note_unlock( intel );
-   intel->locked = 0;
+   intel->locked--;
+   if (intel->locked > 0)
+      return;
 
-   if (!sPriv->dri2.enabled)
-      DRM_UNLOCK(intel->driFd, intel->driHwLock, intel->hHWContext);
+   assert(intel->locked == 0);
 
-   _glthread_UNLOCK_MUTEX(lockMutex);
+   if (!sPriv->dri2.enabled) {
+      DRM_UNLOCK(intel->driFd, intel->driHwLock, intel->hHWContext);
+      _glthread_UNLOCK_MUTEX(lockMutex);
+   }
 
    if (INTEL_DEBUG & DEBUG_LOCK)
       _mesa_printf("%s - unlocked\n", __progname);
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index 048286c196..03e7cf39d6 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -33,6 +33,7 @@
 #include "main/mtypes.h"
 #include "main/mm.h"
 #include "texmem.h"
+#include "dri_metaops.h"
 #include "drm.h"
 #include "intel_bufmgr.h"
 
@@ -48,6 +49,8 @@
 #define DV_PF_555  (1<<8)
 #define DV_PF_565  (2<<8)
 #define DV_PF_8888 (3<<8)
+#define DV_PF_4444 (8<<8)
+#define DV_PF_1555 (9<<8)
 
 struct intel_region;
 struct intel_context;
@@ -77,9 +80,19 @@ extern void intelFallback(struct intel_context *intel, GLuint bit,
 
 #define INTEL_MAX_FIXUP 64
 
+struct intel_sync_object {
+   struct gl_sync_object Base;
+
+   /** Batch associated with this sync object */
+   drm_intel_bo *bo;
+};
+
+/**
+ * intel_context is derived from Mesa's context class: GLcontext.
+ */
 struct intel_context
 {
-   GLcontext ctx;               /* the parent class */
+   GLcontext ctx;  /**< base class, must be first field */
 
    struct
    {
@@ -89,7 +102,6 @@ struct intel_context
       void (*new_batch) (struct intel_context * intel);
       void (*emit_invarient_state) (struct intel_context * intel);
       void (*note_fence) (struct intel_context *intel, GLuint fence);
-      void (*note_unlock) (struct intel_context *intel);
       void (*update_texture_state) (struct intel_context * intel);
 
       void (*render_start) (struct intel_context * intel);
@@ -100,7 +112,6 @@ struct intel_context
 			       GLuint num_regions);
 
       GLuint (*flush_cmd) (void);
-      void (*emit_flush) (struct intel_context *intel, GLuint unused);
 
       void (*reduced_primitive_state) (struct intel_context * intel,
                                        GLenum rprim);
@@ -157,18 +168,7 @@ struct intel_context
       void (*debug_batch)(struct intel_context *intel);
    } vtbl;
 
-   struct {
-      struct gl_fragment_program *bitmap_fp;
-      struct gl_vertex_program *passthrough_vp;
-
-      struct gl_fragment_program *saved_fp;
-      GLboolean saved_fp_enable;
-      struct gl_vertex_program *saved_vp;
-      GLboolean saved_vp_enable;
-
-      GLint saved_vp_x, saved_vp_y;
-      GLsizei saved_vp_width, saved_vp_height;
-   } meta;
+   struct dri_metaops meta;
 
    GLint refcount;
    GLuint Fallback;
@@ -188,6 +188,7 @@ struct intel_context
    GLboolean ttm;
 
    struct intel_batchbuffer *batch;
+   drm_intel_bo *first_post_swapbuffers_batch;
    GLboolean no_batch_wrap;
    unsigned batch_id;
 
@@ -209,10 +210,10 @@ struct intel_context
    char *prevLockFile;
    int prevLockLine;
 
-   GLubyte clear_chan[4];
    GLuint ClearColor565;
    GLuint ClearColor8888;
 
+
    /* Offsets of fields within the current vertex:
     */
    GLuint coloroffset;
@@ -229,7 +230,14 @@ struct intel_context
    GLboolean hw_stipple;
    GLboolean depth_buffer_is_float;
    GLboolean no_rast;
-   GLboolean strict_conformance;
+   GLboolean always_flush_batch;
+   GLboolean always_flush_cache;
+
+   /* 0 - nonconformant, best performance;
+    * 1 - fallback to sw for known conformance bugs
+    * 2 - always fallback to sw
+    */
+   GLuint conformance_mode;
 
    /* State for intelvb.c and inteltris.c.
     */
@@ -255,11 +263,40 @@ struct intel_context
     * flush time while the lock is held.
     */
    GLboolean constant_cliprect;
+
    /**
     * In !constant_cliprect mode, set to true if the front cliprects should be
     * used instead of back.
     */
    GLboolean front_cliprects;
+
+   /**
+    * Set if rendering has occurred to the drawable's front buffer.
+    *
+    * This is used in the DRI2 case to detect that glFlush should also copy
+    * the contents of the fake front buffer to the real front buffer.
+    */
+   GLboolean front_buffer_dirty;
+
+   /**
+    * Track whether front-buffer rendering is currently enabled
+    *
+    * A separate flag is used to track this in order to support MRT more
+    * easily.
+    */
+   GLboolean is_front_buffer_rendering;
+   /**
+    * Track whether front-buffer is the current read target.
+    *
+    * This is closely associated with is_front_buffer_rendering, but may
+    * be set separately.  The DRI2 fake front buffer must be referenced
+    * either way.
+    */
+   GLboolean is_front_buffer_reading;
+
+   GLboolean use_texture_tiling;
+   GLboolean use_early_z;
+
    drm_clip_rect_t fboRect;     /**< cliprect for FBO rendering */
 
    int perf_boxes;
@@ -281,7 +318,7 @@ struct intel_context
    __DRIdrawablePrivate *driReadDrawable;
    __DRIscreenPrivate *driScreen;
    intelScreenPrivate *intelScreen;
-   volatile struct drm_i915_sarea *sarea;
+   volatile drm_i915_sarea_t *sarea;
 
    GLuint lastStamp;
 
@@ -312,6 +349,7 @@ extern char *__progname;
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
 #define ALIGN(value, alignment)  ((value + alignment - 1) & ~(alignment - 1))
+#define IS_POWER_OF_TWO(val) (((val) & ((val) - 1)) == 0) /* 0 also passes */
 
 #define INTEL_FIREVERTICES(intel)		\
 do {						\
@@ -437,7 +475,8 @@ extern void intelFinish(GLcontext * ctx);
 extern void intelFlush(GLcontext * ctx);
 
 extern void intelInitDriverFunctions(struct dd_function_table *functions);
-extern void intelInitExtensions(GLcontext *ctx, GLboolean enable_imaging);
+
+void intel_init_syncobj_functions(struct dd_function_table *functions);
 
 
 /* ================================================================
@@ -514,6 +553,9 @@ void intel_viewport(GLcontext * ctx, GLint x, GLint y,
 void intel_update_renderbuffers(__DRIcontext *context,
 				__DRIdrawable *drawable);
 
+void i915_set_buf_info_for_region(uint32_t *state, struct intel_region *region,
+				  uint32_t buffer_id);
+
 /*======================================================================
  * Inline conversion functions.  
  * These are better-typed than the macros used previously:
@@ -524,4 +566,10 @@ intel_context(GLcontext * ctx)
    return (struct intel_context *) ctx;
 }
 
+static INLINE GLboolean
+is_power_of_two(uint32_t value)
+{
+   return (value & (value - 1)) == 0; /* true if at most one bit is set, so 0 passes too */
+}
+
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_decode.c b/src/mesa/drivers/dri/intel/intel_decode.c
index 5f90ca22ec..a9dfe281cb 100644
--- a/src/mesa/drivers/dri/intel/intel_decode.c
+++ b/src/mesa/drivers/dri/intel/intel_decode.c
@@ -87,27 +87,28 @@ decode_mi(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 
     struct {
 	uint32_t opcode;
+	int len_mask;
 	int min_len;
 	int max_len;
 	char *name;
     } opcodes_mi[] = {
-	{ 0x08, 1, 1, "MI_ARB_ON_OFF" },
-	{ 0x0a, 1, 1, "MI_BATCH_BUFFER_END" },
-	{ 0x31, 2, 2, "MI_BATCH_BUFFER_START" },
-	{ 0x14, 3, 3, "MI_DISPLAY_BUFFER_INFO" },
-	{ 0x04, 1, 1, "MI_FLUSH" },
-	{ 0x22, 3, 3, "MI_LOAD_REGISTER_IMM" },
-	{ 0x13, 2, 2, "MI_LOAD_SCAN_LINES_EXCL" },
-	{ 0x12, 2, 2, "MI_LOAD_SCAN_LINES_INCL" },
-	{ 0x00, 1, 1, "MI_NOOP" },
-	{ 0x11, 2, 2, "MI_OVERLAY_FLIP" },
-	{ 0x07, 1, 1, "MI_REPORT_HEAD" },
-	{ 0x18, 2, 2, "MI_SET_CONTEXT" },
-	{ 0x20, 3, 4, "MI_STORE_DATA_IMM" },
-	{ 0x21, 3, 4, "MI_STORE_DATA_INDEX" },
-	{ 0x24, 3, 3, "MI_STORE_REGISTER_MEM" },
-	{ 0x02, 1, 1, "MI_USER_INTERRUPT" },
-	{ 0x03, 1, 1, "MI_WAIT_FOR_EVENT" },
+	{ 0x08, 0, 1, 1, "MI_ARB_ON_OFF" },
+	{ 0x0a, 0, 1, 1, "MI_BATCH_BUFFER_END" },
+	{ 0x31, 0x3f, 2, 2, "MI_BATCH_BUFFER_START" },
+	{ 0x14, 0x3f, 3, 3, "MI_DISPLAY_BUFFER_INFO" },
+	{ 0x04, 0, 1, 1, "MI_FLUSH" },
+	{ 0x22, 0x3f, 3, 3, "MI_LOAD_REGISTER_IMM" }, /* max_len > 1: length is read via len_mask */
+	{ 0x13, 0x3f, 2, 2, "MI_LOAD_SCAN_LINES_EXCL" },
+	{ 0x12, 0x3f, 2, 2, "MI_LOAD_SCAN_LINES_INCL" },
+	{ 0x00, 0, 1, 1, "MI_NOOP" },
+	{ 0x11, 0x3f, 2, 2, "MI_OVERLAY_FLIP" },
+	{ 0x07, 0, 1, 1, "MI_REPORT_HEAD" },
+	{ 0x18, 0x3f, 2, 2, "MI_SET_CONTEXT" },
+	{ 0x20, 0x3f, 3, 4, "MI_STORE_DATA_IMM" },
+	{ 0x21, 0x3f, 3, 4, "MI_STORE_DATA_INDEX" },
+	{ 0x24, 0x3f, 3, 3, "MI_STORE_REGISTER_MEM" },
+	{ 0x02, 0, 1, 1, "MI_USER_INTERRUPT" },
+	{ 0x03, 0, 1, 1, "MI_WAIT_FOR_EVENT" },
     };
 
 
@@ -118,12 +119,14 @@ decode_mi(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 
 	    instr_out(data, hw_offset, 0, "%s\n", opcodes_mi[opcode].name);
 	    if (opcodes_mi[opcode].max_len > 1) {
-		len = (data[0] & 0x000000ff) + 2;
+		len = (data[0] & opcodes_mi[opcode].len_mask) + 2;
 		if (len < opcodes_mi[opcode].min_len ||
 		    len > opcodes_mi[opcode].max_len)
 		{
-		    fprintf(out, "Bad length in %s\n",
-			    opcodes_mi[opcode].name);
+		    fprintf(out, "Bad length (%d) in %s, [%d, %d]\n",
+			    len, opcodes_mi[opcode].name,
+			    opcodes_mi[opcode].min_len,
+			    opcodes_mi[opcode].max_len);
 		}
 	    }
 
@@ -797,6 +800,7 @@ static int
 decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i830)
 {
     unsigned int len, i, c, opcode, word, map, sampler, instr;
+    const char *format; /* only ever points at string literals */
 
     struct {
 	uint32_t opcode;
@@ -932,7 +936,7 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i
 	instr_out(data, hw_offset, 0, "3DSTATE_PIXEL_SHADER_CONSTANTS\n");
 	len = (data[0] & 0x000000ff) + 2;
 
-	i = 1;
+	i = 2;
 	for (c = 0; c <= 31; c++) {
 	    if (data[1] & (1 << c)) {
 		if (i + 4 >= count)
@@ -952,7 +956,7 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i
 	    }
 	}
 	if (len != i) {
-	    fprintf(out, "Bad count in 3DSTATE_MAP_STATE\n");
+	    fprintf(out, "Bad count in 3DSTATE_PIXEL_SHADER_CONSTANTS\n");
 	    (*failures)++;
 	}
 	return len;
@@ -998,6 +1002,35 @@ decode_3d_1d(uint32_t *data, int count, uint32_t hw_offset, int *failures, int i
 	    (*failures)++;
 	}
 	return len;
+    case 0x85:
+	len = (data[0] & 0x0000000f) + 2;
+
+	if (len != 2)
+	    fprintf(out, "Bad count in 3DSTATE_DEST_BUFFER_VARIABLES\n");
+	if (count < 2)
+	    BUFFER_FAIL(count, len, "3DSTATE_DEST_BUFFER_VARIABLES");
+
+	instr_out(data, hw_offset, 0,
+		  "3DSTATE_DEST_BUFFER_VARIABLES\n");
+
+	switch ((data[1] >> 8) & 0xf) {
+	case 0x0: format = "g8"; break;
+	case 0x1: format = "x1r5g5b5"; break;
+	case 0x2: format = "r5g6b5"; break;
+	case 0x3: format = "a8r8g8b8"; break;
+	case 0x4: format = "ycrcb_swapy"; break;
+	case 0x5: format = "ycrcb_normal"; break;
+	case 0x6: format = "ycrcb_swapuv"; break;
+	case 0x7: format = "ycrcb_swapuvy"; break;
+	case 0x8: format = "a4r4g4b4"; break;
+	case 0x9: format = "a1r5g5b5"; break;
+	case 0xa: format = "a2r10g10b10"; break;
+	default: format = "BAD"; break;
+	}
+	instr_out(data, hw_offset, 1, "%s format, early Z %sabled\n",
+		  format,
+		  (data[1] & (1u << 31)) ? "en" : "dis"); /* unsigned: 1 << 31 overflows int */
+	return len;
     }
 
     for (opcode = 0; opcode < sizeof(opcodes_3d_1d) / sizeof(opcodes_3d_1d[0]);
@@ -1510,7 +1543,7 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 
 	for (i = 1; i < len;) {
 	    instr_out(data, hw_offset, i, "buffer %d: %svalid, type 0x%04x, "
-		      "src offset 0x%04xd bytes\n",
+		      "src offset 0x%04x bytes\n",
 		      data[i] >> 27,
 		      data[i] & (1 << 26) ? "" : "in",
 		      (data[i] >> 16) & 0x1ff,
@@ -1592,7 +1625,7 @@ decode_3d_965(uint32_t *data, int count, uint32_t hw_offset, int *failures)
 		  "3DPRIMITIVE: %s %s\n",
 		  get_965_prim_type(data[0]),
 		  (data[0] & (1 << 15)) ? "random" : "sequential");
-	instr_out(data, hw_offset, 1, "primitive count\n");
+	instr_out(data, hw_offset, 1, "vertex count\n");
 	instr_out(data, hw_offset, 2, "start vertex\n");
 	instr_out(data, hw_offset, 3, "instance count\n");
 	instr_out(data, hw_offset, 4, "start instance\n");
diff --git a/src/mesa/drivers/dri/intel/intel_depthstencil.c b/src/mesa/drivers/dri/intel/intel_depthstencil.c
deleted file mode 100644
index 354b3bf0d7..0000000000
--- a/src/mesa/drivers/dri/intel/intel_depthstencil.c
+++ /dev/null
@@ -1,261 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/context.h"
-#include "main/depthstencil.h"
-#include "main/fbobject.h"
-#include "main/framebuffer.h"
-#include "main/hash.h"
-#include "main/mtypes.h"
-#include "main/renderbuffer.h"
-
-#include "intel_context.h"
-#include "intel_fbo.h"
-#include "intel_depthstencil.h"
-#include "intel_regions.h"
-#include "intel_span.h"
-
-/**
- * The GL_EXT_framebuffer_object allows the user to create their own
- * framebuffer objects consisting of color renderbuffers (0 or more),
- * depth renderbuffers (0 or 1) and stencil renderbuffers (0 or 1).
- *
- * The spec considers depth and stencil renderbuffers to be totally independent
- * buffers.  In reality, most graphics hardware today uses a combined
- * depth+stencil buffer (one 32-bit pixel = 24 bits of Z + 8 bits of stencil).
- *
- * This causes difficulty because the user may create some number of depth
- * renderbuffers and some number of stencil renderbuffers and bind them
- * together in framebuffers in any combination.
- *
- * This code manages all that.
- *
- * 1. Depth renderbuffers are always allocated in hardware as 32bpp
- *    GL_DEPTH24_STENCIL8 buffers.
- *
- * 2. Stencil renderbuffers are initially allocated in software as 8bpp
- *    GL_STENCIL_INDEX8 buffers.
- *
- * 3. Depth and Stencil renderbuffers use the PairedStencil and PairedDepth
- *    fields (respectively) to indicate if the buffer's currently paired
- *    with another stencil or depth buffer (respectively).
- *
- * 4. When a depth and stencil buffer are initially both attached to the
- *    current framebuffer, we merge the stencil buffer values into the
- *    depth buffer (really a depth+stencil buffer).  The then hardware uses
- *    the combined buffer.
- *
- * 5. Whenever a depth or stencil buffer is reallocated (with
- *    glRenderbufferStorage) we undo the pairing and copy the stencil values
- *    from the combined depth/stencil buffer back to the stencil-only buffer.
- *
- * 6. We also undo the pairing when we find a change in buffer bindings.
- *
- * 7. If a framebuffer is only using a depth renderbuffer (no stencil), we
- *    just use the combined depth/stencil buffer and ignore the stencil values.
- *
- * 8. If a framebuffer is only using a stencil renderbuffer (no depth) we have
- *    to promote the 8bpp software stencil buffer to a 32bpp hardware
- *    depth+stencil buffer.
- *
- */
-
-/**
- * Undo the pairing/interleaving between depth and stencil buffers.
- * irb should be a depth/stencil or stencil renderbuffer.
- */
-void
-intel_unpair_depth_stencil(GLcontext *ctx, struct intel_renderbuffer *irb)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct gl_renderbuffer *rb = &irb->Base;
-
-   if (irb->PairedStencil) {
-      /* irb is a depth/stencil buffer */
-      struct gl_renderbuffer *stencilRb;
-      struct intel_renderbuffer *stencilIrb;
-
-      ASSERT(rb->_ActualFormat == GL_DEPTH24_STENCIL8_EXT);
-
-      stencilRb = _mesa_lookup_renderbuffer(ctx, irb->PairedStencil);
-      stencilIrb = intel_renderbuffer(stencilRb);
-      if (stencilIrb) {
-         /* need to extract stencil values from the depth buffer */
-	 ASSERT(stencilIrb->PairedDepth == rb->Name);
-	 intel_renderbuffer_map(intel, rb);
-	 intel_renderbuffer_map(intel, stencilRb);
-#if 0
-         /* disable for now */
-	 _mesa_extract_stencil(ctx, rb, stencilRb);
-#endif
-	 intel_renderbuffer_unmap(intel, stencilRb);
-	 intel_renderbuffer_unmap(intel, rb);
-         stencilIrb->PairedDepth = 0;
-      }
-      irb->PairedStencil = 0;
-   }
-   else if (irb->PairedDepth) {
-      /* irb is a stencil buffer */
-      struct gl_renderbuffer *depthRb;
-      struct intel_renderbuffer *depthIrb;
-
-      ASSERT(rb->_ActualFormat == GL_STENCIL_INDEX8_EXT ||
-             rb->_ActualFormat == GL_DEPTH24_STENCIL8_EXT);
-
-      depthRb = _mesa_lookup_renderbuffer(ctx, irb->PairedDepth);
-      depthIrb = intel_renderbuffer(depthRb);
-      if (depthIrb) {
-         /* need to extract stencil values from the depth buffer */
-	 ASSERT(depthIrb->PairedStencil == rb->Name);
-	 intel_renderbuffer_map(intel, rb);
-	 intel_renderbuffer_map(intel, depthRb);
-#if 0
-         /* disable for now */
-	 _mesa_extract_stencil(ctx, depthRb, rb);
-#endif
-	 intel_renderbuffer_unmap(intel, depthRb);
-	 intel_renderbuffer_unmap(intel, rb);
-         depthIrb->PairedStencil = 0;
-      }
-      irb->PairedDepth = 0;
-   }
-   else {
-      _mesa_problem(ctx, "Problem in undo_depth_stencil_pairing");
-   }
-
-   ASSERT(irb->PairedStencil == 0);
-   ASSERT(irb->PairedDepth == 0);
-}
-
-
-/**
- * Examine the depth and stencil renderbuffers which are attached to the
- * framebuffer.  If both depth and stencil are attached, make sure that the
- * renderbuffers are 'paired' (combined).  If only depth or only stencil is
- * attached, undo any previous pairing.
- *
- * Must be called if NewState & _NEW_BUFFER (when renderbuffer attachments
- * change, for example).
- */
-void
-intel_validate_paired_depth_stencil(GLcontext * ctx,
-                                    struct gl_framebuffer *fb)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct intel_renderbuffer *depthRb, *stencilRb;
-
-   depthRb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
-   stencilRb = intel_get_renderbuffer(fb, BUFFER_STENCIL);
-
-   if (depthRb && stencilRb) {
-      if (depthRb == stencilRb) {
-         /* Using a user-created combined depth/stencil buffer.
-          * Nothing to do.
-          */
-         ASSERT(depthRb->Base._BaseFormat == GL_DEPTH_STENCIL_EXT);
-         ASSERT(depthRb->Base._ActualFormat == GL_DEPTH24_STENCIL8_EXT);
-      }
-      else {
-         /* Separate depth/stencil buffers, need to interleave now */
-         ASSERT(depthRb->Base._BaseFormat == GL_DEPTH_COMPONENT ||
-                depthRb->Base._BaseFormat == GL_DEPTH_STENCIL);
-         ASSERT(stencilRb->Base._BaseFormat == GL_STENCIL_INDEX ||
-                stencilRb->Base._BaseFormat == GL_DEPTH_STENCIL);
-
-         /* may need to interleave depth/stencil now */
-         if (depthRb->PairedStencil == stencilRb->Base.Name) {
-            /* OK, the depth and stencil buffers are already interleaved */
-            ASSERT(stencilRb->PairedDepth == depthRb->Base.Name);
-         }
-         else {
-            /* need to setup new pairing/interleaving */
-            if (depthRb->PairedStencil) {
-               intel_unpair_depth_stencil(ctx, depthRb);
-            }
-            if (stencilRb->PairedDepth) {
-               intel_unpair_depth_stencil(ctx, stencilRb);
-            }
-
-            ASSERT(depthRb->Base._ActualFormat == GL_DEPTH24_STENCIL8_EXT);
-            ASSERT(stencilRb->Base._ActualFormat == GL_STENCIL_INDEX8_EXT ||
-                   stencilRb->Base._ActualFormat == GL_DEPTH24_STENCIL8_EXT);
-
-            /* establish new pairing: interleave stencil into depth buffer */
-	    intel_renderbuffer_map(intel, &depthRb->Base);
-	    intel_renderbuffer_map(intel, &stencilRb->Base);
-            _mesa_insert_stencil(ctx, &depthRb->Base, &stencilRb->Base);
-	    intel_renderbuffer_unmap(intel, &stencilRb->Base);
-	    intel_renderbuffer_unmap(intel, &depthRb->Base);
-            depthRb->PairedStencil = stencilRb->Base.Name;
-            stencilRb->PairedDepth = depthRb->Base.Name;
-         }
-
-      }
-   }
-   else if (depthRb) {
-      /* Depth buffer but no stencil buffer.
-       * We'll use a GL_DEPTH24_STENCIL8 buffer and ignore the stencil bits.
-       */
-      /* can't assert this until storage is allocated:
-         ASSERT(depthRb->Base._ActualFormat == GL_DEPTH24_STENCIL8_EXT);
-       */
-      /* intel_undo any previous pairing */
-      if (depthRb->PairedStencil) {
-         intel_unpair_depth_stencil(ctx, depthRb);
-      }
-   }
-   else if (stencilRb) {
-      /* Stencil buffer but no depth buffer.
-       * Since h/w doesn't typically support just 8bpp stencil w/out Z,
-       * we'll use a GL_DEPTH24_STENCIL8 buffer and ignore the depth bits.
-       */
-      /* undo any previous pairing */
-      if (stencilRb->PairedDepth) {
-         intel_unpair_depth_stencil(ctx, stencilRb);
-      }
-      if (stencilRb->Base._ActualFormat == GL_STENCIL_INDEX8_EXT) {
-         /* promote buffer to GL_DEPTH24_STENCIL8 for hw rendering */
-         _mesa_promote_stencil(ctx, &stencilRb->Base);
-         ASSERT(stencilRb->Base._ActualFormat == GL_DEPTH24_STENCIL8_EXT);
-      }
-   }
-
-   /* Finally, update the fb->_DepthBuffer and fb->_StencilBuffer fields */
-   _mesa_update_depth_buffer(ctx, fb, BUFFER_DEPTH);
-   if (depthRb && depthRb->PairedStencil)
-      _mesa_update_stencil_buffer(ctx, fb, BUFFER_DEPTH);
-   else
-      _mesa_update_stencil_buffer(ctx, fb, BUFFER_STENCIL);
-
-
-   /* The hardware should use fb->Attachment[BUFFER_DEPTH].Renderbuffer
-    * first, if present, then fb->Attachment[BUFFER_STENCIL].Renderbuffer
-    * if present.
-    */
-}
diff --git a/src/mesa/drivers/dri/intel/intel_depthstencil.h b/src/mesa/drivers/dri/intel/intel_depthstencil.h
deleted file mode 100644
index 740eb0d989..0000000000
--- a/src/mesa/drivers/dri/intel/intel_depthstencil.h
+++ /dev/null
@@ -1,15 +0,0 @@
-
-#ifndef INTEL_DEPTH_STENCIL_H
-#define INTEL_DEPTH_STENCIL_H
-
-#include "intel_fbo.h"
-
-extern void
-intel_unpair_depth_stencil(GLcontext * ctx, struct intel_renderbuffer *irb);
-
-extern void
-intel_validate_paired_depth_stencil(GLcontext * ctx,
-                                    struct gl_framebuffer *fb);
-
-
-#endif /* INTEL_DEPTH_STENCIL_H */
diff --git a/src/mesa/drivers/dri/intel/intel_depthtmp.h b/src/mesa/drivers/dri/intel/intel_depthtmp.h
new file mode 100644
index 0000000000..16d7708453
--- /dev/null
+++ b/src/mesa/drivers/dri/intel/intel_depthtmp.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright © 2009 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+/**
+ * Wrapper around the depthtmp.h macrofest to generate spans code for
+ * all the tiling styles.
+ */
+
+#define VALUE_TYPE INTEL_VALUE_TYPE
+#define WRITE_DEPTH(_x, _y, d) INTEL_WRITE_DEPTH(NO_TILE(_x, _y), d)
+#define READ_DEPTH(d, _x, _y) d = INTEL_READ_DEPTH(NO_TILE(_x, _y))
+#define TAG(x) INTEL_TAG(intel##x)
+#include "depthtmp.h"
+
+#define VALUE_TYPE INTEL_VALUE_TYPE
+#define WRITE_DEPTH(_x, _y, d) INTEL_WRITE_DEPTH(X_TILE(_x, _y), d)
+#define READ_DEPTH(d, _x, _y) d = INTEL_READ_DEPTH(X_TILE(_x, _y))
+#define TAG(x) INTEL_TAG(intel_XTile_##x)
+#include "depthtmp.h"
+
+#define VALUE_TYPE INTEL_VALUE_TYPE
+#define WRITE_DEPTH(_x, _y, d) INTEL_WRITE_DEPTH(Y_TILE(_x, _y), d)
+#define READ_DEPTH(d, _x, _y) d = INTEL_READ_DEPTH(Y_TILE(_x, _y))
+#define TAG(x) INTEL_TAG(intel_YTile_##x)
+#include "depthtmp.h"
+
+#undef INTEL_VALUE_TYPE
+#undef INTEL_WRITE_DEPTH
+#undef INTEL_READ_DEPTH
+#undef INTEL_TAG
diff --git a/src/mesa/drivers/dri/intel/intel_eglimage.c b/src/mesa/drivers/dri/intel/intel_eglimage.c
new file mode 100644
index 0000000000..fed8675c26
--- /dev/null
+++ b/src/mesa/drivers/dri/intel/intel_eglimage.c
@@ -0,0 +1,139 @@
+/**************************************************************************
+ * 
+ * Copyright (C) 2009 Chia-I Wu <olvaffe@gmail.com>
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "main/mtypes.h"
+#include "main/teximage.h"
+
+#include "intel_context.h"
+#include "intel_fbo.h"
+#include "intel_tex.h"
+
+#include "intel_eglimage.h"
+#include "main/eglimage.h"
+
+#include "EGL/internal/eglimage_dri.h"
+
+
+#if FEATURE_OES_EGL_image
+
+
+/* move to intel_fbo.c */
+static void
+copy_renderbuffer(struct intel_renderbuffer *dst,
+                  struct intel_renderbuffer *src)
+{
+   dst->Base.Width          = src->Base.Width;
+   dst->Base.Height         = src->Base.Height;
+   dst->Base.InternalFormat = src->Base.InternalFormat;
+   dst->Base._ActualFormat  = src->Base._ActualFormat;
+   dst->Base._BaseFormat    = src->Base._BaseFormat;
+
+   dst->Base.ColorEncoding  = src->Base.ColorEncoding;
+   dst->Base.ComponentType  = src->Base.ComponentType;
+
+   dst->Base.RedBits        = src->Base.RedBits;
+   dst->Base.GreenBits      = src->Base.GreenBits;
+   dst->Base.BlueBits       = src->Base.BlueBits;
+   dst->Base.AlphaBits      = src->Base.AlphaBits;
+   dst->Base.IndexBits      = src->Base.IndexBits;
+   dst->Base.DepthBits      = src->Base.DepthBits;
+   dst->Base.StencilBits    = src->Base.StencilBits;
+   dst->Base.NumSamples     = src->Base.NumSamples;
+
+   dst->Base.DataType       = src->Base.DataType;
+
+   intel_renderbuffer_set_region(dst, src->region);
+}
+
+static __DRIEGLImage *
+get_image(GLcontext *ctx, GLeglImageOES image)
+{
+   __DRIEGLImage *driImage;
+   /* A missing image or wrong magic raises GL_INVALID_VALUE and yields NULL. */
+   driImage = _eglClientGetImageData((__DRIEGLImageHandle) image);
+   if (!driImage || driImage->magic != __DRI_EGL_IMAGE_MAGIC) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "glEGLImageTargetTexture2DOES(image=%p)", (void *) image);
+      return NULL;
+   }
+
+   return driImage;
+}
+
+void
+intelEGLImageTargetRenderbufferStorage(GLcontext *ctx,
+                                          struct gl_renderbuffer *rb,
+                                          GLeglImageOES image)
+{
+   struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+   __DRIEGLImage *driImage;
+   struct intel_framebuffer *image_fb;
+   struct intel_renderbuffer *image_rb;
+
+   driImage = get_image(ctx, image);
+   if (!driImage)
+      return;
+
+   image_fb = driImage->drawable->driverPrivate;
+   image_rb = image_fb->color_rb[0];
+
+   copy_renderbuffer(irb, image_rb);
+}
+
+
+void
+intelEGLImageTargetTexture2D(GLcontext *ctx,
+                                struct gl_texture_object *texObj,
+                                GLeglImageOES image)
+{
+   struct intel_context *intel = intel_context(ctx);
+   __DRIEGLImage *driImage;
+   GLint glx_texture_format;
+
+   driImage = get_image(ctx, image);
+   if (!driImage)
+      return;
+
+   /* only level == 0 is supported */
+   if (driImage->level) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glEGLImageTargetTexture2DOES(level=%d)", driImage->level);
+      return;
+   }
+
+   glx_texture_format = (driImage->texture_format_rgba)
+      ? GLX_TEXTURE_FORMAT_RGBA_EXT : GLX_TEXTURE_FORMAT_RGB_EXT;
+
+   /* TODO refactor intelSetTexBuffer2 to avoid lock... */
+   _mesa_unlock_texture(ctx, texObj);
+   intelSetTexBuffer2(intel->driContext, texObj->Target, glx_texture_format,
+                      driImage->drawable);
+   _mesa_lock_texture(ctx, texObj);
+}
+
+
+#endif /* FEATURE_OES_EGL_image */
diff --git a/src/mesa/drivers/dri/intel/intel_eglimage.h b/src/mesa/drivers/dri/intel/intel_eglimage.h
new file mode 100644
index 0000000000..ef1bf2eacd
--- /dev/null
+++ b/src/mesa/drivers/dri/intel/intel_eglimage.h
@@ -0,0 +1,61 @@
+/**************************************************************************
+ * 
+ * Copyright (C) 2009 Chia-I Wu <olvaffe@gmail.com>
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish, * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef INTEL_EGLIMAGE_H
+#define INTEL_EGLIMAGE_H
+
+#include "main/mtypes.h"
+
+/* main/eglimage.h might not exist */
+#if FEATURE_OES_EGL_image
+
+#ifndef GL_OES_EGL_image
+typedef void *GLeglImageOES;
+#endif
+
+#include "main/eglimage.h"
+
+extern void
+intelEGLImageTargetRenderbufferStorage(GLcontext *ctx,
+                                       struct gl_renderbuffer *rb,
+                                       GLeglImageOES image);
+
+extern void
+intelEGLImageTargetTexture2D(GLcontext *ctx,
+                             struct gl_texture_object *texObj,
+                             GLeglImageOES image);
+
+#else
+#define _MESA_INIT_EGLIMAGE_FUNCTIONS(driver, impl) do { } while (0)
+#endif /* FEATURE_OES_EGL_image */
+
+static INLINE void
+intelInitEGLImageFuncs(struct dd_function_table *functions)
+{
+   _MESA_INIT_EGLIMAGE_FUNCTIONS(functions, intel);
+}
+
+#endif /* INTEL_EGLIMAGE_H */
diff --git a/src/mesa/drivers/dri/intel/intel_extensions.c b/src/mesa/drivers/dri/intel/intel_extensions.c
new file mode 100644
index 0000000000..21af22fbfb
--- /dev/null
+++ b/src/mesa/drivers/dri/intel/intel_extensions.c
@@ -0,0 +1,213 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "intel_chipset.h"
+#include "intel_context.h"
+#include "intel_extensions.h"
+
+
+#define need_GL_ARB_copy_buffer
+#define need_GL_ARB_draw_elements_base_vertex
+#define need_GL_ARB_framebuffer_object
+#define need_GL_ARB_map_buffer_range
+#define need_GL_ARB_occlusion_query
+#define need_GL_ARB_point_parameters
+#define need_GL_ARB_shader_objects
+#define need_GL_ARB_sync
+#define need_GL_ARB_vertex_array_object
+#define need_GL_ARB_vertex_program
+#define need_GL_ARB_vertex_shader
+#define need_GL_ARB_window_pos
+#define need_GL_EXT_blend_color
+#define need_GL_EXT_blend_equation_separate
+#define need_GL_EXT_blend_func_separate
+#define need_GL_EXT_blend_minmax
+#define need_GL_EXT_cull_vertex
+#define need_GL_EXT_fog_coord
+#define need_GL_EXT_framebuffer_object
+#define need_GL_EXT_framebuffer_blit
+#define need_GL_EXT_gpu_program_parameters
+#define need_GL_EXT_point_parameters
+#define need_GL_EXT_provoking_vertex
+#define need_GL_EXT_secondary_color
+#define need_GL_EXT_stencil_two_side
+#define need_GL_APPLE_vertex_array_object
+#define need_GL_ATI_separate_stencil
+#define need_GL_ATI_envmap_bumpmap
+#define need_GL_NV_point_sprite
+#define need_GL_NV_vertex_program
+#define need_GL_VERSION_2_0
+#define need_GL_VERSION_2_1
+
+#include "extension_helper.h"
+
+
+/**
+ * Extension strings exported by the intel driver.
+ *
+ * Extensions supported by all chips supported by i830_dri, i915_dri, or
+ * i965_dri.
+ */
+static const struct dri_extension card_extensions[] = {
+   { "GL_ARB_copy_buffer",                GL_ARB_copy_buffer_functions },
+   { "GL_ARB_draw_elements_base_vertex",  GL_ARB_draw_elements_base_vertex_functions },
+   { "GL_ARB_half_float_pixel",           NULL },
+   { "GL_ARB_map_buffer_range",           GL_ARB_map_buffer_range_functions },
+   { "GL_ARB_multitexture",               NULL },
+   { "GL_ARB_point_parameters",           GL_ARB_point_parameters_functions },
+   { "GL_ARB_point_sprite",               NULL },
+   { "GL_ARB_sync",                       GL_ARB_sync_functions },
+   { "GL_ARB_texture_border_clamp",       NULL },
+   { "GL_ARB_texture_cube_map",           NULL },
+   { "GL_ARB_texture_env_add",            NULL },
+   { "GL_ARB_texture_env_combine",        NULL },
+   { "GL_ARB_texture_env_crossbar",       NULL },
+   { "GL_ARB_texture_env_dot3",           NULL },
+   { "GL_ARB_texture_mirrored_repeat",    NULL },
+   { "GL_ARB_texture_rectangle",          NULL },
+   { "GL_ARB_vertex_array_object",        GL_ARB_vertex_array_object_functions},
+   { "GL_ARB_vertex_program",             GL_ARB_vertex_program_functions },
+   { "GL_ARB_window_pos",                 GL_ARB_window_pos_functions },
+   { "GL_EXT_blend_color",                GL_EXT_blend_color_functions },
+   { "GL_EXT_blend_equation_separate",    GL_EXT_blend_equation_separate_functions },
+   { "GL_EXT_blend_func_separate",        GL_EXT_blend_func_separate_functions },
+   { "GL_EXT_blend_minmax",               GL_EXT_blend_minmax_functions },
+   { "GL_EXT_blend_logic_op",             NULL },
+   { "GL_EXT_blend_subtract",             NULL },
+   { "GL_EXT_cull_vertex",                GL_EXT_cull_vertex_functions },
+   { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
+   { "GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions },
+   { "GL_EXT_packed_depth_stencil",       NULL },
+   { "GL_EXT_provoking_vertex",           GL_EXT_provoking_vertex_functions },
+   { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
+   { "GL_EXT_stencil_wrap",               NULL },
+   { "GL_EXT_texture_edge_clamp",         NULL },
+   { "GL_EXT_texture_env_combine",        NULL },
+   { "GL_EXT_texture_env_dot3",           NULL },
+   { "GL_EXT_texture_filter_anisotropic", NULL },
+   { "GL_EXT_texture_lod_bias",           NULL },
+   { "GL_3DFX_texture_compression_FXT1",  NULL },
+   { "GL_APPLE_client_storage",           NULL },
+   { "GL_APPLE_vertex_array_object",      GL_APPLE_vertex_array_object_functions},
+   { "GL_MESA_pack_invert",               NULL },
+   { "GL_MESA_ycbcr_texture",             NULL },
+   { "GL_NV_blend_square",                NULL },
+   { "GL_NV_point_sprite",                GL_NV_point_sprite_functions },
+   { "GL_NV_vertex_program",              GL_NV_vertex_program_functions },
+   { "GL_NV_vertex_program1_1",           NULL },
+   { "GL_SGIS_generate_mipmap",           NULL },
+   { NULL, NULL }
+};
+
+
+/** i915 / i945-only extensions */
+static const struct dri_extension i915_extensions[] = {
+   { "GL_ARB_depth_texture",              NULL },
+   { "GL_ARB_fragment_program",           NULL },
+   { "GL_ARB_shadow",                     NULL },
+   { "GL_ARB_texture_non_power_of_two",   NULL },
+   { "GL_ATI_separate_stencil",           GL_ATI_separate_stencil_functions },
+   { "GL_ATI_texture_env_combine3",       NULL },
+   { "GL_EXT_shadow_funcs",               NULL },
+   { "GL_EXT_stencil_two_side",           GL_EXT_stencil_two_side_functions },
+   { "GL_NV_texture_env_combine4",        NULL },
+   { NULL,                                NULL }
+};
+
+
+/** i965-only extensions */
+static const struct dri_extension brw_extensions[] = {
+   { "GL_ARB_depth_clamp",                NULL },
+   { "GL_ARB_depth_texture",              NULL },
+   { "GL_ARB_fragment_program",           NULL },
+   { "GL_ARB_fragment_program_shadow",    NULL },
+   { "GL_ARB_fragment_shader",            NULL },
+   { "GL_ARB_framebuffer_object",         GL_ARB_framebuffer_object_functions},
+   { "GL_ARB_occlusion_query",            GL_ARB_occlusion_query_functions },
+   { "GL_ARB_point_sprite", 		  NULL },
+   { "GL_ARB_seamless_cube_map",          NULL },
+   { "GL_ARB_shader_objects",             GL_ARB_shader_objects_functions },
+   { "GL_ARB_shading_language_100",       GL_VERSION_2_0_functions },
+   { "GL_ARB_shading_language_120",       GL_VERSION_2_1_functions },
+   { "GL_ARB_shadow",                     NULL },
+   { "GL_MESA_texture_signed_rgba",       NULL },
+   { "GL_ARB_texture_non_power_of_two",   NULL },
+   { "GL_ARB_vertex_shader",              GL_ARB_vertex_shader_functions },
+   { "GL_EXT_shadow_funcs",               NULL },
+   { "GL_EXT_stencil_two_side",           GL_EXT_stencil_two_side_functions },
+   { "GL_EXT_texture_sRGB",		  NULL },
+   { "GL_EXT_texture_swizzle",		  NULL },
+   { "GL_EXT_vertex_array_bgra",	  NULL },
+   { "GL_ATI_envmap_bumpmap",             GL_ATI_envmap_bumpmap_functions },
+   { "GL_ATI_separate_stencil",           GL_ATI_separate_stencil_functions },
+   { "GL_ATI_texture_env_combine3",       NULL },
+   { "GL_NV_texture_env_combine4",        NULL },
+   { NULL,                                NULL }
+};
+
+
+static const struct dri_extension arb_oq_extensions[] = {
+   { NULL, NULL }
+};
+
+
+static const struct dri_extension ttm_extensions[] = {
+   { "GL_ARB_pixel_buffer_object",      NULL },
+   { "GL_EXT_framebuffer_blit",         GL_EXT_framebuffer_blit_functions },
+   { "GL_EXT_framebuffer_object",       GL_EXT_framebuffer_object_functions },
+#if FEATURE_OES_EGL_image
+   { "GL_OES_EGL_image",                NULL },
+#endif
+   { NULL, NULL }
+};
+
+
+/**
+ * With ctx == NULL, set up the potential extension list from every table; with
+ * a context, enable only the tables matching intel->ttm and the chipset ID.
+ */
+void
+intelInitExtensions(GLcontext *ctx, GLboolean enable_imaging)
+{
+   struct intel_context *intel = ctx?intel_context(ctx):NULL;
+
+   /* The caller's enable_imaging is deliberately overridden: imaging stays off
+    * until convolution is working in the teximage paths. */
+   enable_imaging = GL_FALSE;
+
+   driInitExtensions(ctx, card_extensions, enable_imaging);
+
+   if (intel == NULL || intel->ttm)
+      driInitExtensions(ctx, ttm_extensions, GL_FALSE);
+
+   if (intel == NULL || IS_965(intel->intelScreen->deviceID))
+      driInitExtensions(ctx, brw_extensions, GL_FALSE);
+
+   if (intel == NULL || IS_915(intel->intelScreen->deviceID)
+       || IS_945(intel->intelScreen->deviceID))
+      driInitExtensions(ctx, i915_extensions, GL_FALSE);
+}
diff --git a/src/mesa/drivers/dri/i915/i915_tex.c b/src/mesa/drivers/dri/intel/intel_extensions.h
index e38d8fe79d..97147ecdb0 100644
--- a/src/mesa/drivers/dri/i915/i915_tex.c
+++ b/src/mesa/drivers/dri/intel/intel_extensions.h
@@ -25,54 +25,12 @@
  * 
  **************************************************************************/
 
-#include "main/glheader.h"
-#include "main/mtypes.h"
-#include "main/imports.h"
-#include "main/simple_list.h"
-#include "main/enums.h"
-#include "main/image.h"
-#include "main/mm.h"
-#include "main/texstore.h"
-#include "main/texformat.h"
-#include "swrast/swrast.h"
+#ifndef INTEL_EXTENSIONS_H
+#define INTEL_EXTENSIONS_H
 
-#include "texmem.h"
 
-#include "i915_context.h"
-#include "i915_reg.h"
+extern void
+intelInitExtensions(GLcontext *ctx, GLboolean enable_imaging);
 
 
-
-static void
-i915TexEnv(GLcontext * ctx, GLenum target,
-           GLenum pname, const GLfloat * param)
-{
-   struct i915_context *i915 = I915_CONTEXT(ctx);
-
-   switch (pname) {
-   case GL_TEXTURE_LOD_BIAS:{
-         GLuint unit = ctx->Texture.CurrentUnit;
-         GLint b = (int) ((*param) * 16.0);
-         if (b > 255)
-            b = 255;
-         if (b < -256)
-            b = -256;
-         I915_STATECHANGE(i915, I915_UPLOAD_TEX(unit));
-         i915->lodbias_ss2[unit] =
-            ((b << SS2_LOD_BIAS_SHIFT) & SS2_LOD_BIAS_MASK);
-         break;
-      }
-
-   default:
-      break;
-   }
-}
-
-
-void
-i915InitTextureFuncs(struct dd_function_table *functions)
-{
-/*
-   functions->TexEnv = i915TexEnv;
-*/
-}
+#endif
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c b/src/mesa/drivers/dri/intel/intel_fbo.c
index 7cf12619d6..084095fc96 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.c
+++ b/src/mesa/drivers/dri/intel/intel_fbo.c
@@ -27,6 +27,7 @@
 
 
 #include "main/imports.h"
+#include "main/macros.h"
 #include "main/mtypes.h"
 #include "main/fbobject.h"
 #include "main/framebuffer.h"
@@ -34,61 +35,17 @@
 #include "main/context.h"
 #include "main/texformat.h"
 #include "main/texrender.h"
+#include "drivers/common/meta.h"
 
 #include "intel_context.h"
 #include "intel_buffers.h"
-#include "intel_depthstencil.h"
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
 #include "intel_regions.h"
-#include "intel_span.h"
 
 
 #define FILE_DEBUG_FLAG DEBUG_FBO
 
-#define INTEL_RB_CLASS 0x12345678
-
-
-/* XXX FBO: move this to intel_context.h (inlined) */
-/**
- * Return a gl_renderbuffer ptr casted to intel_renderbuffer.
- * NULL will be returned if the rb isn't really an intel_renderbuffer.
- * This is determiend by checking the ClassID.
- */
-struct intel_renderbuffer *
-intel_renderbuffer(struct gl_renderbuffer *rb)
-{
-   struct intel_renderbuffer *irb = (struct intel_renderbuffer *) rb;
-   if (irb && irb->Base.ClassID == INTEL_RB_CLASS) {
-      /*_mesa_warning(NULL, "Returning non-intel Rb\n");*/
-      return irb;
-   }
-   else
-      return NULL;
-}
-
-
-struct intel_renderbuffer *
-intel_get_renderbuffer(struct gl_framebuffer *fb, int attIndex)
-{
-   if (attIndex >= 0)
-      return intel_renderbuffer(fb->Attachment[attIndex].Renderbuffer);
-   else
-      return NULL;
-}
-
-struct intel_region *
-intel_get_rb_region(struct gl_framebuffer *fb, GLuint attIndex)
-{
-   struct intel_renderbuffer *irb = intel_get_renderbuffer(fb, attIndex);
-
-   if (irb)
-      return irb->region;
-   else
-      return NULL;
-}
-
-
 
 /**
  * Create a new framebuffer object.
@@ -103,6 +60,7 @@ intel_new_framebuffer(GLcontext * ctx, GLuint name)
 }
 
 
+/** Called by gl_renderbuffer::Delete() */
 static void
 intel_delete_renderbuffer(struct gl_renderbuffer *rb)
 {
@@ -112,10 +70,6 @@ intel_delete_renderbuffer(struct gl_renderbuffer *rb)
 
    ASSERT(irb);
 
-   if (irb->PairedStencil || irb->PairedDepth) {
-      intel_unpair_depth_stencil(ctx, irb);
-   }
-
    if (irb->span_cache != NULL)
       _mesa_free(irb->span_cache);
 
@@ -127,7 +81,6 @@ intel_delete_renderbuffer(struct gl_renderbuffer *rb)
 }
 
 
-
 /**
  * Return a pointer to a specific pixel in a renderbuffer.
  */
@@ -142,7 +95,6 @@ intel_get_pointer(GLcontext * ctx, struct gl_renderbuffer *rb,
 }
 
 
-
 /**
  * Called via glRenderbufferStorageEXT() to set the format and allocate
  * storage for a user-created renderbuffer.
@@ -168,6 +120,7 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->RedBits = 5;
       rb->GreenBits = 6;
       rb->BlueBits = 5;
+      irb->texformat = &_mesa_texformat_rgb565;
       cpp = 2;
       break;
    case GL_RGB:
@@ -181,6 +134,7 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->GreenBits = 8;
       rb->BlueBits = 8;
       rb->AlphaBits = 0;
+      irb->texformat = &_mesa_texformat_argb8888; /* XXX: Need xrgb8888 */
       cpp = 4;
       break;
    case GL_RGBA:
@@ -197,6 +151,7 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->GreenBits = 8;
       rb->BlueBits = 8;
       rb->AlphaBits = 8;
+      irb->texformat = &_mesa_texformat_argb8888;
       cpp = 4;
       break;
    case GL_STENCIL_INDEX:
@@ -209,20 +164,15 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->DataType = GL_UNSIGNED_INT_24_8_EXT;
       rb->StencilBits = 8;
       cpp = 4;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    case GL_DEPTH_COMPONENT16:
-#if 0
       rb->_ActualFormat = GL_DEPTH_COMPONENT16;
       rb->DataType = GL_UNSIGNED_SHORT;
       rb->DepthBits = 16;
       cpp = 2;
+      irb->texformat = &_mesa_texformat_z16;
       break;
-#else
-      /* fall-through.
-       * 16bpp depth renderbuffer can't be paired with a stencil buffer so
-       * always used combined depth/stencil format.
-       */
-#endif
    case GL_DEPTH_COMPONENT:
    case GL_DEPTH_COMPONENT24:
    case GL_DEPTH_COMPONENT32:
@@ -230,6 +180,7 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->DataType = GL_UNSIGNED_INT_24_8_EXT;
       rb->DepthBits = 24;
       cpp = 4;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    case GL_DEPTH_STENCIL_EXT:
    case GL_DEPTH24_STENCIL8_EXT:
@@ -238,6 +189,7 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       rb->DepthBits = 24;
       rb->StencilBits = 8;
       cpp = 4;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    default:
       _mesa_problem(ctx,
@@ -266,7 +218,9 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       DBG("Allocating %d x %d Intel RBO (pitch %d)\n", width,
 	  height, pitch);
 
-      irb->region = intel_region_alloc(intel, cpp, width, height, pitch);
+      irb->region = intel_region_alloc(intel, I915_TILING_NONE,
+				       cpp, width, height, pitch,
+				       GL_TRUE);
       if (!irb->region)
          return GL_FALSE;       /* out of memory? */
 
@@ -280,7 +234,6 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
 }
 
 
-
 /**
  * Called for each hardware renderbuffer when a _window_ is resized.
  * Just update fields.
@@ -298,6 +251,7 @@ intel_alloc_window_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
    return GL_TRUE;
 }
 
+
 static void
 intel_resize_buffers(GLcontext *ctx, struct gl_framebuffer *fb,
 		     GLuint width, GLuint height)
@@ -324,6 +278,8 @@ intel_resize_buffers(GLcontext *ctx, struct gl_framebuffer *fb,
    }
 }
 
+
+/** Dummy function for gl_renderbuffer::AllocStorage() */
 static GLboolean
 intel_nop_alloc_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
                         GLenum internalFormat, GLuint width, GLuint height)
@@ -343,10 +299,9 @@ intel_renderbuffer_set_region(struct intel_renderbuffer *rb,
    rb->region = NULL;
    intel_region_reference(&rb->region, region);
    intel_region_release(&old);
-
-   rb->pfPitch = region->pitch;
 }
 
+
 /**
  * Create a new intel_renderbuffer which corresponds to an on-screen window,
  * not a user-created renderbuffer.
@@ -376,6 +331,17 @@ intel_create_renderbuffer(GLenum intFormat)
       irb->Base.GreenBits = 6;
       irb->Base.BlueBits = 5;
       irb->Base.DataType = GL_UNSIGNED_BYTE;
+      irb->texformat = &_mesa_texformat_rgb565;
+      break;
+   case GL_RGB8:
+      irb->Base._ActualFormat = GL_RGB8;
+      irb->Base._BaseFormat = GL_RGB;
+      irb->Base.RedBits = 8;
+      irb->Base.GreenBits = 8;
+      irb->Base.BlueBits = 8;
+      irb->Base.AlphaBits = 0;
+      irb->Base.DataType = GL_UNSIGNED_BYTE;
+      irb->texformat = &_mesa_texformat_argb8888; /* XXX: Need xrgb8888 */
       break;
    case GL_RGBA8:
       irb->Base._ActualFormat = GL_RGBA8;
@@ -385,24 +351,28 @@ intel_create_renderbuffer(GLenum intFormat)
       irb->Base.BlueBits = 8;
       irb->Base.AlphaBits = 8;
       irb->Base.DataType = GL_UNSIGNED_BYTE;
+      irb->texformat = &_mesa_texformat_argb8888;
       break;
    case GL_STENCIL_INDEX8_EXT:
       irb->Base._ActualFormat = GL_STENCIL_INDEX8_EXT;
       irb->Base._BaseFormat = GL_STENCIL_INDEX;
       irb->Base.StencilBits = 8;
       irb->Base.DataType = GL_UNSIGNED_BYTE;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    case GL_DEPTH_COMPONENT16:
       irb->Base._ActualFormat = GL_DEPTH_COMPONENT16;
       irb->Base._BaseFormat = GL_DEPTH_COMPONENT;
       irb->Base.DepthBits = 16;
       irb->Base.DataType = GL_UNSIGNED_SHORT;
+      irb->texformat = &_mesa_texformat_z16;
       break;
    case GL_DEPTH_COMPONENT24:
       irb->Base._ActualFormat = GL_DEPTH24_STENCIL8_EXT;
       irb->Base._BaseFormat = GL_DEPTH_COMPONENT;
       irb->Base.DepthBits = 24;
       irb->Base.DataType = GL_UNSIGNED_INT;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    case GL_DEPTH24_STENCIL8_EXT:
       irb->Base._ActualFormat = GL_DEPTH24_STENCIL8_EXT;
@@ -410,6 +380,7 @@ intel_create_renderbuffer(GLenum intFormat)
       irb->Base.DepthBits = 24;
       irb->Base.StencilBits = 8;
       irb->Base.DataType = GL_UNSIGNED_INT_24_8_EXT;
+      irb->texformat = &_mesa_texformat_s8_z24;
       break;
    default:
       _mesa_problem(NULL,
@@ -466,9 +437,6 @@ intel_bind_framebuffer(GLcontext * ctx, GLenum target,
 {
    if (target == GL_FRAMEBUFFER_EXT || target == GL_DRAW_FRAMEBUFFER_EXT) {
       intel_draw_buffer(ctx, fb);
-      /* Integer depth range depends on depth buffer bits */
-      if (ctx->Driver.DepthRange != NULL)
-	 ctx->Driver.DepthRange(ctx, ctx->Viewport.Near, ctx->Viewport.Far);
    }
    else {
       /* don't need to do anything if target == GL_READ_FRAMEBUFFER_EXT */
@@ -492,10 +460,13 @@ intel_framebuffer_renderbuffer(GLcontext * ctx,
    intel_draw_buffer(ctx, fb);
 }
 
+
 static GLboolean
 intel_update_wrapper(GLcontext *ctx, struct intel_renderbuffer *irb, 
 		     struct gl_texture_image *texImage)
 {
+   irb->texformat = texImage->TexFormat;
+
    if (texImage->TexFormat == &_mesa_texformat_argb8888) {
       irb->Base._ActualFormat = GL_RGBA8;
       irb->Base._BaseFormat = GL_RGBA;
@@ -505,9 +476,21 @@ intel_update_wrapper(GLcontext *ctx, struct intel_renderbuffer *irb,
    else if (texImage->TexFormat == &_mesa_texformat_rgb565) {
       irb->Base._ActualFormat = GL_RGB5;
       irb->Base._BaseFormat = GL_RGB;
-      irb->Base.DataType = GL_UNSIGNED_SHORT;
+      irb->Base.DataType = GL_UNSIGNED_BYTE;
       DBG("Render to RGB5 texture OK\n");
    }
+   else if (texImage->TexFormat == &_mesa_texformat_argb1555) {
+      irb->Base._ActualFormat = GL_RGB5_A1;
+      irb->Base._BaseFormat = GL_RGBA;
+      irb->Base.DataType = GL_UNSIGNED_BYTE;
+      DBG("Render to ARGB1555 texture OK\n");
+   }
+   else if (texImage->TexFormat == &_mesa_texformat_argb4444) {
+      irb->Base._ActualFormat = GL_RGBA4;
+      irb->Base._BaseFormat = GL_RGBA;
+      irb->Base.DataType = GL_UNSIGNED_BYTE;
+      DBG("Render to ARGB4444 texture OK\n");
+   }
    else if (texImage->TexFormat == &_mesa_texformat_z16) {
       irb->Base._ActualFormat = GL_DEPTH_COMPONENT16;
       irb->Base._BaseFormat = GL_DEPTH_COMPONENT;
@@ -534,15 +517,15 @@ intel_update_wrapper(GLcontext *ctx, struct intel_renderbuffer *irb,
    irb->Base.BlueBits = texImage->TexFormat->BlueBits;
    irb->Base.AlphaBits = texImage->TexFormat->AlphaBits;
    irb->Base.DepthBits = texImage->TexFormat->DepthBits;
+   irb->Base.StencilBits = texImage->TexFormat->StencilBits;
 
    irb->Base.Delete = intel_delete_renderbuffer;
    irb->Base.AllocStorage = intel_nop_alloc_storage;
 
-   irb->RenderToTexture = GL_TRUE;
-
    return GL_TRUE;
 }
 
+
 /**
  * When glFramebufferTexture[123]D is called this function sets up the
  * gl_renderbuffer wrapper around the texture image.
@@ -551,7 +534,7 @@ intel_update_wrapper(GLcontext *ctx, struct intel_renderbuffer *irb,
 static struct intel_renderbuffer *
 intel_wrap_texture(GLcontext * ctx, struct gl_texture_image *texImage)
 {
-   const GLuint name = ~0;      /* not significant, but distinct for debugging */
+   const GLuint name = ~0;   /* not significant, but distinct for debugging */
    struct intel_renderbuffer *irb;
 
    /* make an intel_renderbuffer to wrap the texture image */
@@ -594,14 +577,16 @@ intel_render_texture(GLcontext * ctx,
 
    ASSERT(newImage);
 
-   if (newImage->Border != 0) {
-      /* Fallback on drawing to a texture with a border, which won't have a
-       * miptree.
+   intel_image = intel_texture_image(newImage);
+   if (!intel_image->mt) {
+      /* Fallback on drawing to a texture that doesn't have a miptree
+       * (has a border, width/height 0, etc.)
        */
-       _mesa_reference_renderbuffer(&att->Renderbuffer, NULL);
-       _mesa_render_texture(ctx, fb, att);
-       return;
-   } else if (!irb) {
+      _mesa_reference_renderbuffer(&att->Renderbuffer, NULL);
+      _mesa_render_texture(ctx, fb, att);
+      return;
+   }
+   else if (!irb) {
       irb = intel_wrap_texture(ctx, newImage);
       if (irb) {
          /* bind the wrapper to the attachment point */
@@ -612,7 +597,9 @@ intel_render_texture(GLcontext * ctx,
          _mesa_render_texture(ctx, fb, att);
          return;
       }
-   } if (!intel_update_wrapper(ctx, irb, newImage)) {
+   }
+
+   if (!intel_update_wrapper(ctx, irb, newImage)) {
        _mesa_reference_renderbuffer(&att->Renderbuffer, NULL);
        _mesa_render_texture(ctx, fb, att);
        return;
@@ -624,7 +611,6 @@ intel_render_texture(GLcontext * ctx,
        irb->Base.RefCount);
 
    /* point the renderbufer's region to the texture image region */
-   intel_image = intel_texture_image(newImage);
    if (irb->region != intel_image->mt->region) {
       if (irb->region)
 	 intel_region_release(&irb->region);
@@ -657,18 +643,59 @@ static void
 intel_finish_render_texture(GLcontext * ctx,
                             struct gl_renderbuffer_attachment *att)
 {
-   struct intel_renderbuffer *irb = intel_renderbuffer(att->Renderbuffer);
+   /* no-op
+    * Previously we released the renderbuffer's intel_region but
+    * that's not necessary and actually caused problems when trying
+    * to do a glRead/CopyPixels from the renderbuffer later.
+    * The region will be released later if the texture is replaced
+    * or the renderbuffer deleted.
+    *
+    * The intention of this driver hook is more of a "done rendering
+    * to texture, please re-twiddle/etc if necessary".
+    */
+}
 
-   DBG("End render texture (tid %x) tex %u\n", _glthread_GetID(), att->Texture->Name);
 
-   if (irb) {
-      /* just release the region */
-      intel_region_release(&irb->region);
+/**
+ * Additional "completeness" testing of fb; on failure mark it UNSUPPORTED.
+ */
+static void
+intel_validate_framebuffer(GLcontext *ctx, struct gl_framebuffer *fb)
+{
+   const struct intel_renderbuffer *depthRb =
+      intel_get_renderbuffer(fb, BUFFER_DEPTH);
+   const struct intel_renderbuffer *stencilRb =
+      intel_get_renderbuffer(fb, BUFFER_STENCIL);
+   GLuint i;
+
+   if (stencilRb && stencilRb != depthRb) {
+      /* we only support combined depth/stencil buffers, not separate
+       * stencil buffers.
+       */
+      fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
    }
-   else if (att->Renderbuffer) {
-      /* software fallback */
-      _mesa_finish_render_texture(ctx, att);
-      /* XXX FBO: Need to unmap the buffer (or in intelSpanRenderStart???) */
+   /* Inspect fb's own color buffers: fb need not be ctx->DrawBuffer. */
+   for (i = 0; i < ctx->Const.MaxDrawBuffers; i++) {
+      struct gl_renderbuffer *rb = fb->_ColorDrawBuffers[i];
+      struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+
+      if (rb == NULL)
+	 continue;
+
+      if (irb == NULL) {
+	 fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
+	 continue;
+      }
+
+      switch (irb->texformat->MesaFormat) {
+      case MESA_FORMAT_ARGB8888:
+      case MESA_FORMAT_RGB565:
+      case MESA_FORMAT_ARGB1555:
+      case MESA_FORMAT_ARGB4444:
+	 break;
+      default:
+	 fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
+      }
    }
 }
 
@@ -687,4 +714,8 @@ intel_fbo_init(struct intel_context *intel)
    intel->ctx.Driver.RenderTexture = intel_render_texture;
    intel->ctx.Driver.FinishRenderTexture = intel_finish_render_texture;
    intel->ctx.Driver.ResizeBuffers = intel_resize_buffers;
+   intel->ctx.Driver.ValidateFramebuffer = intel_validate_framebuffer;
+#if FEATURE_EXT_framebuffer_blit
+   intel->ctx.Driver.BlitFramebuffer = _mesa_meta_blit_framebuffer;
+#endif
 }
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.h b/src/mesa/drivers/dri/intel/intel_fbo.h
index b7e9280e8c..f0665af482 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.h
+++ b/src/mesa/drivers/dri/intel/intel_fbo.h
@@ -55,19 +55,13 @@ struct intel_framebuffer
 
 /**
  * Intel renderbuffer, derived from gl_renderbuffer.
- * Note: The PairedDepth and PairedStencil fields use renderbuffer IDs,
- * not pointers because in some circumstances a deleted renderbuffer could
- * result in a dangling pointer here.
  */
 struct intel_renderbuffer
 {
    struct gl_renderbuffer Base;
    struct intel_region *region;
-   GLuint pfPitch;              /* possibly paged flipped pitch */
-   GLboolean RenderToTexture;   /* RTT? */
 
-   GLuint PairedDepth;   /**< only used if this is a depth renderbuffer */
-   GLuint PairedStencil; /**< only used if this is a stencil renderbuffer */
+   const struct gl_texture_format *texformat;
 
    GLuint vbl_pending;   /**< vblank sequence number of pending flip */
 
@@ -75,48 +69,70 @@ struct intel_renderbuffer
    unsigned long span_cache_offset;
 };
 
-extern struct intel_renderbuffer *intel_renderbuffer(struct gl_renderbuffer
-                                                     *rb);
+
+/**
+ * gl_renderbuffer is a base class which we subclass.  The ClassID field
+ * is used for simple run-time type checking.
+ */
+#define INTEL_RB_CLASS 0x12345678
+
+
+/**
+ * Downcast a gl_renderbuffer pointer to an intel_renderbuffer pointer.
+ * Returns NULL if rb is NULL or its ClassID is not INTEL_RB_CLASS,
+ * i.e. when rb isn't really an intel_renderbuffer.
+ */
+static INLINE struct intel_renderbuffer *
+intel_renderbuffer(struct gl_renderbuffer *rb)
+{
+   struct intel_renderbuffer *irb = (struct intel_renderbuffer *) rb;
+   if (irb && irb->Base.ClassID == INTEL_RB_CLASS) {
+      /* ClassID tag matches, so the cast above is valid */
+      return irb;
+   }
+   else
+      return NULL;
+}
+
+
+/**
+ * Return fb's renderbuffer at BUFFER_x index attIndex; NULL if negative or non-intel.
+ */
+static INLINE struct intel_renderbuffer *
+intel_get_renderbuffer(struct gl_framebuffer *fb, int attIndex)
+{
+   if (attIndex >= 0)
+      return intel_renderbuffer(fb->Attachment[attIndex].Renderbuffer);
+   else
+      return NULL;
+}
+
 
 extern void
 intel_renderbuffer_set_region(struct intel_renderbuffer *irb,
 			      struct intel_region *region);
 
+
 extern struct intel_renderbuffer *
 intel_create_renderbuffer(GLenum intFormat);
 
-extern void intel_fbo_init(struct intel_context *intel);
-
-
-/* XXX make inline or macro */
-extern struct intel_renderbuffer *intel_get_renderbuffer(struct gl_framebuffer
-                                                         *fb,
-                                                         int attIndex);
-
-extern void intel_flip_renderbuffers(struct intel_framebuffer *intel_fb);
 
+extern void
+intel_fbo_init(struct intel_context *intel);
 
-/* XXX make inline or macro */
-extern struct intel_region *intel_get_rb_region(struct gl_framebuffer *fb,
-                                                GLuint attIndex);
 
+extern void
+intel_flip_renderbuffers(struct intel_framebuffer *intel_fb);
 
 
-/**
- * Are we currently rendering into a texture?
- */
-static INLINE GLboolean
-intel_rendering_to_texture(const GLcontext *ctx)
+static INLINE struct intel_region *
+intel_get_rb_region(struct gl_framebuffer *fb, GLuint attIndex)
 {
-   if (ctx->DrawBuffer->Name) {
-      /* User-created FBO */
-      const struct intel_renderbuffer *irb =
-         intel_renderbuffer(ctx->DrawBuffer->_ColorDrawBuffers[0]);
-      return irb && irb->RenderToTexture;
-   }
-   else {
-      return GL_FALSE;
-   }
+   struct intel_renderbuffer *irb = intel_get_renderbuffer(fb, attIndex);
+   if (irb)
+      return irb->region;
+   else
+      return NULL;
 }
 
 
diff --git a/src/mesa/drivers/dri/intel/intel_generatemipmap.c b/src/mesa/drivers/dri/intel/intel_generatemipmap.c
new file mode 100644
index 0000000000..0052abb42d
--- /dev/null
+++ b/src/mesa/drivers/dri/intel/intel_generatemipmap.c
@@ -0,0 +1,286 @@
+/*
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
+ * Copyright © 2009 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+#include "main/glheader.h"
+#include "main/enums.h"
+#include "main/image.h"
+#include "main/mtypes.h"
+#include "main/macros.h"
+#include "main/bufferobj.h"
+#include "main/teximage.h"
+#include "main/texenv.h"
+#include "main/texobj.h"
+#include "main/texstate.h"
+#include "main/texparam.h"
+#include "main/varray.h"
+#include "main/attrib.h"
+#include "main/enable.h"
+#include "main/buffers.h"
+#include "main/fbobject.h"
+#include "main/framebuffer.h"
+#include "main/renderbuffer.h"
+#include "main/depth.h"
+#include "main/hash.h"
+#include "main/mipmap.h"
+#include "main/blend.h"
+#include "swrast/swrast.h"
+
+#include "intel_screen.h"
+#include "intel_context.h"
+#include "intel_batchbuffer.h"
+#include "intel_pixel.h"
+#include "intel_tex.h"
+#include "intel_mipmap_tree.h"
+
+#if FEATURE_attrib_stack
+static const char *intel_fp_tex2d =
+      "!!ARBfp1.0\n"
+      "TEX result.color, fragment.texcoord[0], texture[0], 2D;\n"
+      "END\n";
+
+static GLboolean
+intel_generate_mipmap_level(GLcontext *ctx, GLuint tex_name,
+			    int level, int width, int height)
+{
+   struct intel_context *intel = intel_context(ctx);
+   GLfloat vertices[4][2];   /* corners (0,0)..(width,height) of the quad */
+   GLint status;
+
+   /* Restrict sampling to level - 1 so the draw sources the previous level */
+   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, level - 1);
+   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, level - 1);
+
+   /* Attach the requested level of tex_name so we draw into that level */
+   _mesa_FramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT,
+				 GL_COLOR_ATTACHMENT0_EXT,
+				 GL_TEXTURE_2D,
+				 tex_name,
+				 level);
+   /* Choose to render to the color attachment. */
+   _mesa_DrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
+
+   status = _mesa_CheckFramebufferStatusEXT (GL_FRAMEBUFFER_EXT);
+   if (status != GL_FRAMEBUFFER_COMPLETE_EXT)
+      return GL_FALSE;   /* incomplete FBO: nothing was drawn for this level */
+
+   meta_set_passthrough_transform(&intel->meta);
+
+   /* XXX: Doing it right would involve setting up the transformation to do
+    * 0-1 mapping or something, and not changing the vertex data.
+    */
+   vertices[0][0] = 0;
+   vertices[0][1] = 0;
+   vertices[1][0] = width;
+   vertices[1][1] = 0;
+   vertices[2][0] = width;
+   vertices[2][1] = height;
+   vertices[3][0] = 0;
+   vertices[3][1] = height;
+
+   _mesa_VertexPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &vertices);
+   _mesa_Enable(GL_VERTEX_ARRAY);
+   meta_set_default_texrect(&intel->meta);
+
+   _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);   /* 4 vertices: one quad */
+
+   meta_restore_texcoords(&intel->meta);
+   meta_restore_transform(&intel->meta);
+
+   return GL_TRUE;
+}
+
+/**
+ * GPU mipmap generation for a 2D texture: each level is drawn into a
+ * temporary FBO while sampling the previous level.  Returns GL_FALSE when
+ * a level could not be rendered, GL_TRUE otherwise.
+ */
+static GLboolean
+intel_generate_mipmap_2d(GLcontext *ctx,
+			 GLenum target,
+			 struct gl_texture_object *texObj)
+{
+   struct intel_context *intel = intel_context(ctx);
+   GLint old_active_texture;
+   int level, max_levels, start_level, end_level;
+   GLuint fb_name;
+   GLboolean success = GL_FALSE;
+   struct gl_framebuffer *saved_fbo = NULL;
+
+   _mesa_PushAttrib(GL_ENABLE_BIT | GL_TEXTURE_BIT | GL_CURRENT_BIT |
+		    GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+   _mesa_PushClientAttrib(GL_CLIENT_VERTEX_ARRAY_BIT);
+   old_active_texture = ctx->Texture.CurrentUnit;
+   _mesa_reference_framebuffer(&saved_fbo, ctx->DrawBuffer);
+
+   _mesa_Disable(GL_POLYGON_STIPPLE);
+   _mesa_Disable(GL_DEPTH_TEST);
+   _mesa_Disable(GL_STENCIL_TEST);
+   _mesa_ColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
+   _mesa_DepthMask(GL_FALSE);
+
+   /* Bind the given texture to GL_TEXTURE_2D on unit 0 with linear
+    * filtering for our minification. */
+   _mesa_ActiveTextureARB(GL_TEXTURE0_ARB);
+   _mesa_Enable(GL_TEXTURE_2D);
+   _mesa_BindTexture(GL_TEXTURE_2D, texObj->Name);
+   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER,
+		       GL_LINEAR_MIPMAP_NEAREST);
+   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+
+   /* Create and bind a temporary framebuffer object to render into. */
+   _mesa_GenFramebuffersEXT(1, &fb_name);
+   _mesa_BindFramebufferEXT(GL_FRAMEBUFFER_EXT, fb_name);
+
+   meta_set_fragment_program(&intel->meta, &intel->meta.tex2d_fp,
+			     intel_fp_tex2d);
+   meta_set_passthrough_vertex_program(&intel->meta);
+
+   max_levels = _mesa_max_texture_levels(ctx, texObj->Target);
+   start_level = texObj->BaseLevel;
+   end_level = texObj->MaxLevel;
+
+   /* Loop generating level+1 from level. */
+   for (level = start_level; level < end_level && level < max_levels - 1; level++) {
+      const struct gl_texture_image *srcImage;
+      int width, height;
+
+      srcImage = _mesa_select_tex_image(ctx, texObj, target, level);
+      if (srcImage == NULL) {
+	 /* No source image at this level: nothing to downsample from. */
+	 break;
+      }
+      if (srcImage->Border != 0)
+	 goto fail;
+
+      width = MAX2(srcImage->Width / 2, 1);
+      height = MAX2(srcImage->Height / 2, 1);
+
+      if (width == srcImage->Width && height == srcImage->Height) {
+	 /* Neither _mesa_max_texture_levels nor texObj->MaxLevel are the
+	  * maximum texture level for the object, so break out when we've gone
+	  * over the edge. */
+	 break;
+      }
+
+      /* Make sure that there's space allocated for the target level.
+       * We could skip this if there's already space allocated and save some
+       * time. */
+      _mesa_TexImage2D(GL_TEXTURE_2D, level + 1, srcImage->InternalFormat,
+		       width, height, 0,
+		       GL_RGBA, GL_UNSIGNED_INT, NULL);
+
+      if (!intel_generate_mipmap_level(ctx, texObj->Name, level + 1,
+				       width, height))
+	 goto fail;
+   }
+
+   success = GL_TRUE;
+
+fail:
+   meta_restore_fragment_program(&intel->meta);
+   meta_restore_vertex_program(&intel->meta);
+
+   _mesa_DeleteFramebuffersEXT(1, &fb_name);
+   _mesa_ActiveTextureARB(GL_TEXTURE0_ARB + old_active_texture);
+   if (saved_fbo)
+      _mesa_BindFramebufferEXT(GL_FRAMEBUFFER_EXT, saved_fbo->Name);
+   _mesa_reference_framebuffer(&saved_fbo, NULL);
+   _mesa_PopClientAttrib();
+   _mesa_PopAttrib();
+
+   return success;
+}
+#endif /* FEATURE_attrib_stack */
+
+
+/**
+ * Generate new mipmap data from BASE+1 to BASE+p (the minimally-sized mipmap
+ * level).
+ *
+ * The texture object's miptree must be mapped.
+ *
+ * It would be really nice if this was just called by Mesa whenever mipmaps
+ * needed to be regenerated, rather than us having to remember to do so in
+ * each texture image modification path.
+ *
+ * A hardware (FBO) path is tried first for 2D textures, then the SW path.
+ */
+void
+intel_generate_mipmap(GLcontext *ctx, GLenum target,
+                      struct gl_texture_object *texObj)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_texture_object *intelObj = intel_texture_object(texObj);
+   GLuint nr_faces = (intelObj->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
+   int face, i;
+
+#if FEATURE_attrib_stack
+   /* HW path */
+   if (target == GL_TEXTURE_2D &&
+       ctx->Extensions.EXT_framebuffer_object &&
+       ctx->Extensions.ARB_fragment_program &&
+       ctx->Extensions.ARB_vertex_program) {
+      GLboolean success;
+
+      /* We'll be accessing this texture using GL entrypoints, which should
+       * be resilient against other access to this texture. */
+      _mesa_unlock_texture(ctx, texObj);
+      success = intel_generate_mipmap_2d(ctx, target, texObj);
+      _mesa_lock_texture(ctx, texObj);
+
+      if (success)
+	 return;
+   }
+#endif /* FEATURE_attrib_stack */
+
+   /* SW path */
+   intel_tex_map_level_images(intel, intelObj, texObj->BaseLevel);
+   _mesa_generate_mipmap(ctx, target, texObj);
+   intel_tex_unmap_level_images(intel, intelObj, texObj->BaseLevel);
+
+   /* Update the level information in our private data in the new images, since
+    * it didn't get set as part of a normal TexImage path.
+    */
+   for (face = 0; face < nr_faces; face++) {
+      /* Clamp: MaxLevel can be larger than Image[]'s MAX_TEXTURE_LEVELS. */
+      for (i = texObj->BaseLevel + 1;
+	   i < texObj->MaxLevel && i < MAX_TEXTURE_LEVELS; i++) {
+         struct intel_texture_image *intelImage;
+
+	 intelImage = intel_texture_image(texObj->Image[face][i]);
+	 if (intelImage == NULL)
+	    break;
+
+	 intelImage->level = i;
+	 intelImage->face = face;
+	 /* Unreference the miptree to signal that the new Data is a bare
+	  * pointer from mesa. */
+	 intel_miptree_release(intel, &intelImage->mt);
+      }
+   }
+}
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
index bf1c3f03f0..c985da5aa2 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
@@ -57,14 +57,16 @@ intel_miptree_create_internal(struct intel_context *intel,
 			      GLuint last_level,
 			      GLuint width0,
 			      GLuint height0,
-			      GLuint depth0, GLuint cpp, GLuint compress_byte)
+			      GLuint depth0, GLuint cpp, GLuint compress_byte,
+			      uint32_t tiling)
 {
    GLboolean ok;
    struct intel_mipmap_tree *mt = calloc(sizeof(*mt), 1);
 
-   DBG("%s target %s format %s level %d..%d\n", __FUNCTION__,
+   DBG("%s target %s format %s level %d..%d <-- %p\n", __FUNCTION__,
        _mesa_lookup_enum_by_nr(target),
-       _mesa_lookup_enum_by_nr(internal_format), first_level, last_level);
+       _mesa_lookup_enum_by_nr(internal_format), 
+       first_level, last_level, mt);
 
    mt->target = target_to_target(target);
    mt->internal_format = internal_format;
@@ -80,15 +82,16 @@ intel_miptree_create_internal(struct intel_context *intel,
 
 #ifdef I915
    if (IS_945(intel->intelScreen->deviceID))
-      ok = i945_miptree_layout(intel, mt);
+      ok = i945_miptree_layout(intel, mt, tiling);
    else
-      ok = i915_miptree_layout(intel, mt);
+      ok = i915_miptree_layout(intel, mt, tiling);
 #else
-   ok = brw_miptree_layout(intel, mt);
+   ok = brw_miptree_layout(intel, mt, tiling);
 #endif
 
    if (!ok) {
       free(mt);
+      DBG("%s not okay - returning NULL\n", __FUNCTION__);
       return NULL;
    }
 
@@ -98,18 +101,33 @@ intel_miptree_create_internal(struct intel_context *intel,
 struct intel_mipmap_tree *
 intel_miptree_create(struct intel_context *intel,
 		     GLenum target,
+		     GLenum base_format,
 		     GLenum internal_format,
 		     GLuint first_level,
 		     GLuint last_level,
 		     GLuint width0,
 		     GLuint height0,
-		     GLuint depth0, GLuint cpp, GLuint compress_byte)
+		     GLuint depth0, GLuint cpp, GLuint compress_byte,
+		     GLboolean expect_accelerated_upload)
 {
    struct intel_mipmap_tree *mt;
+   uint32_t tiling;
+
+   if (intel->use_texture_tiling && compress_byte == 0 &&
+       intel->intelScreen->kernel_exec_fencing) {
+      if (IS_965(intel->intelScreen->deviceID) &&
+	  (base_format == GL_DEPTH_COMPONENT ||
+	   base_format == GL_DEPTH_STENCIL_EXT))
+	 tiling = I915_TILING_Y;
+      else
+	 tiling = I915_TILING_X;
+   } else
+      tiling = I915_TILING_NONE;
 
    mt = intel_miptree_create_internal(intel, target, internal_format,
 				      first_level, last_level, width0,
-				      height0, depth0, cpp, compress_byte);
+				      height0, depth0, cpp, compress_byte,
+				      tiling);
    /*
     * pitch == 0 || height == 0  indicates the null texture
     */
@@ -117,10 +135,12 @@ intel_miptree_create(struct intel_context *intel,
       return NULL;
 
    mt->region = intel_region_alloc(intel,
+				   tiling,
 				   mt->cpp,
 				   mt->pitch,
 				   mt->total_height,
-				   mt->pitch);
+				   mt->pitch,
+				   expect_accelerated_upload);
 
    if (!mt->region) {
        free(mt);
@@ -145,7 +165,8 @@ intel_miptree_create_for_region(struct intel_context *intel,
    mt = intel_miptree_create_internal(intel, target, internal_format,
 				      first_level, last_level,
 				      region->width, region->height, 1,
-				      region->cpp, compress_byte);
+				      region->cpp, compress_byte,
+				      I915_TILING_NONE);
    if (!mt)
       return mt;
 #if 0
@@ -183,6 +204,7 @@ intel_miptree_create_for_region(struct intel_context *intel,
 
 int intel_miptree_pitch_align (struct intel_context *intel,
 			       struct intel_mipmap_tree *mt,
+			       uint32_t tiling,
 			       int pitch)
 {
 #ifdef I915
@@ -203,6 +225,11 @@ int intel_miptree_pitch_align (struct intel_context *intel,
 	 pitch_align = 4;
       }
 
+      if (tiling == I915_TILING_X)
+	 pitch_align = 512;
+      else if (tiling == I915_TILING_Y)
+	 pitch_align = 128;
+
       pitch = ALIGN(pitch * mt->cpp, pitch_align);
 
 #ifdef I915
@@ -326,23 +353,31 @@ intel_miptree_set_level_info(struct intel_mipmap_tree *mt,
 }
 
 
-
 void
-intel_miptree_set_image_offset(struct intel_mipmap_tree *mt,
-			       GLuint level, GLuint img,
-			       GLuint x, GLuint y)
+intel_miptree_set_image_offset_ex(struct intel_mipmap_tree *mt,
+                                  GLuint level, GLuint img,
+                                  GLuint x, GLuint y, 
+                                  GLuint offset)
 {
    if (img == 0 && level == 0)
       assert(x == 0 && y == 0);
 
    assert(img < mt->level[level].nr_images);
 
-   mt->level[level].image_offset[img] = (x + y * mt->pitch) * mt->cpp;
+   mt->level[level].image_offset[img] = (x + y * mt->pitch) * mt->cpp + offset;
 
    DBG("%s level %d img %d pos %d,%d image_offset %x\n",
        __FUNCTION__, level, img, x, y, mt->level[level].image_offset[img]);
 }
 
+void
+intel_miptree_set_image_offset(struct intel_mipmap_tree *mt,
+			       GLuint level, GLuint img,
+			       GLuint x, GLuint y)
+{
+    intel_miptree_set_image_offset_ex(mt, level, img, x, y, 0);
+}
+
 
 /* Although we use the image_offset[] array to store relative offsets
  * to cube faces, Mesa doesn't know anything about this and expects
@@ -452,11 +487,11 @@ intel_miptree_image_data(struct intel_context *intel,
 			0, 0,                             /* source x, y */
 			dst->level[level].width, height); /* width, height */
 
-      src += src_image_pitch * dst->cpp;
+      src = (char *)src + src_image_pitch * dst->cpp;
    }
 }
 
-extern GLuint intel_compressed_alignment(GLenum);
+extern void intel_get_texture_alignment_unit(GLenum, GLuint *, GLuint *);
 /* Copy mipmap image between trees
  */
 void
@@ -473,20 +508,37 @@ intel_miptree_image_copy(struct intel_context *intel,
    const GLuint *dst_depth_offset = intel_miptree_depth_offsets(dst, level);
    const GLuint *src_depth_offset = intel_miptree_depth_offsets(src, level);
    GLuint i;
+   GLboolean success;
 
    if (dst->compressed) {
-       GLuint alignment = intel_compressed_alignment(dst->internal_format);
+       GLuint align_w, align_h;
+
+       intel_get_texture_alignment_unit(dst->internal_format, &align_w, &align_h);
        height = (height + 3) / 4;
-       width = ((width + alignment - 1) & ~(alignment - 1));
+       width = ALIGN(width, align_w);
    }
 
    for (i = 0; i < depth; i++) {
-      intel_region_copy(intel,
-                        dst->region, dst_offset + dst_depth_offset[i],
-                        0,
-                        0,
-                        src->region, src_offset + src_depth_offset[i],
-                        0, 0, width, height);
+      success = intel_region_copy(intel,
+				  dst->region, dst_offset + dst_depth_offset[i],
+				  0, 0,
+				  src->region, src_offset + src_depth_offset[i],
+				  0, 0, width, height, GL_COPY);
+      if (!success) {
+	 GLubyte *src_ptr, *dst_ptr;
+
+	 src_ptr = intel_region_map(intel, src->region);
+	 dst_ptr = intel_region_map(intel, dst->region);
+
+	 _mesa_copy_rect(dst_ptr + dst_offset + dst_depth_offset[i],
+			 dst->cpp,
+			 dst->pitch,
+			 0, 0, width, height,
+			 src_ptr + src_offset + src_depth_offset[i],
+			 src->pitch,
+			 0, 0);
+	 intel_region_unmap(intel, src->region);
+	 intel_region_unmap(intel, dst->region);
+      }
    }
-
 }
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.h b/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
index c9537dbb9a..c890b2a0d0 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
@@ -126,6 +126,7 @@ struct intel_mipmap_tree
 
 struct intel_mipmap_tree *intel_miptree_create(struct intel_context *intel,
                                                GLenum target,
+                                               GLenum base_format,
                                                GLenum internal_format,
                                                GLuint first_level,
                                                GLuint last_level,
@@ -133,7 +134,8 @@ struct intel_mipmap_tree *intel_miptree_create(struct intel_context *intel,
                                                GLuint height0,
                                                GLuint depth0,
                                                GLuint cpp,
-                                               GLuint compress_byte);
+                                               GLuint compress_byte,
+					       GLboolean expect_accelerated_upload);
 
 struct intel_mipmap_tree *
 intel_miptree_create_for_region(struct intel_context *intel,
@@ -147,6 +149,7 @@ intel_miptree_create_for_region(struct intel_context *intel,
 
 int intel_miptree_pitch_align (struct intel_context *intel,
 			       struct intel_mipmap_tree *mt,
+			       uint32_t tiling,
 			       int pitch);
 
 void intel_miptree_reference(struct intel_mipmap_tree **dst,
@@ -193,6 +196,11 @@ void intel_miptree_set_level_info(struct intel_mipmap_tree *mt,
                                   GLuint x, GLuint y,
                                   GLuint w, GLuint h, GLuint d);
 
+void intel_miptree_set_image_offset_ex(struct intel_mipmap_tree *mt,
+                                       GLuint level,
+                                       GLuint img, GLuint x, GLuint y,
+                                       GLuint offset);
+
 void intel_miptree_set_image_offset(struct intel_mipmap_tree *mt,
                                     GLuint level,
                                     GLuint img, GLuint x, GLuint y);
@@ -217,10 +225,13 @@ void intel_miptree_image_copy(struct intel_context *intel,
 /* i915_mipmap_tree.c:
  */
 GLboolean i915_miptree_layout(struct intel_context *intel,
-			      struct intel_mipmap_tree *mt);
+			      struct intel_mipmap_tree *mt,
+			      uint32_t tiling);
 GLboolean i945_miptree_layout(struct intel_context *intel,
-			      struct intel_mipmap_tree *mt);
+			      struct intel_mipmap_tree *mt,
+			      uint32_t tiling);
 GLboolean brw_miptree_layout(struct intel_context *intel,
-			     struct intel_mipmap_tree *mt);
+			     struct intel_mipmap_tree *mt,
+			     uint32_t tiling);
 
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_pixel.c b/src/mesa/drivers/dri/intel/intel_pixel.c
index cf2f32d384..b5850df9ff 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel.c
@@ -25,11 +25,17 @@
  * 
  **************************************************************************/
 
+#include "main/accum.h"
+#include "main/drawpix.h"
 #include "main/enums.h"
 #include "main/state.h"
+#include "main/bufferobj.h"
 #include "main/context.h"
 #include "main/enable.h"
 #include "main/matrix.h"
+#include "main/texstate.h"
+#include "main/varray.h"
+#include "main/viewport.h"
 #include "swrast/swrast.h"
 #include "shader/arbprogram.h"
 #include "shader/program.h"
@@ -112,7 +118,7 @@ intel_check_blit_fragment_ops(GLcontext * ctx, GLboolean src_alpha_is_one)
       return GL_FALSE;
    }
 
-   if (ctx->Stencil.Enabled) {
+   if (ctx->Stencil._Enabled) {
       DBG("fallback due to image stencil\n");
       return GL_FALSE;
    }
@@ -173,179 +179,12 @@ intel_check_blit_format(struct intel_region * region,
 }
 
 void
-intel_meta_set_passthrough_transform(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-
-   intel->meta.saved_vp_x = ctx->Viewport.X;
-   intel->meta.saved_vp_y = ctx->Viewport.Y;
-   intel->meta.saved_vp_width = ctx->Viewport.Width;
-   intel->meta.saved_vp_height = ctx->Viewport.Height;
-
-   _mesa_Viewport(0, 0, ctx->DrawBuffer->Width, ctx->DrawBuffer->Height);
-
-   _mesa_MatrixMode(GL_PROJECTION);
-   _mesa_PushMatrix();
-   _mesa_LoadIdentity();
-   _mesa_Ortho(0, ctx->DrawBuffer->Width, 0, ctx->DrawBuffer->Height, 1, -1);
-
-   _mesa_MatrixMode(GL_MODELVIEW);
-   _mesa_PushMatrix();
-   _mesa_LoadIdentity();
-}
-
-void
-intel_meta_restore_transform(struct intel_context *intel)
-{
-   _mesa_MatrixMode(GL_PROJECTION);
-   _mesa_PopMatrix();
-   _mesa_MatrixMode(GL_MODELVIEW);
-   _mesa_PopMatrix();
-
-   _mesa_Viewport(intel->meta.saved_vp_x, intel->meta.saved_vp_y,
-		  intel->meta.saved_vp_width, intel->meta.saved_vp_height);
-}
-
-/**
- * Set up a vertex program to pass through the position and first texcoord
- * for pixel path.
- */
-void
-intel_meta_set_passthrough_vertex_program(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-   static const char *vp =
-      "!!ARBvp1.0\n"
-      "TEMP vertexClip;\n"
-      "DP4 vertexClip.x, state.matrix.mvp.row[0], vertex.position;\n"
-      "DP4 vertexClip.y, state.matrix.mvp.row[1], vertex.position;\n"
-      "DP4 vertexClip.z, state.matrix.mvp.row[2], vertex.position;\n"
-      "DP4 vertexClip.w, state.matrix.mvp.row[3], vertex.position;\n"
-      "MOV result.position, vertexClip;\n"
-      "MOV result.texcoord[0], vertex.texcoord[0];\n"
-      "MOV result.color, vertex.color;\n"
-      "END\n";
-
-   assert(intel->meta.saved_vp == NULL);
-
-   _mesa_reference_vertprog(ctx, &intel->meta.saved_vp,
-			    ctx->VertexProgram.Current);
-   if (intel->meta.passthrough_vp == NULL) {
-      GLuint prog_name;
-      _mesa_GenPrograms(1, &prog_name);
-      _mesa_BindProgram(GL_VERTEX_PROGRAM_ARB, prog_name);
-      _mesa_ProgramStringARB(GL_VERTEX_PROGRAM_ARB,
-			     GL_PROGRAM_FORMAT_ASCII_ARB,
-			     strlen(vp), (const GLubyte *)vp);
-      _mesa_reference_vertprog(ctx, &intel->meta.passthrough_vp,
-			       ctx->VertexProgram.Current);
-      _mesa_DeletePrograms(1, &prog_name);
-   }
-
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
-   _mesa_reference_vertprog(ctx, &ctx->VertexProgram.Current,
-			    intel->meta.passthrough_vp);
-   ctx->Driver.BindProgram(ctx, GL_VERTEX_PROGRAM_ARB,
-			   &intel->meta.passthrough_vp->Base);
-
-   intel->meta.saved_vp_enable = ctx->VertexProgram.Enabled;
-   _mesa_Enable(GL_VERTEX_PROGRAM_ARB);
-}
-
-/**
- * Restores the previous vertex program after
- * intel_meta_set_passthrough_vertex_program()
- */
-void
-intel_meta_restore_vertex_program(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
-   _mesa_reference_vertprog(ctx, &ctx->VertexProgram.Current,
-			    intel->meta.saved_vp);
-   _mesa_reference_vertprog(ctx, &intel->meta.saved_vp, NULL);
-   ctx->Driver.BindProgram(ctx, GL_VERTEX_PROGRAM_ARB,
-			   &ctx->VertexProgram.Current->Base);
-
-   if (!intel->meta.saved_vp_enable)
-      _mesa_Disable(GL_VERTEX_PROGRAM_ARB);
-}
-
-/**
- * Binds the given program string to GL_FRAGMENT_PROGRAM_ARB, caching the
- * program object.
- */
-void
-intel_meta_set_fragment_program(struct intel_context *intel,
-				struct gl_fragment_program **prog,
-				const char *prog_string)
-{
-   GLcontext *ctx = &intel->ctx;
-   assert(intel->meta.saved_fp == NULL);
-
-   _mesa_reference_fragprog(ctx, &intel->meta.saved_fp,
-			    ctx->FragmentProgram.Current);
-   if (*prog == NULL) {
-      GLuint prog_name;
-      _mesa_GenPrograms(1, &prog_name);
-      _mesa_BindProgram(GL_FRAGMENT_PROGRAM_ARB, prog_name);
-      _mesa_ProgramStringARB(GL_FRAGMENT_PROGRAM_ARB,
-			     GL_PROGRAM_FORMAT_ASCII_ARB,
-			     strlen(prog_string), (const GLubyte *)prog_string);
-      _mesa_reference_fragprog(ctx, prog, ctx->FragmentProgram.Current);
-      /* Note that DeletePrograms unbinds the program on us */
-      _mesa_DeletePrograms(1, &prog_name);
-   }
-
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
-   _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current, *prog);
-   ctx->Driver.BindProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, &((*prog)->Base));
-
-   intel->meta.saved_fp_enable = ctx->FragmentProgram.Enabled;
-   _mesa_Enable(GL_FRAGMENT_PROGRAM_ARB);
-}
-
-/**
- * Restores the previous fragment program after
- * intel_meta_set_fragment_program()
- */
-void
-intel_meta_restore_fragment_program(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
-   _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current,
-			    intel->meta.saved_fp);
-   _mesa_reference_fragprog(ctx, &intel->meta.saved_fp, NULL);
-   ctx->Driver.BindProgram(ctx, GL_FRAGMENT_PROGRAM_ARB,
-			   &ctx->FragmentProgram.Current->Base);
-
-   if (!intel->meta.saved_fp_enable)
-      _mesa_Disable(GL_FRAGMENT_PROGRAM_ARB);
-}
-
-void
 intelInitPixelFuncs(struct dd_function_table *functions)
 {
-   functions->Accum = _swrast_Accum;
+   _MESA_INIT_ACCUM_FUNCTIONS(functions, _swrast_);
    if (!getenv("INTEL_NO_BLIT")) {
-      functions->Bitmap = intelBitmap;
-      functions->CopyPixels = intelCopyPixels;
-      functions->DrawPixels = intelDrawPixels;
-#ifdef I915
-      functions->ReadPixels = intelReadPixels;
-#endif
+      _MESA_INIT_DRAWPIX_FUNCTIONS(functions, intel);
    }
-}
-
-void
-intel_free_pixel_state(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-
-   _mesa_reference_vertprog(ctx, &intel->meta.passthrough_vp, NULL);
-   _mesa_reference_fragprog(ctx, &intel->meta.bitmap_fp, NULL);
+   functions->ReadPixels = intelReadPixels;
 }
 
diff --git a/src/mesa/drivers/dri/intel/intel_pixel.h b/src/mesa/drivers/dri/intel/intel_pixel.h
index 76b8781316..96a6dd17b2 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel.h
+++ b/src/mesa/drivers/dri/intel/intel_pixel.h
@@ -31,16 +31,6 @@
 #include "main/mtypes.h"
 
 void intelInitPixelFuncs(struct dd_function_table *functions);
-void intel_meta_set_passthrough_transform(struct intel_context *intel);
-void intel_meta_restore_transform(struct intel_context *intel);
-void intel_meta_set_passthrough_vertex_program(struct intel_context *intel);
-void intel_meta_restore_vertex_program(struct intel_context *intel);
-void intel_meta_set_fragment_program(struct intel_context *intel,
-				     struct gl_fragment_program **prog,
-				     const char *prog_string);
-void intel_meta_restore_fragment_program(struct intel_context *intel);
-void intel_free_pixel_state(struct intel_context *intel);
-
 GLboolean intel_check_blit_fragment_ops(GLcontext * ctx,
 					GLboolean src_alpha_is_one);
 
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
index 3a01f63dc7..9a0bcc07a5 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
@@ -42,6 +42,7 @@
 #include "main/varray.h"
 #include "main/attrib.h"
 #include "main/enable.h"
+#include "main/viewport.h"
 #include "shader/arbprogram.h"
 #include "glapi/dispatch.h"
 #include "swrast/swrast.h"
@@ -92,19 +93,12 @@ static const GLubyte *map_pbo( GLcontext *ctx,
    return ADD_POINTERS(buf, bitmap);
 }
 
-static GLboolean test_bit( const GLubyte *src,
-			    GLuint bit )
+static GLboolean test_bit( const GLubyte *src, GLuint bit )
 {
    return (src[bit/8] & (1<<(bit % 8))) ? 1 : 0;
 }
 
-static GLboolean test_msb_bit(const GLubyte *src, GLuint bit)
-{
-   return (src[bit/8] & (1<<(7 - (bit % 8)))) ? 1 : 0;
-}
-
-static void set_bit( GLubyte *dest,
-			  GLuint bit )
+static void set_bit( GLubyte *dest, GLuint bit )
 {
    dest[bit/8] |= 1 << (bit % 8);
 }
@@ -194,7 +188,7 @@ do_blit_bitmap( GLcontext *ctx,
    struct gl_framebuffer *fb = ctx->DrawBuffer;
    GLfloat tmpColor[4];
    GLubyte ubcolor[4];
-   GLuint color8888, color565;
+   GLuint color;
    unsigned int num_cliprects;
    drm_clip_rect_t *cliprects;
    int x_off, y_off;
@@ -215,7 +209,7 @@ do_blit_bitmap( GLcontext *ctx,
    if (!dst)
        return GL_FALSE;
 
-   if (unpack->BufferObj->Name) {
+   if (_mesa_is_bufferobj(unpack->BufferObj)) {
       bitmap = map_pbo(ctx, width, height, unpack, bitmap);
       if (bitmap == NULL)
 	 return GL_TRUE;	/* even though this is an error, we're done */
@@ -232,8 +226,11 @@ do_blit_bitmap( GLcontext *ctx,
    UNCLAMPED_FLOAT_TO_UBYTE(ubcolor[2], tmpColor[2]);
    UNCLAMPED_FLOAT_TO_UBYTE(ubcolor[3], tmpColor[3]);
 
-   color8888 = INTEL_PACKCOLOR8888(ubcolor[0], ubcolor[1], ubcolor[2], ubcolor[3]);
-   color565 = INTEL_PACKCOLOR565(ubcolor[0], ubcolor[1], ubcolor[2]);
+   if (dst->cpp == 2)
+      color = INTEL_PACKCOLOR565(ubcolor[0], ubcolor[1], ubcolor[2]);
+   else
+      color = INTEL_PACKCOLOR8888(ubcolor[0], ubcolor[1],
+				  ubcolor[2], ubcolor[3]);
 
    if (!intel_check_blit_fragment_ops(ctx, tmpColor[3] == 1.0F))
       return GL_FALSE;
@@ -307,21 +304,21 @@ do_blit_bitmap( GLcontext *ctx,
 				   fb->Name == 0 ? GL_TRUE : GL_FALSE) == 0)
 		  continue;
 
-	       /* 
-		*/
-	       intelEmitImmediateColorExpandBlit( intel,
-						  dst->cpp,
-						  (GLubyte *)stipple, 
-						  sz,
-						  (dst->cpp == 2) ? color565 : color8888,
-						  dst->pitch,
-						  dst->buffer,
-						  0,
-						  dst->tiling,
-						  box_x + px,
-						  box_y + py,
-						  w, h,
-						  logic_op);
+	       if (!intelEmitImmediateColorExpandBlit(intel,
+						      dst->cpp,
+						      (GLubyte *)stipple,
+						      sz,
+						      color,
+						      dst->pitch,
+						      dst->buffer,
+						      0,
+						      dst->tiling,
+						      box_x + px,
+						      box_y + py,
+						      w, h,
+						      logic_op)) {
+		  return GL_FALSE;
+	       }
 	    } 
 	 } 
       }
@@ -332,7 +329,7 @@ out:
    if (INTEL_DEBUG & DEBUG_SYNC)
       intel_batchbuffer_flush(intel->batch);
 
-   if (unpack->BufferObj->Name) {
+   if (_mesa_is_bufferobj(unpack->BufferObj)) {
       /* done with PBO so unmap it now */
       ctx->Driver.UnmapBuffer(ctx, GL_PIXEL_UNPACK_BUFFER_EXT,
                               unpack->BufferObj);
@@ -360,11 +357,8 @@ intel_texture_bitmap(GLcontext * ctx,
       "END\n";
    GLuint texname;
    GLfloat vertices[4][4];
-   GLfloat texcoords[4][2];
    GLint old_active_texture;
-   GLubyte *unpacked_bitmap;
    GLubyte *a8_bitmap;
-   int x, y;
    GLfloat dst_z;
 
    /* We need a fragment program for the KIL effect */
@@ -401,6 +395,20 @@ intel_texture_bitmap(GLcontext * ctx,
       return GL_FALSE;
    }
 
+   if (!ctx->Extensions.ARB_texture_non_power_of_two &&
+       (!is_power_of_two(width) || !is_power_of_two(height))) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr,
+		 "glBitmap() fallback: NPOT texture\n");
+      return GL_FALSE;
+   }
+
+   if (ctx->Fog.Enabled) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "glBitmap() fallback: fog\n");
+      return GL_FALSE;
+   }
+
    /* Check that we can load in a texture this big. */
    if (width > (1 << (ctx->Const.MaxTextureLevels - 1)) ||
        height > (1 << (ctx->Const.MaxTextureLevels - 1))) {
@@ -410,23 +418,17 @@ intel_texture_bitmap(GLcontext * ctx,
       return GL_FALSE;
    }
 
-   /* Convert the A1 bitmap to an A8 format suitable for glTexImage */
-   if (unpack->BufferObj->Name) {
+   if (_mesa_is_bufferobj(unpack->BufferObj)) {
       bitmap = map_pbo(ctx, width, height, unpack, bitmap);
       if (bitmap == NULL)
 	 return GL_TRUE;	/* even though this is an error, we're done */
    }
-   unpacked_bitmap = _mesa_unpack_bitmap(width, height, bitmap,
-					 unpack);
+
+   /* Convert the A1 bitmap to an A8 format suitable for glTexImage */
    a8_bitmap = _mesa_calloc(width * height);
-   for (y = 0; y < height; y++) {
-      for (x = 0; x < width; x++) {
-	 if (test_msb_bit(unpacked_bitmap, ALIGN(width, 8) * y + x))
-	    a8_bitmap[y * width + x] = 0xff;
-      }
-   }
-   _mesa_free(unpacked_bitmap);
-   if (unpack->BufferObj->Name) {
+   _mesa_expand_bitmap(width, height, unpack, bitmap, a8_bitmap, width, 0xff);
+
+   if (_mesa_is_bufferobj(unpack->BufferObj)) {
       /* done with PBO so unmap it now */
       ctx->Driver.UnmapBuffer(ctx, GL_PIXEL_UNPACK_BUFFER_EXT,
                               unpack->BufferObj);
@@ -459,15 +461,18 @@ intel_texture_bitmap(GLcontext * ctx,
 		    GL_ALPHA, GL_UNSIGNED_BYTE, a8_bitmap);
    _mesa_free(a8_bitmap);
 
-   intel_meta_set_fragment_program(intel, &intel->meta.bitmap_fp, fp);
+   meta_set_fragment_program(&intel->meta, &intel->meta.bitmap_fp, fp);
    _mesa_ProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 0,
 				     ctx->Current.RasterColor);
-   intel_meta_set_passthrough_vertex_program(intel);
-   intel_meta_set_passthrough_transform(intel);
+   meta_set_passthrough_vertex_program(&intel->meta);
+   meta_set_passthrough_transform(&intel->meta);
 
    /* convert rasterpos Z from [0,1] to NDC coord in [-1,1] */
    dst_z = -1.0 + 2.0 * ctx->Current.RasterPos[2];
 
+   /* RasterPos[2] already takes into account the DepthRange mapping. */
+   _mesa_DepthRange(0.0, 1.0);
+
    vertices[0][0] = dst_x;
    vertices[0][1] = dst_y;
    vertices[0][2] = dst_z;
@@ -485,24 +490,15 @@ intel_texture_bitmap(GLcontext * ctx,
    vertices[3][2] = dst_z;
    vertices[3][3] = 1.0;
 
-   texcoords[0][0] = 0.0;
-   texcoords[0][1] = 0.0;
-   texcoords[1][0] = 1.0;
-   texcoords[1][1] = 0.0;
-   texcoords[2][0] = 1.0;
-   texcoords[2][1] = 1.0;
-   texcoords[3][0] = 0.0;
-   texcoords[3][1] = 1.0;
-
    _mesa_VertexPointer(4, GL_FLOAT, 4 * sizeof(GLfloat), &vertices);
-   _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &texcoords);
    _mesa_Enable(GL_VERTEX_ARRAY);
-   _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
-   CALL_DrawArrays(ctx->Exec, (GL_TRIANGLE_FAN, 0, 4));
+   meta_set_default_texrect(&intel->meta);
+   _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
 
-   intel_meta_restore_transform(intel);
-   intel_meta_restore_fragment_program(intel);
-   intel_meta_restore_vertex_program(intel);
+   meta_restore_texcoords(&intel->meta);
+   meta_restore_transform(&intel->meta);
+   meta_restore_fragment_program(&intel->meta);
+   meta_restore_vertex_program(&intel->meta);
 
    _mesa_PopClientAttrib();
    _mesa_Disable(GL_TEXTURE_2D); /* asserted that it was disabled at entry */
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_copy.c b/src/mesa/drivers/dri/intel/intel_pixel_copy.c
index 7c7aa6097c..07ca8f7ddb 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_copy.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_copy.c
@@ -26,18 +26,13 @@
  **************************************************************************/
 
 #include "main/glheader.h"
-#include "main/enums.h"
 #include "main/image.h"
 #include "main/state.h"
 #include "main/mtypes.h"
-#include "main/macros.h"
-#include "swrast/swrast.h"
+#include "drivers/common/meta.h"
 
-#include "intel_screen.h"
 #include "intel_context.h"
-#include "intel_batchbuffer.h"
 #include "intel_buffers.h"
-#include "intel_blit.h"
 #include "intel_regions.h"
 #include "intel_pixel.h"
 
@@ -87,7 +82,7 @@ intel_check_copypixel_blit_fragment_ops(GLcontext * ctx)
             ctx->Color.AlphaEnabled ||
             ctx->Depth.Test ||
             ctx->Fog.Enabled ||
-            ctx->Stencil.Enabled ||
+            ctx->Stencil._Enabled ||
             !ctx->Color.ColorMask[0] ||
             !ctx->Color.ColorMask[1] ||
             !ctx->Color.ColorMask[2] ||
@@ -97,162 +92,6 @@ intel_check_copypixel_blit_fragment_ops(GLcontext * ctx)
 	    ctx->Color.BlendEnabled);
 }
 
-#ifdef I915
-/* Doesn't work for overlapping regions.  Could do a double copy or
- * just fallback.
- */
-static GLboolean
-do_texture_copypixels(GLcontext * ctx,
-                      GLint srcx, GLint srcy,
-                      GLsizei width, GLsizei height,
-                      GLint dstx, GLint dsty, GLenum type)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct intel_region *dst = intel_drawbuf_region(intel);
-   struct intel_region *src = copypix_src_region(intel, type);
-   GLenum src_format;
-   GLenum src_type;
-
-   DBG("%s %d,%d %dx%d --> %d,%d\n", __FUNCTION__, 
-       srcx, srcy, width, height, dstx, dsty);
-
-   if (!src || !dst || type != GL_COLOR)
-      return GL_FALSE;
-
-   if (ctx->_ImageTransferState) {
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         fprintf(stderr, "%s: check_color failed\n", __FUNCTION__);
-      return GL_FALSE;
-   }
-
-   /* Can't handle overlapping regions.  Don't have sufficient control
-    * over rasterization to pull it off in-place.  Punt on these for
-    * now.
-    * 
-    * XXX: do a copy to a temporary. 
-    */
-   if (src->buffer == dst->buffer) {
-      drm_clip_rect_t srcbox;
-      drm_clip_rect_t dstbox;
-      drm_clip_rect_t tmp;
-
-      srcbox.x1 = srcx;
-      srcbox.y1 = srcy;
-      srcbox.x2 = srcx + width;
-      srcbox.y2 = srcy + height;
-
-      if (ctx->Pixel.ZoomX > 0) {
-	 dstbox.x1 = dstx;
-	 dstbox.x2 = dstx + width * ctx->Pixel.ZoomX;
-      } else {
-	 dstbox.x1 = dstx + width * ctx->Pixel.ZoomX;
-	 dstbox.x2 = dstx;
-      }
-      if (ctx->Pixel.ZoomY > 0) {
-	 dstbox.y1 = dsty;
-	 dstbox.y2 = dsty + height * ctx->Pixel.ZoomY;
-      } else {
-	 dstbox.y1 = dsty + height * ctx->Pixel.ZoomY;
-	 dstbox.y2 = dsty;
-      }
-
-      DBG("src %d,%d %d,%d\n", srcbox.x1, srcbox.y1, srcbox.x2, srcbox.y2);
-      DBG("dst %d,%d %d,%d (%dx%d) (%f,%f)\n", dstbox.x1, dstbox.y1, dstbox.x2, dstbox.y2,
-	  width, height, ctx->Pixel.ZoomX, ctx->Pixel.ZoomY);
-
-      if (intel_intersect_cliprects(&tmp, &srcbox, &dstbox)) {
-         DBG("%s: regions overlap\n", __FUNCTION__);
-         return GL_FALSE;
-      }
-   }
-
-   intelFlush(&intel->ctx);
-
-   intel->vtbl.install_meta_state(intel);
-
-   /* Is this true?  Also will need to turn depth testing on according
-    * to state:
-    */
-   intel->vtbl.meta_no_stencil_write(intel);
-   intel->vtbl.meta_no_depth_write(intel);
-
-   /* Set the 3d engine to draw into the destination region:
-    */
-   intel->vtbl.meta_draw_region(intel, dst, intel->depth_region);
-
-   intel->vtbl.meta_import_pixel_state(intel);
-
-   if (src->cpp == 2) {
-      src_format = GL_RGB;
-      src_type = GL_UNSIGNED_SHORT_5_6_5;
-   }
-   else {
-      src_format = GL_BGRA;
-      src_type = GL_UNSIGNED_BYTE;
-   }
-
-   /* Set the frontbuffer up as a large rectangular texture.
-    */
-   if (!intel->vtbl.meta_tex_rect_source(intel, src->buffer, 0,
-                                         src->pitch,
-                                         src->height, src_format, src_type)) {
-      intel->vtbl.leave_meta_state(intel);
-      return GL_FALSE;
-   }
-
-
-   intel->vtbl.meta_texture_blend_replace(intel);
-
-   LOCK_HARDWARE(intel);
-
-   if (intel->driDrawable->numClipRects) {
-      __DRIdrawablePrivate *dPriv = intel->driDrawable;
-
-
-      srcy = dPriv->h - srcy - height;  /* convert from gl to hardware coords */
-
-      srcx += dPriv->x;
-      srcy += dPriv->y;
-
-      /* Clip against the source region.  This is the only source
-       * clipping we do.  XXX: Just set the texcord wrap mode to clamp
-       * or similar.
-       *
-       */
-      if (0) {
-         GLint orig_x = srcx;
-         GLint orig_y = srcy;
-
-         if (!_mesa_clip_to_region(0, 0, src->pitch, src->height,
-                                   &srcx, &srcy, &width, &height))
-            goto out;
-
-         dstx += srcx - orig_x;
-         dsty += (srcy - orig_y) * ctx->Pixel.ZoomY;
-      }
-
-      /* Just use the regular cliprect mechanism...  Does this need to
-       * even hold the lock???
-       */
-      intel->vtbl.meta_draw_quad(intel,
-				 dstx,
-				 dstx + width * ctx->Pixel.ZoomX,
-				 dPriv->h - (dsty + height * ctx->Pixel.ZoomY),
-				 dPriv->h - (dsty), 0, /* XXX: what z value? */
-				 0x00ff00ff,
-				 srcx, srcx + width, srcy, srcy + height);
-
-    out:
-      intel->vtbl.leave_meta_state(intel);
-      intel_batchbuffer_emit_mi_flush(intel->batch);
-   }
-   UNLOCK_HARDWARE(intel);
-
-   DBG("%s: success\n", __FUNCTION__);
-   return GL_TRUE;
-}
-#endif /* I915 */
-
 
 /**
  * CopyPixels with the blitter.  Don't support zooming, pixel transfer, etc.
@@ -272,6 +111,12 @@ do_blit_copypixels(GLcontext * ctx,
    drm_clip_rect_t *cliprects;
    int x_off, y_off;
 
+   if (type == GL_DEPTH || type == GL_STENCIL) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "glCopyPixels() fallback: GL_DEPTH || GL_STENCIL\n");
+      return GL_FALSE;
+   }
+
    /* Update draw buffer bounds */
    _mesa_update_state(ctx);
 
@@ -362,14 +207,16 @@ do_blit_copypixels(GLcontext * ctx,
 				   &clip_x, &clip_y, &clip_w, &clip_h))
             continue;
 
-         intelEmitCopyBlit(intel, dst->cpp,
-			   src->pitch, src->buffer, 0, src->tiling,
-			   dst->pitch, dst->buffer, 0, dst->tiling,
-			   clip_x + delta_x, clip_y + delta_y, /* srcx, srcy */
-			   clip_x, clip_y, /* dstx, dsty */
-			   clip_w, clip_h,
-			   ctx->Color.ColorLogicOpEnabled ?
-			   ctx->Color.LogicOp : GL_COPY);
+	 if (!intel_region_copy(intel,
+				dst, 0, clip_x, clip_y,
+				src, 0, clip_x + delta_x, clip_y + delta_y,
+				clip_w, clip_h,
+				ctx->Color.ColorLogicOpEnabled ?
+				ctx->Color.LogicOp : GL_COPY)) {
+	    DBG("%s: blit failure\n", __FUNCTION__);
+	    UNLOCK_HARDWARE(intel);
+	    return GL_FALSE;
+	 }
       }
    }
 out:
@@ -392,12 +239,6 @@ intelCopyPixels(GLcontext * ctx,
    if (do_blit_copypixels(ctx, srcx, srcy, width, height, destx, desty, type))
       return;
 
-#ifdef I915
-   if (do_texture_copypixels(ctx, srcx, srcy, width, height, destx, desty, type))
-      return;
-#endif
-
-   DBG("fallback to _swrast_CopyPixels\n");
-
-   _swrast_CopyPixels(ctx, srcx, srcy, width, height, destx, desty, type);
+   /* this will use swrast if needed */
+   _mesa_meta_copy_pixels(ctx, srcx, srcy, width, height, destx, desty, type);
 }
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_draw.c b/src/mesa/drivers/dri/intel/intel_pixel_draw.c
index 0e83afa645..7fbb89fd6a 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_draw.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_draw.c
@@ -29,8 +29,6 @@
 #include "main/enums.h"
 #include "main/image.h"
 #include "main/mtypes.h"
-#include "main/macros.h"
-#include "main/bufferobj.h"
 #include "main/teximage.h"
 #include "main/texenv.h"
 #include "main/texobj.h"
@@ -41,155 +39,22 @@
 #include "main/enable.h"
 #include "main/buffers.h"
 #include "main/fbobject.h"
-#include "main/renderbuffer.h"
 #include "main/depth.h"
 #include "main/hash.h"
 #include "main/blend.h"
-#include "glapi/dispatch.h"
 #include "swrast/swrast.h"
+#include "drivers/common/meta.h"
 
-#include "intel_screen.h"
 #include "intel_context.h"
 #include "intel_batchbuffer.h"
 #include "intel_blit.h"
 #include "intel_buffers.h"
 #include "intel_regions.h"
 #include "intel_pixel.h"
-#include "intel_buffer_objects.h"
 #include "intel_fbo.h"
 
-static GLboolean
-intel_texture_drawpixels(GLcontext * ctx,
-			 GLint x, GLint y,
-			 GLsizei width, GLsizei height,
-			 GLenum format,
-			 GLenum type,
-			 const struct gl_pixelstore_attrib *unpack,
-			 const GLvoid *pixels)
-{
-   struct intel_context *intel = intel_context(ctx);
-   GLuint texname;
-   GLfloat vertices[4][4];
-   GLfloat texcoords[4][2];
-   GLfloat z;
-
-   /* We're going to mess with texturing with no regard to existing texture
-    * state, so if there is some set up we have to bail.
-    */
-   if (ctx->Texture._EnabledUnits != 0) {
-      if (INTEL_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr, "glDrawPixels() fallback: texturing enabled\n");
-      return GL_FALSE;
-   }
-
-   /* Can't do textured DrawPixels with a fragment program, unless we were
-    * to generate a new program that sampled our texture and put the results
-    * in the fragment color before the user's program started.
-    */
-   if (ctx->FragmentProgram.Enabled) {
-      if (INTEL_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr, "glDrawPixels() fallback: fragment program enabled\n");
-      return GL_FALSE;
-   }
-
-   /* We don't have a way to generate fragments with stencil values which *
-    * will set the resulting stencil value.
-    */
-   if (format == GL_STENCIL_INDEX)
-      return GL_FALSE;
-
-   /* Check that we can load in a texture this big. */
-   if (width > (1 << (ctx->Const.MaxTextureLevels - 1)) ||
-       height > (1 << (ctx->Const.MaxTextureLevels - 1))) {
-      if (INTEL_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr, "glDrawPixels() fallback: bitmap too large (%dx%d)\n",
-		 width, height);
-      return GL_FALSE;
-   }
-
-   /* To do DEPTH_COMPONENT, we would need to change our setup to not draw to
-    * the color buffer, and sample the texture values into the fragment depth
-    * in a program.
-    */
-   if (format == GL_DEPTH_COMPONENT) {
-      if (INTEL_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr,
-		 "glDrawPixels() fallback: format == GL_DEPTH_COMPONENT\n");
-      return GL_FALSE;
-   }
-
-   _mesa_PushAttrib(GL_ENABLE_BIT | GL_TEXTURE_BIT |
-		    GL_CURRENT_BIT);
-   _mesa_PushClientAttrib(GL_CLIENT_VERTEX_ARRAY_BIT);
-
-   /* XXX: pixel store stuff */
-   _mesa_Disable(GL_POLYGON_STIPPLE);
-
-   _mesa_ActiveTextureARB(GL_TEXTURE0_ARB);
-   _mesa_Enable(GL_TEXTURE_2D);
-   _mesa_GenTextures(1, &texname);
-   _mesa_BindTexture(GL_TEXTURE_2D, texname);
-   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-   _mesa_TexEnvf(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
-   /*
-   _mesa_TexEnvf(GL_TEXTURE_ENV, GL_COMBINE_RGB, GL_REPLACE);
-   _mesa_TexEnvf(GL_TEXTURE_ENV, GL_COMBINE_ALPHA, GL_REPLACE);
-   */
-   _mesa_TexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, width, height, 0, format,
-		    type, pixels);
-
-   intel_meta_set_passthrough_transform(intel);
-
-   /* convert rasterpos Z from [0,1] to NDC coord in [-1,1] */
-   z = -1.0 + 2.0 * ctx->Current.RasterPos[2];
-
-   /* Create the vertex buffer based on the current raster pos.  The x and y
-    * we're handed are ctx->Current.RasterPos[0,1] rounded to integers.
-    * We also apply the depth.  However, the W component is already multiplied
-    * into ctx->Current.RasterPos[0,1,2] and we can ignore it at this point.
-    */
-   vertices[0][0] = x;
-   vertices[0][1] = y;
-   vertices[0][2] = z;
-   vertices[0][3] = 1.0;
-   vertices[1][0] = x + width * ctx->Pixel.ZoomX;
-   vertices[1][1] = y;
-   vertices[1][2] = z;
-   vertices[1][3] = 1.0;
-   vertices[2][0] = x + width * ctx->Pixel.ZoomX;
-   vertices[2][1] = y + height * ctx->Pixel.ZoomY;
-   vertices[2][2] = z;
-   vertices[2][3] = 1.0;
-   vertices[3][0] = x;
-   vertices[3][1] = y + height * ctx->Pixel.ZoomY;
-   vertices[3][2] = z;
-   vertices[3][3] = 1.0;
-
-   texcoords[0][0] = 0.0;
-   texcoords[0][1] = 0.0;
-   texcoords[1][0] = 1.0;
-   texcoords[1][1] = 0.0;
-   texcoords[2][0] = 1.0;
-   texcoords[2][1] = 1.0;
-   texcoords[3][0] = 0.0;
-   texcoords[3][1] = 1.0;
-
-   _mesa_VertexPointer(4, GL_FLOAT, 4 * sizeof(GLfloat), &vertices);
-   _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &texcoords);
-   _mesa_Enable(GL_VERTEX_ARRAY);
-   _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
-   CALL_DrawArrays(ctx->Exec, (GL_TRIANGLE_FAN, 0, 4));
-
-   intel_meta_restore_transform(intel);
-   _mesa_PopClientAttrib();
-   _mesa_PopAttrib();
-
-   _mesa_DeleteTextures(1, &texname);
-
-   return GL_TRUE;
-}
 
+/** XXX compare perf of this vs. _mesa_meta_draw_pixels(STENCIL) */
 static GLboolean
 intel_stencil_drawpixels(GLcontext * ctx,
 			 GLint x, GLint y,
@@ -202,13 +67,14 @@ intel_stencil_drawpixels(GLcontext * ctx,
    struct intel_context *intel = intel_context(ctx);
    GLuint texname, rb_name, fb_name, old_fb_name;
    GLfloat vertices[4][2];
-   GLfloat texcoords[4][2];
    struct intel_renderbuffer *irb;
    struct intel_renderbuffer *depth_irb;
    struct gl_renderbuffer *rb;
    struct gl_pixelstore_attrib old_unpack;
    GLstencil *stencil_pixels;
-   int row;
+   int row, y1, y2;
+   GLint old_active_texture;
+   GLboolean rendering_to_fbo = ctx->DrawBuffer->Name != 0;
 
    if (format != GL_STENCIL_INDEX)
       return GL_FALSE;
@@ -225,6 +91,10 @@ intel_stencil_drawpixels(GLcontext * ctx,
       return GL_FALSE;
    }
 
+   /* We don't support stencil testing/ops here */
+   if (ctx->Stencil._Enabled)
+      return GL_FALSE;
+
    /* We use FBOs for our wrapping of the depthbuffer into a color
     * destination.
     */
@@ -262,10 +132,19 @@ intel_stencil_drawpixels(GLcontext * ctx,
       return GL_FALSE;
    }
 
+   if (!ctx->Extensions.ARB_texture_non_power_of_two &&
+       (!is_power_of_two(width) || !is_power_of_two(height))) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr,
+		 "glDrawPixels(GL_STENCIL_INDEX) fallback: NPOT texture\n");
+      return GL_FALSE;
+   }
+
    _mesa_PushAttrib(GL_ENABLE_BIT | GL_TEXTURE_BIT |
 		    GL_CURRENT_BIT | GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    _mesa_PushClientAttrib(GL_CLIENT_VERTEX_ARRAY_BIT);
    old_fb_name = ctx->DrawBuffer->Name;
+   old_active_texture = ctx->Texture.CurrentUnit;
 
    _mesa_Disable(GL_POLYGON_STIPPLE);
    _mesa_Disable(GL_DEPTH_TEST);
@@ -330,34 +209,37 @@ intel_stencil_drawpixels(GLcontext * ctx,
    ctx->Unpack = old_unpack;
    _mesa_free(stencil_pixels);
 
-   intel_meta_set_passthrough_transform(intel);
+   meta_set_passthrough_transform(&intel->meta);
 
+   /* Since we're rendering to the framebuffer as if it was an FBO,
+    * if it's the window system we have to flip the coordinates.
+    */
+   if (rendering_to_fbo) {
+      y1 = y;
+      y2 = y + height * ctx->Pixel.ZoomY;
+   } else {
+      y1 = irb->Base.Height - (y + height * ctx->Pixel.ZoomY);
+      y2 = irb->Base.Height - y;
+   }
    vertices[0][0] = x;
-   vertices[0][1] = y;
+   vertices[0][1] = y1;
    vertices[1][0] = x + width * ctx->Pixel.ZoomX;
-   vertices[1][1] = y;
+   vertices[1][1] = y1;
    vertices[2][0] = x + width * ctx->Pixel.ZoomX;
-   vertices[2][1] = y + height * ctx->Pixel.ZoomY;
+   vertices[2][1] = y2;
    vertices[3][0] = x;
-   vertices[3][1] = y + height * ctx->Pixel.ZoomY;
-
-   texcoords[0][0] = 0.0;
-   texcoords[0][1] = 0.0;
-   texcoords[1][0] = 1.0;
-   texcoords[1][1] = 0.0;
-   texcoords[2][0] = 1.0;
-   texcoords[2][1] = 1.0;
-   texcoords[3][0] = 0.0;
-   texcoords[3][1] = 1.0;
+   vertices[3][1] = y2;
 
    _mesa_VertexPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &vertices);
-   _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &texcoords);
    _mesa_Enable(GL_VERTEX_ARRAY);
-   _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
-   CALL_DrawArrays(ctx->Exec, (GL_TRIANGLE_FAN, 0, 4));
+   meta_set_default_texrect(&intel->meta);
 
-   intel_meta_restore_transform(intel);
+   _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
 
+   meta_restore_texcoords(&intel->meta);
+   meta_restore_transform(&intel->meta);
+
+   _mesa_ActiveTextureARB(GL_TEXTURE0_ARB + old_active_texture);
    _mesa_BindFramebufferEXT(GL_FRAMEBUFFER_EXT, old_fb_name);
 
    _mesa_PopClientAttrib();
@@ -379,17 +261,25 @@ intelDrawPixels(GLcontext * ctx,
                 const struct gl_pixelstore_attrib *unpack,
                 const GLvoid * pixels)
 {
-   if (intel_texture_drawpixels(ctx, x, y, width, height, format, type,
-				unpack, pixels))
-      return;
-
+#if 0
+   /* XXX this function doesn't seem to work reliably even when all
+    * the pre-requisite conditions are met.
+    * Note that this function is never hit with conform.
+    * Fall back to swrast because even the _mesa_meta_draw_pixels() approach
+    * isn't working because of an apparent stencil bug.
+    */
    if (intel_stencil_drawpixels(ctx, x, y, width, height, format, type,
 				unpack, pixels))
       return;
+#else
+   (void) intel_stencil_drawpixels; /* silence warning */
+   if (format == GL_STENCIL_INDEX) {
+      _swrast_DrawPixels(ctx, x, y, width, height, format, type,
+                         unpack, pixels);
+      return;
+   }
+#endif
 
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      _mesa_printf("%s: fallback to swrast\n", __FUNCTION__);
-
-   _swrast_DrawPixels(ctx, x, y, width, height, format, type,
-		      unpack, pixels);
+   _mesa_meta_draw_pixels(ctx, x, y, width, height, format, type,
+                          unpack, pixels);
 }
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_read.c b/src/mesa/drivers/dri/intel/intel_pixel_read.c
new file mode 100644
index 0000000000..bc67f6242a
--- /dev/null
+++ b/src/mesa/drivers/dri/intel/intel_pixel_read.c
@@ -0,0 +1,315 @@
+/**************************************************************************
+ *
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "main/glheader.h"
+#include "main/enums.h"
+#include "main/mtypes.h"
+#include "main/macros.h"
+#include "main/image.h"
+#include "main/bufferobj.h"
+#include "main/state.h"
+#include "swrast/swrast.h"
+
+#include "intel_screen.h"
+#include "intel_context.h"
+#include "intel_batchbuffer.h"
+#include "intel_blit.h"
+#include "intel_buffers.h"
+#include "intel_regions.h"
+#include "intel_pixel.h"
+#include "intel_buffer_objects.h"
+
+/* For many applications, the new ability to pull the source buffers
+ * back out of the GTT and then do the packing/conversion operations
+ * in software will be as much of an improvement as trying to get the
+ * blitter and/or texture engine to do the work.
+ *
+ * This step is gated on private backbuffers.
+ *
+ * Obviously the frontbuffer can't be pulled back, so that is either
+ * an argument for blit/texture readpixels, or for blitting to a
+ * temporary and then pulling that back.
+ *
+ * When the destination is a pbo, however, it's not clear if it is
+ * ever going to be pulled to main memory (though the access param
+ * will be a good hint).  So it sounds like we do want to be able to
+ * choose between blit/texture implementation on the gpu and pullback
+ * and cpu-based copying.
+ *
+ * Unless you can magically turn client memory into a PBO for the
+ * duration of this call, there will be a cpu-based copying step in
+ * any case.
+ */
+
+/* Texture-engine ReadPixels path; compiled out below, so it always returns GL_FALSE. */
+static GLboolean
+do_texture_readpixels(GLcontext * ctx,
+                      GLint x, GLint y, GLsizei width, GLsizei height,
+                      GLenum format, GLenum type,
+                      const struct gl_pixelstore_attrib *pack,
+                      struct intel_region *dest_region)
+{
+#if 0
+   struct intel_context *intel = intel_context(ctx);
+   intelScreenPrivate *screen = intel->intelScreen;
+   GLint pitch = pack->RowLength ? pack->RowLength : width;
+   __DRIdrawablePrivate *dPriv = intel->driDrawable;
+   int textureFormat;
+   GLenum glTextureFormat;
+   int destFormat, depthFormat, destPitch;
+   drm_clip_rect_t tmp;
+
+   if (INTEL_DEBUG & DEBUG_PIXEL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+
+   if (ctx->_ImageTransferState ||
+       pack->SwapBytes || pack->LsbFirst || !pack->Invert) {
+      if (INTEL_DEBUG & DEBUG_PIXEL)
+         fprintf(stderr, "%s: check_color failed\n", __FUNCTION__);
+      return GL_FALSE;
+   }
+
+   intel->vtbl.meta_texrect_source(intel, intel_readbuf_region(intel));
+
+   if (!intel->vtbl.meta_render_dest(intel, dest_region, type, format)) {
+      if (INTEL_DEBUG & DEBUG_PIXEL)
+         fprintf(stderr, "%s: couldn't set dest %s/%s\n",
+                 __FUNCTION__,
+                 _mesa_lookup_enum_by_nr(type),
+                 _mesa_lookup_enum_by_nr(format));
+      return GL_FALSE;
+   }
+
+   LOCK_HARDWARE(intel);
+
+   if (intel->driDrawable->numClipRects) {
+      intel->vtbl.install_meta_state(intel);
+      intel->vtbl.meta_no_depth_write(intel);
+      intel->vtbl.meta_no_stencil_write(intel);
+
+      if (!driClipRectToFramebuffer(ctx->ReadBuffer, &x, &y, &width, &height)) {
+         UNLOCK_HARDWARE(intel);
+         SET_STATE(i830, state);
+         if (INTEL_DEBUG & DEBUG_PIXEL)
+            fprintf(stderr, "%s: cliprect failed\n", __FUNCTION__);
+         return GL_TRUE;
+      }
+
+      y = dPriv->h - y - height;
+      x += dPriv->x;
+      y += dPriv->y;
+
+
+      /* Set the frontbuffer up as a large rectangular texture.
+       */
+      intel->vtbl.meta_tex_rect_source(intel, src_region, textureFormat);
+
+
+      intel->vtbl.meta_texture_blend_replace(i830, glTextureFormat);
+
+
+      /* Set the 3d engine to draw into the destination region:
+       */
+
+      intel->vtbl.meta_draw_region(intel, dest_region);
+      intel->vtbl.meta_draw_format(intel, destFormat, depthFormat);     /* ?? */
+
+
+      /* Draw a single quad, no cliprects:
+       */
+      intel->vtbl.meta_disable_cliprects(intel);
+
+      intel->vtbl.draw_quad(intel,
+                            0, width, 0, height,
+                            0x00ff00ff, x, x + width, y, y + height);
+
+      intel->vtbl.leave_meta_state(intel);
+   }
+   UNLOCK_HARDWARE(intel);
+
+   intel_region_wait_fence(ctx, dest_region);   /* required by GL */
+   return GL_TRUE;
+#endif
+   /* Path not implemented: tell the caller this request was not handled. */
+   return GL_FALSE;
+}
+
+
+
+/* Blitter ReadPixels path: read buffer -> PBO; GL_FALSE if the request can't be handled here. */
+static GLboolean
+do_blit_readpixels(GLcontext * ctx,
+                   GLint x, GLint y, GLsizei width, GLsizei height,
+                   GLenum format, GLenum type,
+                   const struct gl_pixelstore_attrib *pack, GLvoid * pixels)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_region *src = intel_readbuf_region(intel);
+   struct intel_buffer_object *dst = intel_buffer_object(pack->BufferObj);
+   GLuint dst_offset;
+   GLuint rowLength;
+
+   if (INTEL_DEBUG & DEBUG_PIXEL)
+      _mesa_printf("%s\n", __FUNCTION__);
+
+   if (!src)
+      return GL_FALSE;
+
+   if (!_mesa_is_bufferobj(pack->BufferObj)) {
+      /* PBO only for now:
+       */
+      if (INTEL_DEBUG & DEBUG_PIXEL)
+         _mesa_printf("%s - not PBO\n", __FUNCTION__);
+      return GL_FALSE;
+   }
+
+
+   if (ctx->_ImageTransferState ||
+       !intel_check_blit_format(src, format, type)) {
+      if (INTEL_DEBUG & DEBUG_PIXEL)
+         _mesa_printf("%s - bad format for blit\n", __FUNCTION__);
+      return GL_FALSE;
+   }
+
+   if (pack->Alignment != 1 || pack->SwapBytes || pack->LsbFirst) {
+      if (INTEL_DEBUG & DEBUG_PIXEL)
+         _mesa_printf("%s: bad packing params\n", __FUNCTION__);
+      return GL_FALSE;
+   }
+
+   if (pack->RowLength > 0)
+      rowLength = pack->RowLength;
+   else
+      rowLength = width;
+
+   if (pack->Invert) {
+      if (INTEL_DEBUG & DEBUG_PIXEL)
+         _mesa_printf("%s: MESA_PACK_INVERT not done yet\n", __FUNCTION__);
+      return GL_FALSE;
+   }
+   else {
+      rowLength = -rowLength;   /* NOTE(review): negated pitch on a GLuint; presumably flips Y -- confirm */
+   }
+
+   /* XXX 64-bit cast? */
+   dst_offset = (GLuint) _mesa_image_address(2, pack, pixels, width, height,
+                                             format, type, 0, 0, 0);
+
+
+   /* Although the blits go on the command buffer, need to do this and
+    * fire with lock held to guarantee cliprects are correct.
+    */
+   intelFlush(&intel->ctx);
+   LOCK_HARDWARE(intel);
+
+   if (intel->driDrawable->numClipRects) {
+      GLboolean all = (width * height * src->cpp == dst->Base.Size &&
+                       x == 0 && dst_offset == 0);
+
+      dri_bo *dst_buffer = intel_bufferobj_buffer(intel, dst,
+						  all ? INTEL_WRITE_FULL :
+						  INTEL_WRITE_PART);
+      __DRIdrawablePrivate *dPriv = intel->driDrawable;
+      int nbox = dPriv->numClipRects;
+      drm_clip_rect_t *box = dPriv->pClipRects;
+      drm_clip_rect_t rect;
+      drm_clip_rect_t src_rect;
+      int i;
+
+      src_rect.x1 = dPriv->x + x;
+      src_rect.y1 = dPriv->y + dPriv->h - (y + height);
+      src_rect.x2 = src_rect.x1 + width;
+      src_rect.y2 = src_rect.y1 + height;
+
+
+      /* Blit the portion of the source rectangle inside each cliprect. */
+      for (i = 0; i < nbox; i++) {
+         if (!intel_intersect_cliprects(&rect, &src_rect, &box[i]))
+            continue;
+
+         if (!intelEmitCopyBlit(intel,
+				src->cpp,
+				src->pitch, src->buffer, 0, src->tiling,
+				rowLength, dst_buffer, dst_offset, GL_FALSE,
+				rect.x1,
+				rect.y1,
+				rect.x1 - src_rect.x1,
+				rect.y2 - src_rect.y2,
+				rect.x2 - rect.x1, rect.y2 - rect.y1,
+				GL_COPY)) {
+	    UNLOCK_HARDWARE(intel);
+	    return GL_FALSE;
+	 }
+      }
+   }
+   UNLOCK_HARDWARE(intel);
+
+   if (INTEL_DEBUG & DEBUG_PIXEL)
+      _mesa_printf("%s - DONE\n", __FUNCTION__);
+
+   return GL_TRUE;
+}
+
+void
+intelReadPixels(GLcontext * ctx,
+                GLint x, GLint y, GLsizei width, GLsizei height,
+                GLenum format, GLenum type,
+                const struct gl_pixelstore_attrib *pack, GLvoid * pixels)
+{
+   if (INTEL_DEBUG & DEBUG_PIXEL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   intelFlush(ctx);
+   /* Try the blit/texture paths first (I915 builds only); otherwise use swrast below. */
+#ifdef I915
+   if (do_blit_readpixels
+       (ctx, x, y, width, height, format, type, pack, pixels))
+      return;
+
+   if (do_texture_readpixels
+       (ctx, x, y, width, height, format, type, pack, pixels))
+      return;
+#else
+   (void)do_blit_readpixels;
+   (void)do_texture_readpixels;
+#endif
+
+   if (INTEL_DEBUG & DEBUG_PIXEL)
+      _mesa_printf("%s: fallback to swrast\n", __FUNCTION__);
+
+   /* Update Mesa state before calling down into _swrast_ReadPixels, as
+    * the spans code requires the computed buffer states to be up to date,
+    * but _swrast_ReadPixels only updates Mesa state after setting up
+    * the spans code.
+    */
+
+   if (ctx->NewState)
+      _mesa_update_state(ctx);
+
+   _swrast_ReadPixels(ctx, x, y, width, height, format, type, pack, pixels);
+}
diff --git a/src/mesa/drivers/dri/intel/intel_reg.h b/src/mesa/drivers/dri/intel/intel_reg.h
index 57ac8f0cc1..d19f1bae34 100644
--- a/src/mesa/drivers/dri/intel/intel_reg.h
+++ b/src/mesa/drivers/dri/intel/intel_reg.h
@@ -189,6 +189,19 @@
 
 #define S7_DEPTH_OFFSET_CONST_MASK     ~0
 
+/* p143 */
+#define _3DSTATE_BUF_INFO_CMD	(CMD_3D | (0x1d<<24) | (0x8e<<16) | 1)
+/* Dword 1 */
+#define BUF_3D_ID_COLOR_BACK	(0x3<<24)
+#define BUF_3D_ID_DEPTH 	(0x7<<24)
+#define BUF_3D_USE_FENCE	(1<<23)
+#define BUF_3D_TILED_SURFACE	(1<<22)
+#define BUF_3D_TILE_WALK_X	0
+#define BUF_3D_TILE_WALK_Y	(1<<21)
+#define BUF_3D_PITCH(x)         (((x)/4)<<2)
+/* Dword 2 */
+#define BUF_3D_ADDR(x)		((x) & ~0x3)
+
 /* Primitive dispatch on 830-945 */
 #define _3DPRIMITIVE			(CMD_3D | (0x1f << 24))
 #define PRIM_INDIRECT            (1<<23)
diff --git a/src/mesa/drivers/dri/intel/intel_regions.c b/src/mesa/drivers/dri/intel/intel_regions.c
index 51ce32a967..a86c66a844 100644
--- a/src/mesa/drivers/dri/intel/intel_regions.c
+++ b/src/mesa/drivers/dri/intel/intel_regions.c
@@ -52,17 +52,77 @@
 
 #define FILE_DEBUG_FLAG DEBUG_REGION
 
+/* This should be set to the maximum backtrace size desired.
+ * Set it to 0 to disable backtrace debugging.
+ */
+#define DEBUG_BACKTRACE_SIZE 0
+
+#if DEBUG_BACKTRACE_SIZE == 0
+/* Use the standard debug output */
+#define _DBG(...) DBG(__VA_ARGS__)
+#else
+/* Use backtracing debug output.  Wrapped in do/while(0) so that the macro
+ * expands to a single statement and stays safe in an unbraced if/else.
+ */
+#define _DBG(...) do { debug_backtrace(); DBG(__VA_ARGS__); } while (0)
+
+#include <execinfo.h>
+
+/* Backtracing debug support: print the current call stack through DBG(),
+ * one colon-terminated frame per call, skipping this function's own frame.
+ */
+static void
+debug_backtrace(void)
+{
+   void *trace[DEBUG_BACKTRACE_SIZE];
+   char **strings;
+   int traceSize, i;
+
+   traceSize = backtrace(trace, DEBUG_BACKTRACE_SIZE);
+   strings = backtrace_symbols(trace, traceSize);
+   if (strings == NULL) {
+      DBG("no backtrace:");
+      return;
+   }
+
+   /* Frame 0 is debug_backtrace() itself, so start at 1.  Only the text
+    * after the final "/" of each symbol is printed to avoid long lines.
+    */
+   for (i = 1; i < traceSize; i++) {
+      char *p = strings[i], *slash = strings[i];
+
+      while (*p) {
+         if (*p++ == '/')
+            slash = p;
+      }
+      DBG("%s:", slash);
+   }
+
+   /* backtrace_symbols() returns a single malloc'ed block */
+   free(strings);
+}
+
+#endif
+
+
+
 /* XXX: Thread safety?
  */
 GLubyte *
 intel_region_map(struct intel_context *intel, struct intel_region *region)
 {
-   DBG("%s\n", __FUNCTION__);
+   intelFlush(&intel->ctx);
+
+   _DBG("%s %p\n", __FUNCTION__, region);
    if (!region->map_refcount++) {
       if (region->pbo)
          intel_region_cow(intel, region);
 
-      dri_bo_map(region->buffer, GL_TRUE);
+      if (region->tiling != I915_TILING_NONE &&
+	  intel->intelScreen->kernel_exec_fencing)
+	 drm_intel_gem_bo_map_gtt(region->buffer);
+      else
+	 dri_bo_map(region->buffer, GL_TRUE);
       region->map = region->buffer->virtual;
    }
 
@@ -72,9 +132,13 @@ intel_region_map(struct intel_context *intel, struct intel_region *region)
 void
 intel_region_unmap(struct intel_context *intel, struct intel_region *region)
 {
-   DBG("%s\n", __FUNCTION__);
+   _DBG("%s %p\n", __FUNCTION__, region);
    if (!--region->map_refcount) {
-      dri_bo_unmap(region->buffer);
+      if (region->tiling == I915_TILING_NONE ||
+          !intel->intelScreen->kernel_exec_fencing)
+         dri_bo_unmap(region->buffer);
+      else
+         drm_intel_gem_bo_unmap_gtt(region->buffer);
       region->map = NULL;
    }
 }
@@ -87,10 +151,10 @@ intel_region_alloc_internal(struct intel_context *intel,
 {
    struct intel_region *region;
 
-   DBG("%s\n", __FUNCTION__);
-
-   if (buffer == NULL)
+   if (buffer == NULL) {
+      _DBG("%s <-- NULL\n", __FUNCTION__);
       return NULL;
+   }
 
    region = calloc(sizeof(*region), 1);
    region->cpp = cpp;
@@ -104,19 +168,59 @@ intel_region_alloc_internal(struct intel_context *intel,
    region->tiling = I915_TILING_NONE;
    region->bit_6_swizzle = I915_BIT_6_SWIZZLE_NONE;
 
+   _DBG("%s <-- %p\n", __FUNCTION__, region);
    return region;
 }
 
 struct intel_region *
 intel_region_alloc(struct intel_context *intel,
-                   GLuint cpp, GLuint width, GLuint height, GLuint pitch)
+		   uint32_t tiling,
+                   GLuint cpp, GLuint width, GLuint height, GLuint pitch,
+		   GLboolean expect_accelerated_upload)
 {
    dri_bo *buffer;
+   struct intel_region *region;
+
+   /* If we're tiled, our allocations are in 8 or 32-row blocks, so
+    * failure to align our height means that we won't allocate enough pages.
+    *
+    * If we're untiled, we still have to align to 2 rows high because the
+    * data port accesses 2x2 blocks even if the bottom row isn't to be
+    * rendered, so failure to align means we could walk off the end of the
+    * GTT and fault.
+    */
+   if (tiling == I915_TILING_X)
+      height = ALIGN(height, 8);
+   else if (tiling == I915_TILING_Y)
+      height = ALIGN(height, 32);
+   else
+      height = ALIGN(height, 2);
 
-   buffer = dri_bo_alloc(intel->bufmgr, "region",
-			 pitch * cpp * height, 64);
+   /* If we're untiled, we have to align to 2 rows high because the
+    * data port accesses 2x2 blocks even if the bottom row isn't to be
+    * rendered, so failure to align means we could walk off the end of the
+    * GTT and fault.
+    */
+   height = ALIGN(height, 2);
 
-   return intel_region_alloc_internal(intel, cpp, width, height, pitch, buffer);
+   if (expect_accelerated_upload) {
+      buffer = drm_intel_bo_alloc_for_render(intel->bufmgr, "region",
+					     pitch * cpp * height, 64);
+   } else {
+      buffer = drm_intel_bo_alloc(intel->bufmgr, "region",
+				  pitch * cpp * height, 64);
+   }
+
+   region = intel_region_alloc_internal(intel, cpp, width, height,
+					pitch, buffer);
+
+   if (tiling != I915_TILING_NONE) {
+      assert(((pitch * cpp) & 127) == 0);
+      drm_intel_bo_set_tiling(buffer, &tiling, pitch * cpp);
+      drm_intel_bo_get_tiling(buffer, &region->tiling, &region->bit_6_swizzle);
+   }
+
+   return region;
 }
 
 struct intel_region *
@@ -152,7 +256,7 @@ void
 intel_region_reference(struct intel_region **dst, struct intel_region *src)
 {
    if (src)
-      DBG("%s %d\n", __FUNCTION__, src->refcount);
+      _DBG("%s %p %d\n", __FUNCTION__, src, src->refcount);
 
    assert(*dst == NULL);
    if (src) {
@@ -166,10 +270,12 @@ intel_region_release(struct intel_region **region_handle)
 {
    struct intel_region *region = *region_handle;
 
-   if (region == NULL)
+   if (region == NULL) {
+      _DBG("%s NULL\n", __FUNCTION__);
       return;
+   }
 
-   DBG("%s %d\n", __FUNCTION__, region->refcount - 1);
+   _DBG("%s %p %d\n", __FUNCTION__, region, region->refcount - 1);
 
    ASSERT(region->refcount > 0);
    region->refcount--;
@@ -243,9 +349,7 @@ intel_region_data(struct intel_context *intel,
                   const void *src, GLuint src_pitch,
                   GLuint srcx, GLuint srcy, GLuint width, GLuint height)
 {
-   GLboolean locked = GL_FALSE;
-
-   DBG("%s\n", __FUNCTION__);
+   _DBG("%s\n", __FUNCTION__);
 
    if (intel == NULL)
       return;
@@ -258,39 +362,33 @@ intel_region_data(struct intel_context *intel,
          intel_region_cow(intel, dst);
    }
 
-   if (!intel->locked) {
-      LOCK_HARDWARE(intel);
-      locked = GL_TRUE;
-   }
-
+   LOCK_HARDWARE(intel);
    _mesa_copy_rect(intel_region_map(intel, dst) + dst_offset,
                    dst->cpp,
                    dst->pitch,
                    dstx, dsty, width, height, src, src_pitch, srcx, srcy);
 
    intel_region_unmap(intel, dst);
-
-   if (locked)
-      UNLOCK_HARDWARE(intel);
-
+   UNLOCK_HARDWARE(intel);
 }
 
 /* Copy rectangular sub-regions. Need better logic about when to
  * push buffers into AGP - will currently do so whenever possible.
  */
-void
+GLboolean
 intel_region_copy(struct intel_context *intel,
                   struct intel_region *dst,
                   GLuint dst_offset,
                   GLuint dstx, GLuint dsty,
                   struct intel_region *src,
                   GLuint src_offset,
-                  GLuint srcx, GLuint srcy, GLuint width, GLuint height)
+                  GLuint srcx, GLuint srcy, GLuint width, GLuint height,
+		  GLenum logicop)
 {
-   DBG("%s\n", __FUNCTION__);
+   _DBG("%s\n", __FUNCTION__);
 
    if (intel == NULL)
-      return;
+      return GL_FALSE;
 
    if (dst->pbo) {
       if (dstx == 0 &&
@@ -302,41 +400,12 @@ intel_region_copy(struct intel_context *intel,
 
    assert(src->cpp == dst->cpp);
 
-   intelEmitCopyBlit(intel,
-                     dst->cpp,
-                     src->pitch, src->buffer, src_offset, src->tiling,
-                     dst->pitch, dst->buffer, dst_offset, dst->tiling,
-                     srcx, srcy, dstx, dsty, width, height,
-		     GL_COPY);
-}
-
-/* Fill a rectangular sub-region.  Need better logic about when to
- * push buffers into AGP - will currently do so whenever possible.
- */
-void
-intel_region_fill(struct intel_context *intel,
-                  struct intel_region *dst,
-                  GLuint dst_offset,
-                  GLuint dstx, GLuint dsty,
-                  GLuint width, GLuint height, GLuint color)
-{
-   DBG("%s\n", __FUNCTION__);
-
-   if (intel == NULL)
-      return;   
-
-   if (dst->pbo) {
-      if (dstx == 0 &&
-          dsty == 0 && width == dst->pitch && height == dst->height)
-         intel_region_release_pbo(intel, dst);
-      else
-         intel_region_cow(intel, dst);
-   }
-
-   intelEmitFillBlit(intel,
-                     dst->cpp,
-                     dst->pitch, dst->buffer, dst_offset, dst->tiling,
-                     dstx, dsty, width, height, color);
+   return intelEmitCopyBlit(intel,
+			    dst->cpp,
+			    src->pitch, src->buffer, src_offset, src->tiling,
+			    dst->pitch, dst->buffer, dst_offset, dst->tiling,
+			    srcx, srcy, dstx, dsty, width, height,
+			    logicop);
 }
 
 /* Attach to a pbo, discarding our data.  Effectively zero-copy upload
@@ -347,9 +416,13 @@ intel_region_attach_pbo(struct intel_context *intel,
                         struct intel_region *region,
                         struct intel_buffer_object *pbo)
 {
+   dri_bo *buffer;
+
    if (region->pbo == pbo)
       return;
 
+   _DBG("%s %p %p\n", __FUNCTION__, region, pbo);
+
    /* If there is already a pbo attached, break the cow tie now.
     * Don't call intel_region_release_pbo() as that would
     * unnecessarily allocate a new buffer we would have to immediately
@@ -365,10 +438,13 @@ intel_region_attach_pbo(struct intel_context *intel,
       region->buffer = NULL;
    }
 
+   /* make sure pbo has a buffer of its own */
+   buffer = intel_bufferobj_buffer(intel, pbo, INTEL_WRITE_FULL);
+
    region->pbo = pbo;
    region->pbo->region = region;
-   dri_bo_reference(pbo->buffer);
-   region->buffer = pbo->buffer;
+   dri_bo_reference(buffer);
+   region->buffer = buffer;
 }
 
 
@@ -379,6 +455,7 @@ void
 intel_region_release_pbo(struct intel_context *intel,
                          struct intel_region *region)
 {
+   _DBG("%s %p\n", __FUNCTION__, region);
    assert(region->buffer == region->pbo->buffer);
    region->pbo->region = NULL;
    region->pbo = NULL;
@@ -397,34 +474,27 @@ void
 intel_region_cow(struct intel_context *intel, struct intel_region *region)
 {
    struct intel_buffer_object *pbo = region->pbo;
-   GLboolean was_locked = intel->locked;
-
-   if (intel == NULL)
-      return;
+   GLboolean ok;
 
    intel_region_release_pbo(intel, region);
 
    assert(region->cpp * region->pitch * region->height == pbo->Base.Size);
 
-   DBG("%s (%d bytes)\n", __FUNCTION__, pbo->Base.Size);
+   _DBG("%s %p (%d bytes)\n", __FUNCTION__, region, pbo->Base.Size);
 
    /* Now blit from the texture buffer to the new buffer: 
     */
 
-   was_locked = intel->locked;
-   if (!was_locked)
-      LOCK_HARDWARE(intel);
-
-   intelEmitCopyBlit(intel,
-		     region->cpp,
-		     region->pitch, region->buffer, 0, region->tiling,
-		     region->pitch, pbo->buffer, 0, region->tiling,
-		     0, 0, 0, 0,
-		     region->pitch, region->height,
-		     GL_COPY);
-
-   if (!was_locked)
-      UNLOCK_HARDWARE(intel);
+   /* The pbo's buffer is the blit source and the region's own buffer the
+    * destination; the hardware lock is held around the blit.
+    */
+   LOCK_HARDWARE(intel);
+   ok = intelEmitCopyBlit(intel, region->cpp,
+                          region->pitch, pbo->buffer, 0, region->tiling,
+                          region->pitch, region->buffer, 0, region->tiling,
+                          0, 0, 0, 0, region->pitch, region->height, GL_COPY);
+   assert(ok);
+   UNLOCK_HARDWARE(intel);
 }
 
 dri_bo *
@@ -453,6 +523,10 @@ intel_recreate_static(struct intel_context *intel,
    if (region == NULL) {
       region = calloc(sizeof(*region), 1);
       region->refcount = 1;
+      _DBG("%s creating new region %p\n", __FUNCTION__, region);
+   }
+   else {
+      _DBG("%s %p\n", __FUNCTION__, region);
    }
 
    if (intel->ctx.Visual.rgbBits == 24)
@@ -460,7 +534,8 @@ intel_recreate_static(struct intel_context *intel,
    else
       region->cpp = intel->ctx.Visual.rgbBits / 8;
    region->pitch = intelScreen->pitch;
-   region->height = intelScreen->height;     /* needed? */
+   region->width = intelScreen->width;
+   region->height = intelScreen->height;
 
    if (region->buffer != NULL) {
       dri_bo_unreference(region->buffer);
diff --git a/src/mesa/drivers/dri/intel/intel_regions.h b/src/mesa/drivers/dri/intel/intel_regions.h
index 4b120ba4ce..0d379bdc6e 100644
--- a/src/mesa/drivers/dri/intel/intel_regions.h
+++ b/src/mesa/drivers/dri/intel/intel_regions.h
@@ -73,8 +73,10 @@ struct intel_region
  * copied by calling intel_reference_region().
  */
 struct intel_region *intel_region_alloc(struct intel_context *intel,
-                                        GLuint cpp, GLuint width,
-                                        GLuint height, GLuint pitch);
+                                        uint32_t tiling,
+					GLuint cpp, GLuint width,
+                                        GLuint height, GLuint pitch,
+					GLboolean expect_accelerated_upload);
 
 struct intel_region *
 intel_region_alloc_for_handle(struct intel_context *intel,
@@ -108,21 +110,15 @@ void intel_region_data(struct intel_context *intel,
 
 /* Copy rectangular sub-regions
  */
-void intel_region_copy(struct intel_context *intel,
-                       struct intel_region *dest,
-                       GLuint dest_offset,
-                       GLuint destx, GLuint desty,
-                       struct intel_region *src,
-                       GLuint src_offset,
-                       GLuint srcx, GLuint srcy, GLuint width, GLuint height);
-
-/* Fill a rectangular sub-region
- */
-void intel_region_fill(struct intel_context *intel,
-                       struct intel_region *dest,
-                       GLuint dest_offset,
-                       GLuint destx, GLuint desty,
-                       GLuint width, GLuint height, GLuint color);
+GLboolean
+intel_region_copy(struct intel_context *intel,
+		  struct intel_region *dest,
+		  GLuint dest_offset,
+		  GLuint destx, GLuint desty,
+		  struct intel_region *src,
+		  GLuint src_offset,
+		  GLuint srcx, GLuint srcy, GLuint width, GLuint height,
+		  GLenum logicop);
 
 /* Helpers for zerocopy uploads, particularly texture image uploads:
  */
diff --git a/src/mesa/drivers/dri/intel/intel_screen.c b/src/mesa/drivers/dri/intel/intel_screen.c
index 2809fadd88..d547077929 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.c
+++ b/src/mesa/drivers/dri/intel/intel_screen.c
@@ -29,27 +29,31 @@
 #include "main/glheader.h"
 #include "main/context.h"
 #include "main/framebuffer.h"
-#include "main/matrix.h"
 #include "main/renderbuffer.h"
-#include "main/simple_list.h"
+
 #include "utils.h"
 #include "vblank.h"
 #include "xmlpool.h"
 
-
-#include "intel_screen.h"
-
+#include "intel_batchbuffer.h"
 #include "intel_buffers.h"
-#include "intel_tex.h"
-#include "intel_span.h"
-#include "intel_fbo.h"
+#include "intel_bufmgr.h"
 #include "intel_chipset.h"
+#include "intel_extensions.h"
+#include "intel_fbo.h"
+#include "intel_regions.h"
+#include "intel_swapbuffers.h"
+#include "intel_screen.h"
+#include "intel_span.h"
+#include "intel_tex.h"
 
 #include "i915_drm.h"
 #include "i830_dri.h"
-#include "intel_regions.h"
-#include "intel_batchbuffer.h"
-#include "intel_bufmgr.h"
+
+/* driconf boolean option "texture_tiling" with default value `def`. */
+#define DRI_CONF_TEXTURE_TILING(def) \
+	DRI_CONF_OPT_BEGIN(texture_tiling, bool, def)		\
+		DRI_CONF_DESC(en, "Enable texture tiling") DRI_CONF_OPT_END
 
 PUBLIC const char __driConfigOptions[] =
    DRI_CONF_BEGIN
@@ -65,6 +69,17 @@ PUBLIC const char __driConfigOptions[] =
 	    DRI_CONF_ENUM(1, "Enable reuse of all sizes of buffer objects")
 	 DRI_CONF_DESC_END
       DRI_CONF_OPT_END
+
+#ifdef I915
+     DRI_CONF_TEXTURE_TILING(false)
+#else
+     DRI_CONF_TEXTURE_TILING(true)
+#endif
+
+      DRI_CONF_OPT_BEGIN(early_z, bool, false)
+	 DRI_CONF_DESC(en, "Enable early Z in classic mode (unstable, 945-only).")
+      DRI_CONF_OPT_END
+
    DRI_CONF_SECTION_END
    DRI_CONF_SECTION_QUALITY
       DRI_CONF_FORCE_S3TC_ENABLE(false)
@@ -72,10 +87,12 @@ PUBLIC const char __driConfigOptions[] =
    DRI_CONF_SECTION_END
    DRI_CONF_SECTION_DEBUG
      DRI_CONF_NO_RAST(false)
+     DRI_CONF_ALWAYS_FLUSH_BATCH(false)
+     DRI_CONF_ALWAYS_FLUSH_CACHE(false)
    DRI_CONF_SECTION_END
 DRI_CONF_END;
 
-const GLuint __driNConfigOptions = 6;
+const GLuint __driNConfigOptions = 10;
 
 #ifdef USE_NEW_INTERFACE
 static PFNGLXCREATECONTEXTMODES create_context_modes = NULL;
@@ -160,7 +177,7 @@ intelPrintSAREA(const drm_i915_sarea_t * sarea)
  * A number of the screen parameters are obtained/computed from
  * information in the SAREA.  This function updates those parameters.
  */
-void
+static void
 intelUpdateScreenFromSAREA(intelScreenPrivate * intelScreen,
                            drm_i915_sarea_t * sarea)
 {
@@ -210,6 +227,7 @@ static const __DRItexOffsetExtension intelTexOffsetExtension = {
 static const __DRItexBufferExtension intelTexBufferExtension = {
     { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
    intelSetTexBuffer,
+   intelSetTexBuffer2,
 };
 
 static int
@@ -279,7 +297,7 @@ intel_get_param(__DRIscreenPrivate *psp, int param, int *value)
 
    ret = drmCommandWriteRead(psp->fd, DRM_I915_GETPARAM, &gp, sizeof(gp));
    if (ret) {
-      fprintf(stderr, "drm_i915_getparam: %d\n", ret);
+      _mesa_warning(NULL, "drm_i915_getparam: %d", ret);
       return GL_FALSE;
    }
 
@@ -348,6 +366,7 @@ intelDestroyScreen(__DRIscreenPrivate * sPriv)
 
    dri_bufmgr_destroy(intelScreen->bufmgr);
    intelUnmapScreenRegions(intelScreen);
+   driDestroyOptionInfo(&intelScreen->optionCache);
 
    FREE(intelScreen);
    sPriv->private = NULL;
@@ -362,15 +381,13 @@ intelCreateBuffer(__DRIscreenPrivate * driScrnPriv,
                   __DRIdrawablePrivate * driDrawPriv,
                   const __GLcontextModes * mesaVis, GLboolean isPixmap)
 {
-   intelScreenPrivate *screen = (intelScreenPrivate *) driScrnPriv->private;
-
    if (isPixmap) {
       return GL_FALSE;          /* not implemented */
    }
    else {
       GLboolean swStencil = (mesaVis->stencilBits > 0 &&
                              mesaVis->depthBits != 24);
-      GLenum rgbFormat = (mesaVis->redBits == 5 ? GL_RGB5 : GL_RGBA8);
+      GLenum rgbFormat;
 
       struct intel_framebuffer *intel_fb = CALLOC_STRUCT(intel_framebuffer);
 
@@ -379,6 +396,13 @@ intelCreateBuffer(__DRIscreenPrivate * driScrnPriv,
 
       _mesa_initialize_framebuffer(&intel_fb->Base, mesaVis);
 
+      if (mesaVis->redBits == 5)
+	 rgbFormat = GL_RGB5;
+      else if (mesaVis->alphaBits == 0)
+	 rgbFormat = GL_RGB8;
+      else
+	 rgbFormat = GL_RGBA8;
+
       /* setup the hardware-based renderbuffers */
       intel_fb->color_rb[0] = intel_create_renderbuffer(rgbFormat);
       _mesa_add_renderbuffer(&intel_fb->Base, BUFFER_FRONT_LEFT,
@@ -432,7 +456,31 @@ intelCreateBuffer(__DRIscreenPrivate * driScrnPriv,
 static void
 intelDestroyBuffer(__DRIdrawablePrivate * driDrawPriv)
 {
-   _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
+   /* Detach the region from every renderbuffer of this drawable (both
+    * colour buffers, then depth, then stencil) before the framebuffer
+    * reference itself is dropped.
+    */
+   struct intel_framebuffer *fb = driDrawPriv->driverPrivate;
+
+   if (fb != NULL) {
+      static const int attachments[] = { BUFFER_DEPTH, BUFFER_STENCIL };
+      struct intel_renderbuffer *irb;
+      unsigned i;
+
+      for (i = 0; i < 2; i++) {
+         if (fb->color_rb[i] != NULL)
+            intel_renderbuffer_set_region(fb->color_rb[i], NULL);
+      }
+
+      for (i = 0; i < 2; i++) {
+         irb = intel_get_renderbuffer(&fb->Base, attachments[i]);
+         if (irb != NULL)
+            intel_renderbuffer_set_region(irb, NULL);
+      }
+   }
+
+   _mesa_reference_framebuffer(
+      (GLframebuffer **) &driDrawPriv->driverPrivate, NULL);
 }
 
 
@@ -513,8 +561,6 @@ intelFillInModes(__DRIscreenPrivate *psp,
    __GLcontextModes *m;
    unsigned depth_buffer_factor;
    unsigned back_buffer_factor;
-   GLenum fb_format;
-   GLenum fb_type;
    int i;
 
    /* GLX_SWAP_COPY_OML is only supported because the Intel driver doesn't
@@ -526,6 +572,7 @@ intelFillInModes(__DRIscreenPrivate *psp,
 
    uint8_t depth_bits_array[3];
    uint8_t stencil_bits_array[3];
+   uint8_t msaa_samples_array[1];
 
    depth_bits_array[0] = 0;
    depth_bits_array[1] = depth_bits;
@@ -542,22 +589,39 @@ intelFillInModes(__DRIscreenPrivate *psp,
 
    stencil_bits_array[2] = (stencil_bits == 0) ? 8 : stencil_bits;
 
+   msaa_samples_array[0] = 0;
+
    depth_buffer_factor = ((depth_bits != 0) || (stencil_bits != 0)) ? 3 : 1;
    back_buffer_factor = (have_back_buffer) ? 3 : 1;
 
    if (pixel_bits == 16) {
-      fb_format = GL_RGB;
-      fb_type = GL_UNSIGNED_SHORT_5_6_5;
+      configs = driCreateConfigs(GL_RGB, GL_UNSIGNED_SHORT_5_6_5,
+				 depth_bits_array, stencil_bits_array,
+				 depth_buffer_factor, back_buffer_modes,
+				 back_buffer_factor,
+				 msaa_samples_array, 1);
    }
    else {
-      fb_format = GL_BGRA;
-      fb_type = GL_UNSIGNED_INT_8_8_8_8_REV;
+      __DRIconfig **configs_a8r8g8b8;
+      __DRIconfig **configs_x8r8g8b8;
+
+      configs_a8r8g8b8 = driCreateConfigs(GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV,
+					  depth_bits_array,
+					  stencil_bits_array,
+					  depth_buffer_factor,
+					  back_buffer_modes,
+					  back_buffer_factor,
+					  msaa_samples_array, 1);
+      configs_x8r8g8b8 = driCreateConfigs(GL_BGR, GL_UNSIGNED_INT_8_8_8_8_REV,
+					  depth_bits_array,
+					  stencil_bits_array,
+					  depth_buffer_factor,
+					  back_buffer_modes,
+					  back_buffer_factor,
+					  msaa_samples_array, 1);
+      configs = driConcatConfigs(configs_a8r8g8b8, configs_x8r8g8b8);
    }
 
-   configs = driCreateConfigs(fb_format, fb_type,
-			      depth_bits_array, stencil_bits_array,
-			      depth_buffer_factor, back_buffer_modes,
-			      back_buffer_factor);
    if (configs == NULL) {
     fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
               __LINE__);
@@ -584,6 +648,7 @@ intel_init_bufmgr(intelScreenPrivate *intelScreen)
    GLboolean gem_supported;
    struct drm_i915_getparam gp;
    __DRIscreenPrivate *spriv = intelScreen->driScrnPriv;
+   int num_fences = 0;
 
    intelScreen->no_hw = getenv("INTEL_NO_HW") != NULL;
 
@@ -613,10 +678,10 @@ intel_init_bufmgr(intelScreenPrivate *intelScreen)
    /* Otherwise, use the classic buffer manager. */
    if (intelScreen->bufmgr == NULL) {
       if (gem_disable) {
-	 fprintf(stderr, "GEM disabled.  Using classic.\n");
+	 _mesa_warning(NULL, "GEM disabled.  Using classic.");
       } else {
-	 fprintf(stderr, "Failed to initialize GEM.  "
-		 "Falling back to classic.\n");
+	 _mesa_warning(NULL,
+                       "Failed to initialize GEM.  Falling back to classic.");
       }
 
       if (intelScreen->tex.size == 0) {
@@ -634,8 +699,10 @@ intel_init_bufmgr(intelScreenPrivate *intelScreen)
 				&intelScreen->sarea->last_dispatch);
    }
 
-   /* XXX bufmgr should be per-screen, not per-context */
-   intelScreen->ttm = intelScreen->ttm;
+   if (intel_get_param(spriv, I915_PARAM_NUM_FENCES_AVAIL, &num_fences))
+      intelScreen->kernel_exec_fencing = !!num_fences;
+   else
+      intelScreen->kernel_exec_fencing = GL_FALSE;
 
    return GL_TRUE;
 }
@@ -719,6 +786,17 @@ static const
 __DRIconfig **intelInitScreen2(__DRIscreenPrivate *psp)
 {
    intelScreenPrivate *intelScreen;
+   GLenum fb_format[3];
+   GLenum fb_type[3];
+   /* GLX_SWAP_COPY_OML is only supported because the Intel driver doesn't
+    * support pageflipping at all.
+    */
+   static const GLenum back_buffer_modes[] = {
+      GLX_NONE, GLX_SWAP_UNDEFINED_OML, GLX_SWAP_COPY_OML
+   };
+   uint8_t depth_bits[4], stencil_bits[4], msaa_samples_array[1];
+   int color;
+   __DRIconfig **configs = NULL;
 
    /* Calling driInitExtensions here, with a NULL context pointer,
     * does not actually enable the extensions.  It just makes sure
@@ -758,8 +836,71 @@ __DRIconfig **intelInitScreen2(__DRIscreenPrivate *psp)
    intelScreen->irq_active = 1;
    psp->extensions = intelScreenExtensions;
 
-   return driConcatConfigs(intelFillInModes(psp, 16, 16, 0, 1),
-			   intelFillInModes(psp, 32, 24, 8, 1));
+   depth_bits[0] = 0;
+   stencil_bits[0] = 0;
+   depth_bits[1] = 16;
+   stencil_bits[1] = 0;
+   depth_bits[2] = 24;
+   stencil_bits[2] = 0;
+   depth_bits[3] = 24;
+   stencil_bits[3] = 8;
+
+   msaa_samples_array[0] = 0;
+
+   fb_format[0] = GL_RGB;
+   fb_type[0] = GL_UNSIGNED_SHORT_5_6_5;
+
+   fb_format[1] = GL_BGR;
+   fb_type[1] = GL_UNSIGNED_INT_8_8_8_8_REV;
+
+   fb_format[2] = GL_BGRA;
+   fb_type[2] = GL_UNSIGNED_INT_8_8_8_8_REV;
+
+   depth_bits[0] = 0;
+   stencil_bits[0] = 0;
+
+   for (color = 0; color < ARRAY_SIZE(fb_format); color++) {
+      __DRIconfig **new_configs;
+      int depth_factor;
+
+      /* With DRI2 right now, GetBuffers always returns a depth/stencil buffer
+       * with the same cpp as the drawable.  So we can't support depth cpp !=
+       * color cpp currently.
+       */
+      if (fb_type[color] == GL_UNSIGNED_SHORT_5_6_5) {
+	 depth_bits[1] = 16;
+	 stencil_bits[1] = 0;
+
+	 depth_factor = 2;
+      } else {
+	 depth_bits[1] = 24;
+	 stencil_bits[1] = 0;
+	 depth_bits[2] = 24;
+	 stencil_bits[2] = 8;
+
+	 depth_factor = 3;
+      }
+      new_configs = driCreateConfigs(fb_format[color], fb_type[color],
+				     depth_bits,
+				     stencil_bits,
+				     depth_factor,
+				     back_buffer_modes,
+				     ARRAY_SIZE(back_buffer_modes),
+				     msaa_samples_array,
+				     ARRAY_SIZE(msaa_samples_array));
+      if (configs == NULL)
+	 configs = new_configs;
+      else
+	 configs = driConcatConfigs(configs, new_configs);
+   }
+
+   if (configs == NULL) {
+      fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
+              __LINE__);
+      return NULL;
+   }
+
+   return (const __DRIconfig **)configs;
 }
 
 const struct __DriverAPIRec driDriverAPI = {
diff --git a/src/mesa/drivers/dri/intel/intel_screen.h b/src/mesa/drivers/dri/intel/intel_screen.h
index fcd0d9c28c..a9b9e109a6 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.h
+++ b/src/mesa/drivers/dri/intel/intel_screen.h
@@ -79,6 +79,7 @@ typedef struct
    GLboolean no_vbo;
    int ttm;
    dri_bufmgr *bufmgr;
+   GLboolean kernel_exec_fencing;
 
    /**
    * Configuration cache with default values for all contexts
@@ -92,10 +93,6 @@ extern GLboolean intelMapScreenRegions(__DRIscreenPrivate * sPriv);
 
 extern void intelUnmapScreenRegions(intelScreenPrivate * intelScreen);
 
-extern void
-intelUpdateScreenFromSAREA(intelScreenPrivate * intelScreen,
-                           drm_i915_sarea_t * sarea);
-
 extern void intelDestroyContext(__DRIcontextPrivate * driContextPriv);
 
 extern GLboolean intelUnbindContext(__DRIcontextPrivate * driContextPriv);
@@ -105,11 +102,6 @@ intelMakeCurrent(__DRIcontextPrivate * driContextPriv,
                  __DRIdrawablePrivate * driDrawPriv,
                  __DRIdrawablePrivate * driReadPriv);
 
-extern void intelSwapBuffers(__DRIdrawablePrivate * dPriv);
-
-extern void
-intelCopySubBuffer(__DRIdrawablePrivate * dPriv, int x, int y, int w, int h);
-
 extern struct intel_context *intelScreenContext(intelScreenPrivate *intelScreen);
 
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_span.c b/src/mesa/drivers/dri/intel/intel_span.c
index d9315043e6..1e73943457 100644
--- a/src/mesa/drivers/dri/intel/intel_span.c
+++ b/src/mesa/drivers/dri/intel/intel_span.c
@@ -29,6 +29,7 @@
 #include "main/macros.h"
 #include "main/mtypes.h"
 #include "main/colormac.h"
+#include "main/texformat.h"
 
 #include "intel_buffers.h"
 #include "intel_fbo.h"
@@ -131,6 +132,18 @@ pwrite_8(struct intel_renderbuffer *irb, uint32_t offset, uint8_t val)
    dri_bo_subdata(irb->region->buffer, offset, 1, &val);
 }
 
+/* Rotate right by 8 bits: the low byte moves to the top of the word. */
+static uint32_t z24s8_to_s8z24(uint32_t val)
+{
+   return (val >> 8) | (val << 24);
+}
+
+/* Rotate left by 8 bits: the top byte moves to the bottom of the word. */
+static uint32_t s8z24_to_z24s8(uint32_t val)
+{
+   return (val << 8) | (val >> 24);
+}
+
 static uint32_t no_tile_swizzle(struct intel_renderbuffer *irb,
 				int x, int y)
 {
@@ -150,7 +163,7 @@ static uint32_t x_tile_swizzle(struct intel_renderbuffer *irb,
 	int	x_tile_number, y_tile_number;
 	int	tile_off, tile_base;
 	
-	tile_stride = (irb->pfPitch * irb->region->cpp) << 3;
+	tile_stride = (irb->region->pitch * irb->region->cpp) << 3;
 
 	xbyte = x * irb->region->cpp;
 
@@ -190,7 +203,7 @@ static uint32_t x_tile_swizzle(struct intel_renderbuffer *irb,
 	printf("(%d,%d) -> %d + %d = %d (pitch = %d, tstride = %d)\n",
 	       x, y, tile_off, tile_base,
 	       tile_off + tile_base,
-	       irb->pfPitch, tile_stride);
+	       irb->region->pitch, tile_stride);
 #endif
 
 	return tile_base + tile_off;
@@ -205,7 +218,7 @@ static uint32_t y_tile_swizzle(struct intel_renderbuffer *irb,
 	int	x_tile_number, y_tile_number;
 	int	tile_off, tile_base;
 	
-	tile_stride = (irb->pfPitch * irb->region->cpp) << 5;
+	tile_stride = (irb->region->pitch * irb->region->cpp) << 5;
 
 	xbyte = x * irb->region->cpp;
 
@@ -255,8 +268,8 @@ static uint32_t y_tile_swizzle(struct intel_renderbuffer *irb,
 #define LOCAL_VARS							\
    struct intel_context *intel = intel_context(ctx);			\
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);		\
-   const GLint yScale = irb->RenderToTexture ? 1 : -1;			\
-   const GLint yBias = irb->RenderToTexture ? 0 : irb->Base.Height - 1;	\
+   const GLint yScale = ctx->DrawBuffer->Name ? 1 : -1;			\
+   const GLint yBias = ctx->DrawBuffer->Name ? 0 : irb->Base.Height - 1;\
    unsigned int num_cliprects;						\
    struct drm_clip_rect *cliprects;					\
    int x_off, y_off;							\
@@ -293,107 +306,51 @@ static uint32_t y_tile_swizzle(struct intel_renderbuffer *irb,
 #define X_TILE(_X, _Y) x_tile_swizzle(irb, (_X) + x_off, (_Y) + y_off)
 #define Y_TILE(_X, _Y) y_tile_swizzle(irb, (_X) + x_off, (_Y) + y_off)
 
-/* 16 bit, RGB565 color spanline and pixel functions
- */
-#define SPANTMP_PIXEL_FMT GL_RGB
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
-
-#define TAG(x)    intel##x##_RGB565
-#define TAG2(x,y) intel##x##_RGB565##y
-#define GET_VALUE(X, Y) pread_16(irb, NO_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_16(irb, NO_TILE(X, Y), V)
-#include "spantmp2.h"
-
-/* 32 bit, ARGB8888 color spanline and pixel functions
- */
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    intel##x##_ARGB8888
-#define TAG2(x,y) intel##x##_ARGB8888##y
-#define GET_VALUE(X, Y) pread_32(irb, NO_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_32(irb, NO_TILE(X, Y), V)
-#include "spantmp2.h"
-
-/* 32 bit, xRGB8888 color spanline and pixel functions
- */
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    intel##x##_xRGB8888
-#define TAG2(x,y) intel##x##_xRGB8888##y
-#define GET_VALUE(X, Y) pread_xrgb8888(irb, NO_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_xrgb8888(irb, NO_TILE(X, Y), V)
-#include "spantmp2.h"
-
-/* 16 bit RGB565 color tile spanline and pixel functions
- */
-
-#define SPANTMP_PIXEL_FMT GL_RGB
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
-
-#define TAG(x)    intel_XTile_##x##_RGB565
-#define TAG2(x,y) intel_XTile_##x##_RGB565##y
-#define GET_VALUE(X, Y) pread_16(irb, X_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_16(irb, X_TILE(X, Y), V)
-#include "spantmp2.h"
-
-#define SPANTMP_PIXEL_FMT GL_RGB
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
-
-#define TAG(x)    intel_YTile_##x##_RGB565
-#define TAG2(x,y) intel_YTile_##x##_RGB565##y
-#define GET_VALUE(X, Y) pread_16(irb, Y_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_16(irb, Y_TILE(X, Y), V)
-#include "spantmp2.h"
-
-/* 32 bit ARGB888 color tile spanline and pixel functions
- */
-
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    intel_XTile_##x##_ARGB8888
-#define TAG2(x,y) intel_XTile_##x##_ARGB8888##y
-#define GET_VALUE(X, Y) pread_32(irb, X_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_32(irb, X_TILE(X, Y), V)
-#include "spantmp2.h"
-
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    intel_YTile_##x##_ARGB8888
-#define TAG2(x,y) intel_YTile_##x##_ARGB8888##y
-#define GET_VALUE(X, Y) pread_32(irb, Y_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_32(irb, Y_TILE(X, Y), V)
-#include "spantmp2.h"
-
-/* 32 bit xRGB888 color tile spanline and pixel functions
- */
-
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    intel_XTile_##x##_xRGB8888
-#define TAG2(x,y) intel_XTile_##x##_xRGB8888##y
-#define GET_VALUE(X, Y) pread_xrgb8888(irb, X_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_xrgb8888(irb, X_TILE(X, Y), V)
-#include "spantmp2.h"
-
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    intel_YTile_##x##_xRGB8888
-#define TAG2(x,y) intel_YTile_##x##_xRGB8888##y
-#define GET_VALUE(X, Y) pread_xrgb8888(irb, Y_TILE(X, Y))
-#define PUT_VALUE(X, Y, V) pwrite_xrgb8888(irb, Y_TILE(X, Y), V)
-#include "spantmp2.h"
+/* r5g6b5 color span and pixel functions */
+#define INTEL_PIXEL_FMT GL_RGB
+#define INTEL_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
+#define INTEL_READ_VALUE(offset) pread_16(irb, offset)
+#define INTEL_WRITE_VALUE(offset, v) pwrite_16(irb, offset, v)
+#define INTEL_TAG(x) x##_RGB565
+#include "intel_spantmp.h"  /* emits intel*, intel_XTile_* and intel_YTile_* variants */
+
+/* a4r4g4b4 color span and pixel functions */
+#define INTEL_PIXEL_FMT GL_BGRA
+#define INTEL_PIXEL_TYPE GL_UNSIGNED_SHORT_4_4_4_4_REV
+#define INTEL_READ_VALUE(offset) pread_16(irb, offset)
+#define INTEL_WRITE_VALUE(offset, v) pwrite_16(irb, offset, v)
+#define INTEL_TAG(x) x##_ARGB4444
+#include "intel_spantmp.h"
+
+/* a1r5g5b5 color span and pixel functions */
+#define INTEL_PIXEL_FMT GL_BGRA
+#define INTEL_PIXEL_TYPE GL_UNSIGNED_SHORT_1_5_5_5_REV
+#define INTEL_READ_VALUE(offset) pread_16(irb, offset)
+#define INTEL_WRITE_VALUE(offset, v) pwrite_16(irb, offset, v)
+#define INTEL_TAG(x) x##_ARGB1555
+#include "intel_spantmp.h"
+
+/* a8r8g8b8 color span and pixel functions */
+#define INTEL_PIXEL_FMT GL_BGRA
+#define INTEL_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
+#define INTEL_READ_VALUE(offset) pread_32(irb, offset)
+#define INTEL_WRITE_VALUE(offset, v) pwrite_32(irb, offset, v)
+#define INTEL_TAG(x) x##_ARGB8888
+#include "intel_spantmp.h"
+
+/* x8r8g8b8 color span and pixel functions */
+#define INTEL_PIXEL_FMT GL_BGRA
+#define INTEL_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV  /* same fmt/type as a8r8g8b8 */
+#define INTEL_READ_VALUE(offset) pread_xrgb8888(irb, offset)  /* only the accessors differ */
+#define INTEL_WRITE_VALUE(offset, v) pwrite_xrgb8888(irb, offset, v)
+#define INTEL_TAG(x) x##_xRGB8888
+#include "intel_spantmp.h"
 
 #define LOCAL_DEPTH_VARS						\
    struct intel_context *intel = intel_context(ctx);			\
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);		\
-   const GLint yScale = irb->RenderToTexture ? 1 : -1;			\
-   const GLint yBias = irb->RenderToTexture ? 0 : irb->Base.Height - 1; \
+   const GLint yScale = ctx->DrawBuffer->Name ? 1 : -1;			\
+   const GLint yBias = ctx->DrawBuffer->Name ? 0 : irb->Base.Height - 1;\
    unsigned int num_cliprects;						\
    struct drm_clip_rect *cliprects;					\
    int x_off, y_off;							\
@@ -402,98 +359,26 @@ static uint32_t y_tile_swizzle(struct intel_renderbuffer *irb,
 
 #define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
 
-/**
- ** 16-bit depthbuffer functions.
- **/
-#define VALUE_TYPE GLushort
-#define WRITE_DEPTH(_x, _y, d) pwrite_16(irb, NO_TILE(_x, _y), d)
-#define READ_DEPTH(d, _x, _y) d = pread_16(irb, NO_TILE(_x, _y))
-#define TAG(x) intel##x##_z16
-#include "depthtmp.h"
-
+/* z16 depthbuffer functions. */
+#define INTEL_VALUE_TYPE GLushort
+#define INTEL_WRITE_DEPTH(offset, d) pwrite_16(irb, offset, d)  /* value stored as passed in */
+#define INTEL_READ_DEPTH(offset) pread_16(irb, offset)
+#define INTEL_TAG(name) name##_z16
+#include "intel_depthtmp.h"
 
-/**
- ** 16-bit x tile depthbuffer functions.
- **/
-#define VALUE_TYPE GLushort
-#define WRITE_DEPTH(_x, _y, d) pwrite_16(irb, X_TILE(_x, _y), d)
-#define READ_DEPTH(d, _x, _y) d = pread_16(irb, X_TILE(_x, _y))
-#define TAG(x) intel_XTile_##x##_z16
-#include "depthtmp.h"
+/* z24 depthbuffer functions. */
+#define INTEL_VALUE_TYPE GLuint
+#define INTEL_WRITE_DEPTH(offset, d) pwrite_32(irb, offset, d)  /* full 32-bit word, no repacking */
+#define INTEL_READ_DEPTH(offset) pread_32(irb, offset)
+#define INTEL_TAG(name) name##_z24
+#include "intel_depthtmp.h"
 
-/**
- ** 16-bit y tile depthbuffer functions.
- **/
-#define VALUE_TYPE GLushort
-#define WRITE_DEPTH(_x, _y, d) pwrite_16(irb, Y_TILE(_x, _y), d)
-#define READ_DEPTH(d, _x, _y) d = pread_16(irb, Y_TILE(_x, _y))
-#define TAG(x) intel_YTile_##x##_z16
-#include "depthtmp.h"
-
-
-/**
- ** 24/8-bit interleaved depth/stencil functions
- ** Note: we're actually reading back combined depth+stencil values.
- ** The wrappers in main/depthstencil.c are used to extract the depth
- ** and stencil values.
- **/
-#define VALUE_TYPE GLuint
-
-/* Change ZZZS -> SZZZ */
-#define WRITE_DEPTH(_x, _y, d)					\
-   pwrite_32(irb, NO_TILE(_x, _y), ((d) >> 8) | ((d) << 24))
-
-/* Change SZZZ -> ZZZS */
-#define READ_DEPTH( d, _x, _y ) {				\
-   GLuint tmp = pread_32(irb, NO_TILE(_x, _y));			\
-   d = (tmp << 8) | (tmp >> 24);				\
-}
-
-#define TAG(x) intel##x##_z24_s8
-#include "depthtmp.h"
-
-
-/**
- ** 24/8-bit x-tile interleaved depth/stencil functions
- ** Note: we're actually reading back combined depth+stencil values.
- ** The wrappers in main/depthstencil.c are used to extract the depth
- ** and stencil values.
- **/
-#define VALUE_TYPE GLuint
-
-/* Change ZZZS -> SZZZ */
-#define WRITE_DEPTH(_x, _y, d)					\
-   pwrite_32(irb, X_TILE(_x, _y), ((d) >> 8) | ((d) << 24))
-
-/* Change SZZZ -> ZZZS */
-#define READ_DEPTH( d, _x, _y ) {				\
-   GLuint tmp = pread_32(irb, X_TILE(_x, _y));		\
-   d = (tmp << 8) | (tmp >> 24);				\
-}
-
-#define TAG(x) intel_XTile_##x##_z24_s8
-#include "depthtmp.h"
-
-/**
- ** 24/8-bit y-tile interleaved depth/stencil functions
- ** Note: we're actually reading back combined depth+stencil values.
- ** The wrappers in main/depthstencil.c are used to extract the depth
- ** and stencil values.
- **/
-#define VALUE_TYPE GLuint
-
-/* Change ZZZS -> SZZZ */
-#define WRITE_DEPTH(_x, _y, d)					\
-   pwrite_32(irb, Y_TILE(_x, _y), ((d) >> 8) | ((d) << 24))
-
-/* Change SZZZ -> ZZZS */
-#define READ_DEPTH( d, _x, _y ) {				\
-   GLuint tmp = pread_32(irb, Y_TILE(_x, _y));			\
-   d = (tmp << 8) | (tmp >> 24);				\
-}
-
-#define TAG(x) intel_YTile_##x##_z24_s8
-#include "depthtmp.h"
+/* z24s8 depthbuffer functions. */
+#define INTEL_VALUE_TYPE GLuint
+#define INTEL_WRITE_DEPTH(offset, d) pwrite_32(irb, offset, z24s8_to_s8z24(d))  /* repack before storing */
+#define INTEL_READ_DEPTH(offset) s8z24_to_z24s8(pread_32(irb, offset))  /* inverse repack after loading */
+#define INTEL_TAG(name) name##_z24_s8
+#include "intel_depthtmp.h"
 
 
 /**
@@ -528,8 +413,6 @@ intel_renderbuffer_map(struct intel_context *intel, struct gl_renderbuffer *rb)
    if (irb == NULL || irb->region == NULL)
       return;
 
-   irb->pfPitch = irb->region->pitch;
-
    intel_set_span_functions(intel, rb);
 }
 
@@ -543,7 +426,6 @@ intel_renderbuffer_unmap(struct intel_context *intel,
       return;
 
    clear_span_cache(irb);
-   irb->pfPitch = 0;
 
    rb->GetRow = NULL;
    rb->PutRow = NULL;
@@ -562,23 +444,32 @@ intel_renderbuffer_unmap(struct intel_context *intel,
  * _ColorReadBuffer, _DepthBuffer or _StencilBuffer fields.
  */
 static void
-intel_map_unmap_buffers(struct intel_context *intel, GLboolean map)
+intel_map_unmap_framebuffer(struct intel_context *intel,
+			    struct gl_framebuffer *fb,
+			    GLboolean map)
 {
    GLcontext *ctx = &intel->ctx;
    GLuint i, j;
 
-   /* color draw buffers */
-   for (j = 0; j < ctx->DrawBuffer->_NumColorDrawBuffers; j++) {
+   /* color buffers */
+   if (fb == ctx->DrawBuffer) {
+      for (j = 0; j < fb->_NumColorDrawBuffers; j++) {
+	 if (map)
+	    intel_renderbuffer_map(intel, fb->_ColorDrawBuffers[j]);
+	 else
+	    intel_renderbuffer_unmap(intel, fb->_ColorDrawBuffers[j]);
+      }
+   } else {
       if (map)
-	 intel_renderbuffer_map(intel, ctx->DrawBuffer->_ColorDrawBuffers[j]);
+	 intel_renderbuffer_map(intel, fb->_ColorReadBuffer);
       else
-	 intel_renderbuffer_unmap(intel, ctx->DrawBuffer->_ColorDrawBuffers[j]);
+	 intel_renderbuffer_unmap(intel, fb->_ColorReadBuffer);
    }
 
    /* check for render to textures */
    for (i = 0; i < BUFFER_COUNT; i++) {
       struct gl_renderbuffer_attachment *att =
-         ctx->DrawBuffer->Attachment + i;
+         fb->Attachment + i;
       struct gl_texture_object *tex = att->Texture;
       if (tex) {
          /* render to texture */
@@ -590,36 +481,28 @@ intel_map_unmap_buffers(struct intel_context *intel, GLboolean map)
       }
    }
 
-   /* color read buffers */
-   if (map)
-      intel_renderbuffer_map(intel, ctx->ReadBuffer->_ColorReadBuffer);
-   else
-      intel_renderbuffer_unmap(intel, ctx->ReadBuffer->_ColorReadBuffer);
-
    /* depth buffer (Note wrapper!) */
-   if (ctx->DrawBuffer->_DepthBuffer) {
+   if (fb->_DepthBuffer) {
       if (map)
-	 intel_renderbuffer_map(intel, ctx->DrawBuffer->_DepthBuffer->Wrapped);
+	 intel_renderbuffer_map(intel, fb->_DepthBuffer->Wrapped);
       else
 	 intel_renderbuffer_unmap(intel,
-				  ctx->DrawBuffer->_DepthBuffer->Wrapped);
+				  fb->_DepthBuffer->Wrapped);
    }
 
    /* stencil buffer (Note wrapper!) */
-   if (ctx->DrawBuffer->_StencilBuffer) {
+   if (fb->_StencilBuffer) {
       if (map)
 	 intel_renderbuffer_map(intel,
-				ctx->DrawBuffer->_StencilBuffer->Wrapped);
+				fb->_StencilBuffer->Wrapped);
       else
 	 intel_renderbuffer_unmap(intel,
-				  ctx->DrawBuffer->_StencilBuffer->Wrapped);
+				  fb->_StencilBuffer->Wrapped);
    }
 }
 
-
-
 /**
- * Prepare for softare rendering.  Map current read/draw framebuffers'
+ * Prepare for software rendering.  Map current read/draw framebuffers'
  * renderbuffes and all currently bound texture objects.
  *
  * Old note: Moved locking out to get reasonable span performance.
@@ -640,11 +523,12 @@ intelSpanRenderStart(GLcontext * ctx)
       }
    }
 
-   intel_map_unmap_buffers(intel, GL_TRUE);
+   intel_map_unmap_framebuffer(intel, ctx->DrawBuffer, GL_TRUE);
+   intel_map_unmap_framebuffer(intel, ctx->ReadBuffer, GL_TRUE);
 }
 
 /**
- * Called when done softare rendering.  Unmap the buffers we mapped in
+ * Called when done software rendering.  Unmap the buffers we mapped in
  * the above function.
  */
 void
@@ -662,7 +546,8 @@ intelSpanRenderFinish(GLcontext * ctx)
       }
    }
 
-   intel_map_unmap_buffers(intel, GL_FALSE);
+   intel_map_unmap_framebuffer(intel, ctx->DrawBuffer, GL_FALSE);
+   intel_map_unmap_framebuffer(intel, ctx->ReadBuffer, GL_FALSE);
 
    UNLOCK_HARDWARE(intel);
 }
@@ -696,8 +581,8 @@ intel_set_span_functions(struct intel_context *intel,
    else
       tiling = I915_TILING_NONE;
 
-   if (rb->_ActualFormat == GL_RGB5) {
-      /* 565 RGB */
+   switch (irb->texformat->MesaFormat) {
+   case MESA_FORMAT_RGB565:
       switch (tiling) {
       case I915_TILING_NONE:
       default:
@@ -710,38 +595,67 @@ intel_set_span_functions(struct intel_context *intel,
 	 intel_YTile_InitPointers_RGB565(rb);
 	 break;
       }
-   }
-   else if (rb->_ActualFormat == GL_RGB8) {
-      /* 8888 RGBx */
+      break;
+   case MESA_FORMAT_ARGB4444:
       switch (tiling) {
       case I915_TILING_NONE:
       default:
-	 intelInitPointers_xRGB8888(rb);
+	 intelInitPointers_ARGB4444(rb);
 	 break;
       case I915_TILING_X:
-	 intel_XTile_InitPointers_xRGB8888(rb);
+	 intel_XTile_InitPointers_ARGB4444(rb);
 	 break;
       case I915_TILING_Y:
-	 intel_YTile_InitPointers_xRGB8888(rb);
+	 intel_YTile_InitPointers_ARGB4444(rb);
 	 break;
       }
-   }
-   else if (rb->_ActualFormat == GL_RGBA8) {
-      /* 8888 RGBA */
+      break;
+   case MESA_FORMAT_ARGB1555:
       switch (tiling) {
       case I915_TILING_NONE:
       default:
-	 intelInitPointers_ARGB8888(rb);
+	 intelInitPointers_ARGB1555(rb);
 	 break;
       case I915_TILING_X:
-	 intel_XTile_InitPointers_ARGB8888(rb);
+	 intel_XTile_InitPointers_ARGB1555(rb);
 	 break;
       case I915_TILING_Y:
-	 intel_YTile_InitPointers_ARGB8888(rb);
+	 intel_YTile_InitPointers_ARGB1555(rb);
 	 break;
       }
-   }
-   else if (rb->_ActualFormat == GL_DEPTH_COMPONENT16) {
+      break;
+   case MESA_FORMAT_ARGB8888:
+      if (rb->AlphaBits == 0) { /* XXX: Need xRGB8888 Mesa format */
+	 /* 8888 RGBx */
+	 switch (tiling) {
+	 case I915_TILING_NONE:
+	 default:
+	    intelInitPointers_xRGB8888(rb);
+	    break;
+	 case I915_TILING_X:
+	    intel_XTile_InitPointers_xRGB8888(rb);
+	    break;
+	 case I915_TILING_Y:
+	    intel_YTile_InitPointers_xRGB8888(rb);
+	    break;
+	 }
+      } else {
+	 /* 8888 RGBA */
+	 switch (tiling) {
+	 case I915_TILING_NONE:
+	 default:
+	    intelInitPointers_ARGB8888(rb);
+	    break;
+	 case I915_TILING_X:
+	    intel_XTile_InitPointers_ARGB8888(rb);
+	    break;
+	 case I915_TILING_Y:
+	    intel_YTile_InitPointers_ARGB8888(rb);
+	    break;
+	 }
+      }
+      break;
+   case MESA_FORMAT_Z16:
       switch (tiling) {
       case I915_TILING_NONE:
       default:
@@ -754,38 +668,60 @@ intel_set_span_functions(struct intel_context *intel,
 	 intel_YTile_InitDepthPointers_z16(rb);
 	 break;
       }
-   }
-   else if (rb->_ActualFormat == GL_DEPTH_COMPONENT24 ||        /* XXX FBO remove */
-            rb->_ActualFormat == GL_DEPTH24_STENCIL8_EXT) {
-      switch (tiling) {
-      case I915_TILING_NONE:
-      default:
-	 intelInitDepthPointers_z24_s8(rb);
-	 break;
-      case I915_TILING_X:
-	 intel_XTile_InitDepthPointers_z24_s8(rb);
-	 break;
-      case I915_TILING_Y:
-	 intel_YTile_InitDepthPointers_z24_s8(rb);
-	 break;
-      }
-   }
-   else if (rb->_ActualFormat == GL_STENCIL_INDEX8_EXT) {
-      switch (tiling) {
-      case I915_TILING_NONE:
-      default:
-	 intelInitStencilPointers_z24_s8(rb);
-	 break;
-      case I915_TILING_X:
-	 intel_XTile_InitStencilPointers_z24_s8(rb);
-	 break;
-      case I915_TILING_Y:
-	 intel_YTile_InitStencilPointers_z24_s8(rb);
-	 break;
+      break;
+   case MESA_FORMAT_S8_Z24:
+      /* There are a few different ways SW asks us to access the S8Z24 data:
+       * Z24 depth-only depth reads
+       * S8Z24 depth reads
+       * S8Z24 stencil reads.
+       */
+      if (rb->_ActualFormat == GL_DEPTH_COMPONENT24) {
+	 switch (tiling) {
+	 case I915_TILING_NONE:
+	 default:
+	    intelInitDepthPointers_z24(rb);
+	    break;
+	 case I915_TILING_X:
+	    intel_XTile_InitDepthPointers_z24(rb);
+	    break;
+	 case I915_TILING_Y:
+	    intel_YTile_InitDepthPointers_z24(rb);
+	    break;
+	 }
+      } else if (rb->_ActualFormat == GL_DEPTH24_STENCIL8_EXT) {
+	 switch (tiling) {
+	 case I915_TILING_NONE:
+	 default:
+	    intelInitDepthPointers_z24_s8(rb);
+	    break;
+	 case I915_TILING_X:
+	    intel_XTile_InitDepthPointers_z24_s8(rb);
+	    break;
+	 case I915_TILING_Y:
+	    intel_YTile_InitDepthPointers_z24_s8(rb);
+	    break;
+	 }
+      } else if (rb->_ActualFormat == GL_STENCIL_INDEX8_EXT) {
+	 switch (tiling) {
+	 case I915_TILING_NONE:
+	 default:
+	    intelInitStencilPointers_z24_s8(rb);
+	    break;
+	 case I915_TILING_X:
+	    intel_XTile_InitStencilPointers_z24_s8(rb);
+	    break;
+	 case I915_TILING_Y:
+	    intel_YTile_InitStencilPointers_z24_s8(rb);
+	    break;
+	 }
+      } else {
+	 _mesa_problem(NULL,
+		       "Unexpected ActualFormat in intelSetSpanFunctions");
       }
-   }
-   else {
+      break;
+   default:
       _mesa_problem(NULL,
-                    "Unexpected _ActualFormat in intelSetSpanFunctions");
+                    "Unexpected MesaFormat in intelSetSpanFunctions");
+      break;
    }
 }
diff --git a/src/mesa/drivers/dri/intel/intel_spantmp.h b/src/mesa/drivers/dri/intel/intel_spantmp.h
new file mode 100644
index 0000000000..ead0b1c168
--- /dev/null
+++ b/src/mesa/drivers/dri/intel/intel_spantmp.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2009 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+/**
+ * Wrapper around the spantmp2.h macrofest to generate spans code for
+ * all the tiling styles.
+ */
+
+#define SPANTMP_PIXEL_FMT INTEL_PIXEL_FMT
+#define SPANTMP_PIXEL_TYPE INTEL_PIXEL_TYPE
+#define PUT_VALUE(_x, _y, v) INTEL_WRITE_VALUE(NO_TILE(_x, _y), v)
+#define GET_VALUE(_x, _y) INTEL_READ_VALUE(NO_TILE(_x, _y))
+#define TAG(x) INTEL_TAG(intel##x)
+#define TAG2(x, y) INTEL_TAG(intel##x)##y
+#include "spantmp2.h"  /* untiled variant: offsets come from NO_TILE */
+
+#define SPANTMP_PIXEL_FMT INTEL_PIXEL_FMT
+#define SPANTMP_PIXEL_TYPE INTEL_PIXEL_TYPE
+#define PUT_VALUE(_x, _y, v) INTEL_WRITE_VALUE(X_TILE(_x, _y), v)
+#define GET_VALUE(_x, _y) INTEL_READ_VALUE(X_TILE(_x, _y))
+#define TAG(x) INTEL_TAG(intel_XTile_##x)
+#define TAG2(x, y) INTEL_TAG(intel_XTile_##x)##y
+#include "spantmp2.h"  /* X-tiled variant: offsets come from X_TILE */
+
+#define SPANTMP_PIXEL_FMT INTEL_PIXEL_FMT
+#define SPANTMP_PIXEL_TYPE INTEL_PIXEL_TYPE
+#define PUT_VALUE(_x, _y, v) INTEL_WRITE_VALUE(Y_TILE(_x, _y), v)
+#define GET_VALUE(_x, _y) INTEL_READ_VALUE(Y_TILE(_x, _y))
+#define TAG(x) INTEL_TAG(intel_YTile_##x)
+#define TAG2(x, y) INTEL_TAG(intel_YTile_##x)##y
+#include "spantmp2.h"  /* Y-tiled variant: offsets must come from Y_TILE, not X_TILE */
+
+#undef INTEL_PIXEL_FMT  /* clear the per-format inputs so the includer can define the next set */
+#undef INTEL_PIXEL_TYPE
+#undef INTEL_WRITE_VALUE
+#undef INTEL_READ_VALUE
+#undef INTEL_TAG
diff --git a/src/mesa/drivers/dri/intel/intel_state.c b/src/mesa/drivers/dri/intel/intel_state.c
new file mode 100644
index 0000000000..4ee742377d
--- /dev/null
+++ b/src/mesa/drivers/dri/intel/intel_state.c
@@ -0,0 +1,233 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "main/macros.h"
+#include "main/enums.h"
+#include "main/colormac.h"
+#include "main/dd.h"
+
+#include "intel_screen.h"
+#include "intel_context.h"
+#include "intel_regions.h"
+#include "swrast/swrast.h"
+
+int
+intel_translate_shadow_compare_func(GLenum func)
+{  /* Translate a GL comparison enum into a COMPAREFUNC_* code. */
+   switch (func) {  /* swaps NEVER/ALWAYS, EQUAL/NOTEQUAL, LESS/LEQUAL, GREATER/GEQUAL */
+   case GL_NEVER: 
+       return COMPAREFUNC_ALWAYS;  /* NOTE(review): swap presumably intended -- confirm */
+   case GL_LESS: 
+       return COMPAREFUNC_LEQUAL;
+   case GL_LEQUAL: 
+       return COMPAREFUNC_LESS;
+   case GL_GREATER: 
+       return COMPAREFUNC_GEQUAL;
+   case GL_GEQUAL: 
+      return COMPAREFUNC_GREATER;
+   case GL_NOTEQUAL: 
+      return COMPAREFUNC_EQUAL;
+   case GL_EQUAL: 
+      return COMPAREFUNC_NOTEQUAL;
+   case GL_ALWAYS: 
+       return COMPAREFUNC_NEVER;
+   }
+
+   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, func);  /* no case matched */
+   return COMPAREFUNC_NEVER;  /* fallback for unrecognized enums */
+}
+
+int
+intel_translate_compare_func(GLenum func)
+{  /* Translate a GL comparison enum into the same-named COMPAREFUNC_* code. */
+   switch (func) {
+   case GL_NEVER:
+      return COMPAREFUNC_NEVER;
+   case GL_LESS:
+      return COMPAREFUNC_LESS;
+   case GL_LEQUAL:
+      return COMPAREFUNC_LEQUAL;
+   case GL_GREATER:
+      return COMPAREFUNC_GREATER;
+   case GL_GEQUAL:
+      return COMPAREFUNC_GEQUAL;
+   case GL_NOTEQUAL:
+      return COMPAREFUNC_NOTEQUAL;
+   case GL_EQUAL:
+      return COMPAREFUNC_EQUAL;
+   case GL_ALWAYS:
+      return COMPAREFUNC_ALWAYS;
+   }
+
+   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, func);  /* no case matched */
+   return COMPAREFUNC_ALWAYS;  /* fallback for unrecognized enums */
+}
+
+int
+intel_translate_stencil_op(GLenum op)
+{  /* Translate a GL stencil operation enum into a STENCILOP_* code. */
+   switch (op) {
+   case GL_KEEP:
+      return STENCILOP_KEEP;
+   case GL_ZERO:
+      return STENCILOP_ZERO;
+   case GL_REPLACE:
+      return STENCILOP_REPLACE;
+   case GL_INCR:
+      return STENCILOP_INCRSAT;  /* plain GL_INCR/GL_DECR map to the *SAT codes */
+   case GL_DECR:
+      return STENCILOP_DECRSAT;
+   case GL_INCR_WRAP:
+      return STENCILOP_INCR;  /* the *_WRAP enums map to the non-SAT codes */
+   case GL_DECR_WRAP:
+      return STENCILOP_DECR;
+   case GL_INVERT:
+      return STENCILOP_INVERT;
+   default:
+      return STENCILOP_ZERO;  /* unrecognized enum: no diagnostic is printed */
+   }
+}
+
+int
+intel_translate_blend_factor(GLenum factor)
+{  /* Translate a GL blend factor enum into a BLENDFACT_* code. */
+   switch (factor) {
+   case GL_ZERO:
+      return BLENDFACT_ZERO;
+   case GL_SRC_ALPHA:
+      return BLENDFACT_SRC_ALPHA;
+   case GL_ONE:
+      return BLENDFACT_ONE;
+   case GL_SRC_COLOR:
+      return BLENDFACT_SRC_COLR;
+   case GL_ONE_MINUS_SRC_COLOR:
+      return BLENDFACT_INV_SRC_COLR;
+   case GL_DST_COLOR:
+      return BLENDFACT_DST_COLR;
+   case GL_ONE_MINUS_DST_COLOR:
+      return BLENDFACT_INV_DST_COLR;
+   case GL_ONE_MINUS_SRC_ALPHA:
+      return BLENDFACT_INV_SRC_ALPHA;
+   case GL_DST_ALPHA:
+      return BLENDFACT_DST_ALPHA;
+   case GL_ONE_MINUS_DST_ALPHA:
+      return BLENDFACT_INV_DST_ALPHA;
+   case GL_SRC_ALPHA_SATURATE:
+      return BLENDFACT_SRC_ALPHA_SATURATE;
+   case GL_CONSTANT_COLOR:
+      return BLENDFACT_CONST_COLOR;
+   case GL_ONE_MINUS_CONSTANT_COLOR:
+      return BLENDFACT_INV_CONST_COLOR;
+   case GL_CONSTANT_ALPHA:
+      return BLENDFACT_CONST_ALPHA;
+   case GL_ONE_MINUS_CONSTANT_ALPHA:
+      return BLENDFACT_INV_CONST_ALPHA;
+   }
+
+   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, factor);  /* no case matched */
+   return BLENDFACT_ZERO;  /* fallback for unrecognized enums */
+}
+
+int
+intel_translate_logic_op(GLenum opcode)
+{  /* Translate a GL logic op enum into a LOGICOP_* code. */
+   switch (opcode) {
+   case GL_CLEAR:
+      return LOGICOP_CLEAR;
+   case GL_AND:
+      return LOGICOP_AND;
+   case GL_AND_REVERSE:
+      return LOGICOP_AND_RVRSE;
+   case GL_COPY:
+      return LOGICOP_COPY;
+   case GL_COPY_INVERTED:
+      return LOGICOP_COPY_INV;
+   case GL_AND_INVERTED:
+      return LOGICOP_AND_INV;
+   case GL_NOOP:
+      return LOGICOP_NOOP;
+   case GL_XOR:
+      return LOGICOP_XOR;
+   case GL_OR:
+      return LOGICOP_OR;
+   case GL_OR_INVERTED:
+      return LOGICOP_OR_INV;
+   case GL_NOR:
+      return LOGICOP_NOR;
+   case GL_EQUIV:
+      return LOGICOP_EQUIV;
+   case GL_INVERT:
+      return LOGICOP_INV;
+   case GL_OR_REVERSE:
+      return LOGICOP_OR_RVRSE;
+   case GL_NAND:
+      return LOGICOP_NAND;
+   case GL_SET:
+      return LOGICOP_SET;
+   default:
+      return LOGICOP_SET;  /* unrecognized enum: no diagnostic is printed */
+   }
+}
+
+
+static void
+intelClearColor(GLcontext *ctx, const GLfloat color[4])
+{  /* Cache the clear color on the intel context in packed 8888 and 565 forms. */
+   struct intel_context *intel = intel_context(ctx);
+   GLubyte clear[4];
+
+   CLAMPED_FLOAT_TO_UBYTE(clear[0], color[0]);  /* convert each float channel to a ubyte */
+   CLAMPED_FLOAT_TO_UBYTE(clear[1], color[1]);
+   CLAMPED_FLOAT_TO_UBYTE(clear[2], color[2]);
+   CLAMPED_FLOAT_TO_UBYTE(clear[3], color[3]);
+
+   /* compute both 32 and 16-bit clear values */
+   intel->ClearColor8888 = INTEL_PACKCOLOR8888(clear[0], clear[1],
+                                               clear[2], clear[3]);
+   intel->ClearColor565 = INTEL_PACKCOLOR565(clear[0], clear[1], clear[2]);  /* clear[3] is not used here */
+}
+
+
+/* Fall back to swrast for select and feedback: the condition passed to
+ * FALLBACK for INTEL_FALLBACK_RENDERMODE is (mode != GL_RENDER). */
+static void
+intelRenderMode(GLcontext *ctx, GLenum mode)
+{
+   struct intel_context *intel = intel_context(ctx);
+   FALLBACK(intel, INTEL_FALLBACK_RENDERMODE, (mode != GL_RENDER));
+}
+
+
+void
+intelInitStateFuncs(struct dd_function_table *functions)
+{  /* Install this file's RenderMode and ClearColor callbacks into the table. */
+   functions->RenderMode = intelRenderMode;
+   functions->ClearColor = intelClearColor;
+}
diff --git a/src/mesa/drivers/dri/intel/intel_swapbuffers.c b/src/mesa/drivers/dri/intel/intel_swapbuffers.c
new file mode 100644
index 0000000000..7d035b9f6e
--- /dev/null
+++ b/src/mesa/drivers/dri/intel/intel_swapbuffers.c
@@ -0,0 +1,248 @@
+/**************************************************************************
+ * 
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "intel_blit.h"
+#include "intel_buffers.h"
+#include "intel_swapbuffers.h"
+#include "intel_fbo.h"
+#include "intel_batchbuffer.h"
+#include "drirenderbuffer.h"
+#include "vblank.h"
+#include "i915_drm.h"
+
+
+
+/*
+ * Compute the vblank flags dPriv should use when there are multiple crtcs:
+ * VBLANK_FLAG_SECONDARY is set or cleared; dPriv itself is not modified.
+ */
+GLuint
+intelFixupVblank(struct intel_context *intel, __DRIdrawablePrivate *dPriv)
+{
+   if (!intel->intelScreen->driScrnPriv->dri2.enabled &&
+       intel->intelScreen->driScrnPriv->ddx_version.minor >= 7) {
+      volatile drm_i915_sarea_t *sarea = intel->sarea;
+      drm_clip_rect_t drw_rect = { .x1 = dPriv->x, .x2 = dPriv->x + dPriv->w,
+				   .y1 = dPriv->y, .y2 = dPriv->y + dPriv->h };
+      drm_clip_rect_t planeA_rect = { .x1 = sarea->planeA_x, .y1 = sarea->planeA_y,
+				     .x2 = sarea->planeA_x + sarea->planeA_w,
+				     .y2 = sarea->planeA_y + sarea->planeA_h };
+      drm_clip_rect_t planeB_rect = { .x1 = sarea->planeB_x, .y1 = sarea->planeB_y,
+				     .x2 = sarea->planeB_x + sarea->planeB_w,
+				     .y2 = sarea->planeB_y + sarea->planeB_h };
+      GLint areaA = driIntersectArea( drw_rect, planeA_rect );
+      GLint areaB = driIntersectArea( drw_rect, planeB_rect );
+      GLuint flags = dPriv->vblFlags;
+
+      /* Pick the plane overlapping more of the drawable; a non-zero tie
+       * goes to plane B (secondary). */
+      if (areaB > areaA || (areaA == areaB && areaB > 0)) {
+	 flags = dPriv->vblFlags | VBLANK_FLAG_SECONDARY;
+      } else {
+	 flags = dPriv->vblFlags & ~VBLANK_FLAG_SECONDARY;
+      }
+
+      /* Do the stupid test: a plane with zero width or height counts as
+       * disabled, so the other plane wins regardless of overlap. */
+      if (sarea->planeA_w == 0 || sarea->planeA_h == 0) {
+	 flags = dPriv->vblFlags | VBLANK_FLAG_SECONDARY;
+      } else if (sarea->planeB_w == 0 || sarea->planeB_h == 0) {
+	 flags = dPriv->vblFlags & ~VBLANK_FLAG_SECONDARY;
+      }
+
+      return flags;
+   } else {
+      return dPriv->vblFlags & ~VBLANK_FLAG_SECONDARY;
+   }
+}
+
+
+/**
+ * Called from driSwapBuffers(); waits for vblank before intelCopyBuffer().
+ */
+void
+intelSwapBuffers(__DRIdrawablePrivate * dPriv)
+{
+   __DRIscreenPrivate *psp = dPriv->driScreenPriv;
+
+   if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
+      GET_CURRENT_CONTEXT(ctx);
+      struct intel_context *intel;
+
+      if (ctx == NULL)
+	 return;
+      /* ctx comes from GET_CURRENT_CONTEXT, not from dPriv's context. */
+      intel = intel_context(ctx);
+
+      if (ctx->Visual.doubleBufferMode) {
+	 GLboolean missed_target;
+	 struct intel_framebuffer *intel_fb = dPriv->driverPrivate;
+	 int64_t ust;
+         
+	 _mesa_notifySwapBuffers(ctx);  /* flush pending rendering commands */
+
+	/*
+	 * The old swapping ioctl was incredibly racy, just wait for vblank
+	 * and do the swap ourselves.
+	 */
+	 driWaitForVBlank(dPriv, &missed_target);
+
+	 /*
+	  * Update each buffer's vbl_pending so we don't get too out of
+	  * sync
+	  */
+	 intel_get_renderbuffer(&intel_fb->Base,
+		   		BUFFER_BACK_LEFT)->vbl_pending = dPriv->vblSeq;
+         intel_get_renderbuffer(&intel_fb->Base,
+		   		BUFFER_FRONT_LEFT)->vbl_pending = dPriv->vblSeq;
+
+	 intelCopyBuffer(dPriv, NULL);
+
+	 intel_fb->swap_count++;
+	 (*psp->systemTime->getUST) (&ust);
+	 if (missed_target) {
+	    intel_fb->swap_missed_count++;
+	    intel_fb->swap_missed_ust = ust - intel_fb->swap_ust;
+	 }
+
+	 intel_fb->swap_ust = ust;
+      }
+      drmCommandNone(intel->driFd, DRM_I915_GEM_THROTTLE);
+   }
+   else {
+      /* XXX this shouldn't be an error but we can't handle it for now */
+      fprintf(stderr, "%s: drawable has no context!\n", __FUNCTION__);
+   }
+}
+
+
+/**
+ * Called from driCopySubBuffer(); the box is y-flipped within dPriv->h.
+ */
+void
+intelCopySubBuffer(__DRIdrawablePrivate * dPriv, int x, int y, int w, int h)
+{
+   if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
+      struct intel_context *intel =
+         (struct intel_context *) dPriv->driContextPriv->driverPrivate;
+      GLcontext *ctx = &intel->ctx;
+
+      if (ctx->Visual.doubleBufferMode) {
+         drm_clip_rect_t rect;
+         rect.x1 = x + dPriv->x;
+         rect.y1 = (dPriv->h - y - h) + dPriv->y;
+         rect.x2 = rect.x1 + w;
+         rect.y2 = rect.y1 + h;
+         _mesa_notifySwapBuffers(ctx);  /* flush pending rendering commands */
+         intelCopyBuffer(dPriv, &rect);
+      }
+   }
+   else {
+      /* XXX this shouldn't be an error but we can't handle it for now */
+      fprintf(stderr, "%s: drawable has no context!\n", __FUNCTION__);
+   }
+}
+
+
+/**
+ * This will be called whenever the currently bound window is moved/resized.
+ * XXX: actually, it seems to NOT be called when the window is only moved (BP).
+ */
+void
+intelWindowMoved(struct intel_context *intel)
+{
+   GLcontext *ctx = &intel->ctx;
+   __DRIdrawablePrivate *dPriv = intel->driDrawable;
+   struct intel_framebuffer *intel_fb = dPriv->driverPrivate;
+
+   if (!intel->intelScreen->driScrnPriv->dri2.enabled &&
+       intel->intelScreen->driScrnPriv->ddx_version.minor >= 7) {
+      GLuint flags = intelFixupVblank(intel, dPriv);
+
+      /* Check to see if we changed pipes */
+      if (flags != dPriv->vblFlags && dPriv->vblFlags &&
+	  !(dPriv->vblFlags & VBLANK_FLAG_NO_IRQ)) {
+	 int64_t count;
+	 drmVBlank vbl;
+	 int i;
+
+	 /* Deal with page flipping: wait on the old pipe (dPriv->vblFlags
+	  * is only updated further down) for each color buffer that the
+	  * vbl_waited/vbl_pending test below does not skip. */
+	 vbl.request.type = DRM_VBLANK_ABSOLUTE;
+
+	 if ( dPriv->vblFlags & VBLANK_FLAG_SECONDARY ) {
+	    vbl.request.type |= DRM_VBLANK_SECONDARY;
+	 }
+
+	 for (i = 0; i < 2; i++) {
+	    if (!intel_fb->color_rb[i] ||
+		(intel_fb->vbl_waited - intel_fb->color_rb[i]->vbl_pending) <=
+		(1<<23))
+	       continue;
+
+	    vbl.request.sequence = intel_fb->color_rb[i]->vbl_pending;
+	    drmWaitVBlank(intel->driFd, &vbl);
+	 }
+
+	 /*
+	  * Update msc_base from old pipe
+	  */
+	 driDrawableGetMSC32(dPriv->driScreenPriv, dPriv, &count);
+	 dPriv->msc_base = count;
+	 /*
+	  * Then get new vblank_base and vblSeq values
+	  */
+	 dPriv->vblFlags = flags;
+	 driGetCurrentVBlank(dPriv);
+	 dPriv->vblank_base = dPriv->vblSeq;
+
+	 intel_fb->vbl_waited = dPriv->vblSeq;
+
+	 for (i = 0; i < 2; i++) {
+	    if (intel_fb->color_rb[i])
+	       intel_fb->color_rb[i]->vbl_pending = intel_fb->vbl_waited;
+	 }
+      }
+   } else {
+      dPriv->vblFlags &= ~VBLANK_FLAG_SECONDARY;
+   }
+
+   /* Update Mesa's notion of window size */
+   driUpdateFramebufferSize(ctx, dPriv);
+   intel_fb->Base.Initialized = GL_TRUE; /* XXX remove someday */
+
+   /* Update hardware scissor */
+   if (ctx->Driver.Scissor != NULL) {
+      ctx->Driver.Scissor(ctx, ctx->Scissor.X, ctx->Scissor.Y,
+			  ctx->Scissor.Width, ctx->Scissor.Height);
+   }
+
+   /* Re-calculate viewport related state */
+   if (ctx->Driver.DepthRange != NULL)
+      ctx->Driver.DepthRange( ctx, ctx->Viewport.Near, ctx->Viewport.Far );
+}
diff --git a/src/mesa/drivers/dri/intel/intel_swapbuffers.h b/src/mesa/drivers/dri/intel/intel_swapbuffers.h
new file mode 100644
index 0000000000..75bb6242ff
--- /dev/null
+++ b/src/mesa/drivers/dri/intel/intel_swapbuffers.h
@@ -0,0 +1,52 @@
+
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef INTEL_SWAPBUFFERS_H
+#define INTEL_SWAPBUFFERS_H
+
+#include "dri_util.h"
+#include "drm.h"
+
+struct intel_context;
+struct intel_framebuffer;
+
+
+extern void
+intelSwapBuffers(__DRIdrawablePrivate * dPriv);
+
+extern void
+intelCopySubBuffer(__DRIdrawablePrivate * dPriv, int x, int y, int w, int h);
+
+extern GLuint
+intelFixupVblank(struct intel_context *intel, __DRIdrawablePrivate *dPriv);
+
+extern void
+intelWindowMoved(struct intel_context *intel);
+
+
+#endif /* INTEL_SWAPBUFFERS_H */
diff --git a/src/mesa/drivers/dri/intel/intel_syncobj.c b/src/mesa/drivers/dri/intel/intel_syncobj.c
new file mode 100644
index 0000000000..b6ea56d547
--- /dev/null
+++ b/src/mesa/drivers/dri/intel/intel_syncobj.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright © 2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+/** @file intel_syncobj.c
+ *
+ * Support for ARB_sync
+ *
+ * ARB_sync is implemented by flushing the current batchbuffer and keeping a
+ * reference on it.  We can then check for completion or wait for completion
+ * using the normal buffer object mechanisms.  This does mean that if an
+ * application is using many sync objects, it will emit small batchbuffers
+ * which may end up being a significant overhead.  In other tests of removing
+ * gratuitous batchbuffer syncs in Mesa, it hasn't appeared to be a significant
+ * performance bottleneck, though.
+ */
+
+#include "main/simple_list.h"
+#include "main/imports.h"
+
+#include "intel_context.h"
+#include "intel_batchbuffer.h"
+#include "intel_reg.h"
+
+#if FEATURE_ARB_sync
+
+static struct gl_sync_object *
+intel_new_sync_object(GLcontext *ctx, GLuint id)
+{
+   /* NewSyncObject hook: returns a zeroed object, or NULL on failure. */
+   struct intel_sync_object *sync;
+
+   sync = _mesa_calloc(sizeof(struct intel_sync_object));
+   return sync ? &sync->Base : NULL;   /* never offset a NULL pointer */
+}
+
+static void
+intel_delete_sync_object(GLcontext *ctx, struct gl_sync_object *s)
+{
+   struct intel_sync_object *sync = (struct intel_sync_object *)s;
+   /* sync->bo may be NULL by now; assumes unreference tolerates that. */
+   drm_intel_bo_unreference(sync->bo);
+   _mesa_free(sync);
+}
+
+static void
+intel_fence_sync(GLcontext *ctx, struct gl_sync_object *s,
+	       GLenum condition, GLbitfield flags)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_sync_object *sync = (struct intel_sync_object *)s;
+   /* FenceSync hook; only GL_SYNC_GPU_COMMANDS_COMPLETE is expected. */
+   assert(condition == GL_SYNC_GPU_COMMANDS_COMPLETE);
+   intel_batchbuffer_emit_mi_flush(intel->batch);
+   /* Keep a reference on the current batch buffer, then flush it. */
+   sync->bo = intel->batch->buf;
+   drm_intel_bo_reference(sync->bo);
+
+   intelFlush(ctx);
+}
+
+/* We ignore the user-supplied timeout.  This is weaselly -- we're allowed to
+ * round to an implementation-dependent accuracy, and right now our
+ * implementation "rounds" to the wait-forever value.
+ *
+ * The fix would be a new kernel function to do the GTT transition with a
+ * timeout.
+ */
+static void intel_client_wait_sync(GLcontext *ctx, struct gl_sync_object *s,
+				 GLbitfield flags, GLuint64 timeout)
+{
+   struct intel_sync_object *sync = (struct intel_sync_object *)s;
+   /* bo is cleared once signalled, so repeat waits return immediately. */
+   if (sync->bo) {
+      drm_intel_bo_wait_rendering(sync->bo);
+      s->StatusFlag = 1;
+      drm_intel_bo_unreference(sync->bo);
+      sync->bo = NULL;
+   }
+}
+
+/* We have nothing to do for WaitSync.  Our GL command stream is sequential,
+ * so given that the sync object has already flushed the batchbuffer,
+ * any batchbuffers coming after this WaitSync will naturally not execute
+ * until the previous one is done.  All arguments are therefore unused.
+ */
+static void intel_server_wait_sync(GLcontext *ctx, struct gl_sync_object *s,
+				 GLbitfield flags, GLuint64 timeout)
+{
+}
+
+static void intel_check_sync(GLcontext *ctx, struct gl_sync_object *s)
+{
+   struct intel_sync_object *sync = (struct intel_sync_object *)s;
+   /* Non-blocking poll: mark signalled only once the fenced bo is idle. */
+   if (sync->bo && !drm_intel_bo_busy(sync->bo)) {
+      drm_intel_bo_unreference(sync->bo);
+      sync->bo = NULL;
+      s->StatusFlag = 1;
+   }
+}
+/* Install the ARB_sync driver hooks implemented in this file. */
+void intel_init_syncobj_functions(struct dd_function_table *functions)
+{
+   functions->NewSyncObject = intel_new_sync_object;
+   functions->DeleteSyncObject = intel_delete_sync_object;
+   functions->FenceSync = intel_fence_sync;
+   functions->CheckSync = intel_check_sync;
+   functions->ClientWaitSync = intel_client_wait_sync;
+   functions->ServerWaitSync = intel_server_wait_sync;
+}
+
+#else /* FEATURE_ARB_sync */
+/* ARB_sync is compiled out: there are no hooks to install. */
+void intel_init_syncobj_functions(struct dd_function_table *functions)
+{
+}
+
+#endif
diff --git a/src/mesa/drivers/dri/intel/intel_tex.c b/src/mesa/drivers/dri/intel/intel_tex.c
index e64d8a1556..df63f29a42 100644
--- a/src/mesa/drivers/dri/intel/intel_tex.c
+++ b/src/mesa/drivers/dri/intel/intel_tex.c
@@ -93,7 +93,7 @@ intelFreeTextureImageData(GLcontext * ctx, struct gl_texture_image *texImage)
 static void *
 do_memcpy(void *dest, const void *src, size_t n)
 {
-   if ((((unsigned) src) & 63) || (((unsigned) dest) & 63)) {
+   if ((((unsigned long) src) & 63) || (((unsigned long) dest) & 63)) {
       return __memcpy(dest, src, n);
    }
    else
@@ -158,81 +158,11 @@ timed_memcpy(void *dest, const void *src, size_t n)
 }
 #endif /* DO_DEBUG */
 
-/**
- * Generate new mipmap data from BASE+1 to BASE+p (the minimally-sized mipmap
- * level).
- *
- * The texture object's miptree must be mapped.
- *
- * It would be really nice if this was just called by Mesa whenever mipmaps
- * needed to be regenerated, rather than us having to remember to do so in
- * each texture image modification path.
- *
- * This function should also include an accelerated path.
- */
-void
-intel_generate_mipmap(GLcontext *ctx, GLenum target,
-                      struct gl_texture_object *texObj)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct intel_texture_object *intelObj = intel_texture_object(texObj);
-   GLuint nr_faces = (intelObj->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
-   int face, i;
-
-   _mesa_generate_mipmap(ctx, target, texObj);
-
-   /* Update the level information in our private data in the new images, since
-    * it didn't get set as part of a normal TexImage path.
-    */
-   for (face = 0; face < nr_faces; face++) {
-      for (i = texObj->BaseLevel + 1; i < texObj->MaxLevel; i++) {
-         struct intel_texture_image *intelImage;
-
-	 intelImage = intel_texture_image(texObj->Image[face][i]);
-	 if (intelImage == NULL)
-	    break;
-
-	 intelImage->level = i;
-	 intelImage->face = face;
-	 /* Unreference the miptree to signal that the new Data is a bare
-	  * pointer from mesa.
-	  */
-	 intel_miptree_release(intel, &intelImage->mt);
-      }
-   }
-}
-
-static void intelGenerateMipmap(GLcontext *ctx, GLenum target, struct gl_texture_object *texObj)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct intel_texture_object *intelObj = intel_texture_object(texObj);
-
-   intel_tex_map_level_images(intel, intelObj, texObj->BaseLevel);
-   intel_generate_mipmap(ctx, target, texObj);
-   intel_tex_unmap_level_images(intel, intelObj, texObj->BaseLevel);
-}
-
 void
 intelInitTextureFuncs(struct dd_function_table *functions)
 {
    functions->ChooseTextureFormat = intelChooseTextureFormat;
-   functions->TexImage1D = intelTexImage1D;
-   functions->TexImage2D = intelTexImage2D;
-   functions->TexImage3D = intelTexImage3D;
-   functions->TexSubImage1D = intelTexSubImage1D;
-   functions->TexSubImage2D = intelTexSubImage2D;
-   functions->TexSubImage3D = intelTexSubImage3D;
-   functions->CopyTexImage1D = intelCopyTexImage1D;
-   functions->CopyTexImage2D = intelCopyTexImage2D;
-   functions->CopyTexSubImage1D = intelCopyTexSubImage1D;
-   functions->CopyTexSubImage2D = intelCopyTexSubImage2D;
-   functions->GetTexImage = intelGetTexImage;
-   functions->GenerateMipmap = intelGenerateMipmap;
-
-   /* compressed texture functions */
-   functions->CompressedTexImage2D = intelCompressedTexImage2D;
-   functions->CompressedTexSubImage2D = intelCompressedTexSubImage2D;
-   functions->GetCompressedTexImage = intelGetCompressedTexImage;
+   functions->GenerateMipmap = intel_generate_mipmap;
 
    functions->NewTextureObject = intelNewTextureObject;
    functions->NewTextureImage = intelNewTextureImage;
diff --git a/src/mesa/drivers/dri/intel/intel_tex.h b/src/mesa/drivers/dri/intel/intel_tex.h
index 742ccc043a..471aa2a240 100644
--- a/src/mesa/drivers/dri/intel/intel_tex.h
+++ b/src/mesa/drivers/dri/intel/intel_tex.h
@@ -35,120 +35,23 @@
 
 void intelInitTextureFuncs(struct dd_function_table *functions);
 
+void intelInitTextureImageFuncs(struct dd_function_table *functions);
+
+void intelInitTextureSubImageFuncs(struct dd_function_table *functions);
+
+void intelInitTextureCopyImageFuncs(struct dd_function_table *functions);
+
 const struct gl_texture_format *intelChooseTextureFormat(GLcontext * ctx,
                                                          GLint internalFormat,
                                                          GLenum format,
                                                          GLenum type);
 
-
-void intelTexImage3D(GLcontext * ctx,
-                     GLenum target, GLint level,
-                     GLint internalFormat,
-                     GLint width, GLint height, GLint depth,
-                     GLint border,
-                     GLenum format, GLenum type, const void *pixels,
-                     const struct gl_pixelstore_attrib *packing,
-                     struct gl_texture_object *texObj,
-                     struct gl_texture_image *texImage);
-
-void intelTexSubImage3D(GLcontext * ctx,
-                        GLenum target,
-                        GLint level,
-                        GLint xoffset, GLint yoffset, GLint zoffset,
-                        GLsizei width, GLsizei height, GLsizei depth,
-                        GLenum format, GLenum type,
-                        const GLvoid * pixels,
-                        const struct gl_pixelstore_attrib *packing,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage);
-
-void intelTexImage2D(GLcontext * ctx,
-                     GLenum target, GLint level,
-                     GLint internalFormat,
-                     GLint width, GLint height, GLint border,
-                     GLenum format, GLenum type, const void *pixels,
-                     const struct gl_pixelstore_attrib *packing,
-                     struct gl_texture_object *texObj,
-                     struct gl_texture_image *texImage);
-
-void intelTexSubImage2D(GLcontext * ctx,
-                        GLenum target,
-                        GLint level,
-                        GLint xoffset, GLint yoffset,
-                        GLsizei width, GLsizei height,
-                        GLenum format, GLenum type,
-                        const GLvoid * pixels,
-                        const struct gl_pixelstore_attrib *packing,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage);
-
-void intelTexImage1D(GLcontext * ctx,
-                     GLenum target, GLint level,
-                     GLint internalFormat,
-                     GLint width, GLint border,
-                     GLenum format, GLenum type, const void *pixels,
-                     const struct gl_pixelstore_attrib *packing,
-                     struct gl_texture_object *texObj,
-                     struct gl_texture_image *texImage);
-
-void intelTexSubImage1D(GLcontext * ctx,
-                        GLenum target,
-                        GLint level,
-                        GLint xoffset,
-                        GLsizei width,
-                        GLenum format, GLenum type,
-                        const GLvoid * pixels,
-                        const struct gl_pixelstore_attrib *packing,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage);
-
-void intelCopyTexImage1D(GLcontext * ctx, GLenum target, GLint level,
-                         GLenum internalFormat,
-                         GLint x, GLint y, GLsizei width, GLint border);
-
-void intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
-                         GLenum internalFormat,
-                         GLint x, GLint y, GLsizei width, GLsizei height,
-                         GLint border);
-
-void intelCopyTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
-                            GLint xoffset, GLint x, GLint y, GLsizei width);
-
-void intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
-                            GLint xoffset, GLint yoffset,
-                            GLint x, GLint y, GLsizei width, GLsizei height);
-
-void intelGetTexImage(GLcontext * ctx, GLenum target, GLint level,
-                      GLenum format, GLenum type, GLvoid * pixels,
-                      struct gl_texture_object *texObj,
-                      struct gl_texture_image *texImage);
-
-void intelCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
-				GLint internalFormat,
-				GLint width, GLint height, GLint border,
-				GLsizei imageSize, const GLvoid *data,
-				struct gl_texture_object *texObj,
-				struct gl_texture_image *texImage );
-
-void intelCompressedTexSubImage2D(GLcontext * ctx,
-				  GLenum target,
-				  GLint level,
-				  GLint xoffset, GLint yoffset,
-				  GLsizei width, GLsizei height,
-				  GLenum format, GLsizei imageSize,
-				  const GLvoid * pixels,
-				  struct gl_texture_object *texObj,
-				  struct gl_texture_image *texImage);
-
-void intelGetCompressedTexImage(GLcontext *ctx, GLenum target, GLint level,
-				GLvoid *pixels,
-				struct gl_texture_object *texObj,
-				struct gl_texture_image *texImage);
-
 void intelSetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 		       unsigned long long offset, GLint depth, GLuint pitch);
 void intelSetTexBuffer(__DRIcontext *pDRICtx,
 		       GLint target, __DRIdrawable *pDraw);
+void intelSetTexBuffer2(__DRIcontext *pDRICtx,
+			GLint target, GLint format, __DRIdrawable *pDraw);
 
 GLuint intel_finalize_mipmap_tree(struct intel_context *intel, GLuint unit);
 
diff --git a/src/mesa/drivers/dri/intel/intel_tex_copy.c b/src/mesa/drivers/dri/intel/intel_tex_copy.c
index 08437aa0e2..028b49c14d 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_copy.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_copy.c
@@ -73,11 +73,8 @@ get_teximage_source(struct intel_context *intel, GLenum internalFormat)
       return NULL;
    case GL_RGBA:
    case GL_RGBA8:
-      return intel_readbuf_region(intel);
    case GL_RGB:
-      if (intel->ctx.Visual.rgbBits == 16)
-         return intel_readbuf_region(intel);
-      return NULL;
+      return intel_readbuf_region(intel);
    default:
       return NULL;
    }
@@ -99,14 +96,24 @@ do_copy_texsubimage(struct intel_context *intel,
 
    if (!intelImage->mt || !src) {
       if (INTEL_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr, "%s fail %p %p\n",
-		 __FUNCTION__, intelImage->mt, src);
+	 fprintf(stderr, "%s fail %p %p (0x%08x)\n",
+		 __FUNCTION__, intelImage->mt, src, internalFormat);
+      return GL_FALSE;
+   }
+
+   if (intelImage->mt->cpp != src->cpp) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "%s fail %d vs %d cpp\n",
+		 __FUNCTION__, intelImage->mt->cpp, src->cpp);
       return GL_FALSE;
    }
 
    intelFlush(ctx);
    LOCK_HARDWARE(intel);
    {
+      drm_intel_bo *dst_bo = intel_region_buffer(intel,
+						 intelImage->mt->region,
+						 INTEL_WRITE_PART);
       GLuint image_offset = intel_miptree_image_offset(intelImage->mt,
                                                        intelImage->face,
                                                        intelImage->level);
@@ -118,8 +125,12 @@ do_copy_texsubimage(struct intel_context *intel,
       dstx += x - orig_x;
       dsty += y - orig_y;
 
-      /* image_offset may be non-page-aligned, but that's illegal for tiling. */
-      assert(intelImage->mt->region->tiling == I915_TILING_NONE);
+      /* Can't blit to tiled buffers with non-tile-aligned offset. */
+      if (intelImage->mt->region->tiling != I915_TILING_NONE &&
+	  (image_offset & 4095) != 0) {
+	 UNLOCK_HARDWARE(intel);
+	 return GL_FALSE;
+      }
 
       if (ctx->ReadBuffer->Name == 0) {
 	 /* reading from a window, adjust x, y */
@@ -140,35 +151,35 @@ do_copy_texsubimage(struct intel_context *intel,
 	 src_pitch = src->pitch;
       }
 
-      intelEmitCopyBlit(intel,
-			intelImage->mt->cpp,
-			src_pitch,
-			src->buffer,
-			0,
-			src->tiling,
-			intelImage->mt->pitch,
-			intelImage->mt->region->buffer,
-			image_offset,
-			intelImage->mt->region->tiling,
-			x, y, dstx, dsty, width, height,
-			GL_COPY);
+      if (!intelEmitCopyBlit(intel,
+			     intelImage->mt->cpp,
+			     src_pitch,
+			     src->buffer,
+			     0,
+			     src->tiling,
+			     intelImage->mt->pitch,
+			     dst_bo,
+			     image_offset,
+			     intelImage->mt->region->tiling,
+			     x, y, dstx, dsty, width, height,
+			     GL_COPY)) {
+	 UNLOCK_HARDWARE(intel);
+	 return GL_FALSE;
+      }
    }
 
    UNLOCK_HARDWARE(intel);
 
    /* GL_SGIS_generate_mipmap */
    if (intelImage->level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target, texObj);
+      intel_generate_mipmap(ctx, target, texObj);
    }
 
    return GL_TRUE;
 }
 
 
-
-
-
-void
+static void
 intelCopyTexImage1D(GLcontext * ctx, GLenum target, GLint level,
                     GLenum internalFormat,
                     GLint x, GLint y, GLsizei width, GLint border)
@@ -214,7 +225,8 @@ intelCopyTexImage1D(GLcontext * ctx, GLenum target, GLint level,
                            width, border);
 }
 
-void
+
+static void
 intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
                     GLenum internalFormat,
                     GLint x, GLint y, GLsizei width, GLsizei height,
@@ -231,6 +243,14 @@ intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
    if (border)
       goto fail;
 
+   /* Setup or redefine the texture object, mipmap tree and texture
+    * image.  Don't populate yet.
+    */
+   ctx->Driver.TexImage2D(ctx, target, level, internalFormat,
+                          width, height, border,
+                          GL_RGBA, CHAN_TYPE, NULL,
+                          &ctx->DefaultPacking, texObj, texImage);
+
    srcx = x;
    srcy = y;
    dstx = 0;
@@ -241,15 +261,6 @@ intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
 				   &width, &height))
       return;
 
-   /* Setup or redefine the texture object, mipmap tree and texture
-    * image.  Don't populate yet.  
-    */
-   ctx->Driver.TexImage2D(ctx, target, level, internalFormat,
-                          width, height, border,
-                          GL_RGBA, CHAN_TYPE, NULL,
-                          &ctx->DefaultPacking, texObj, texImage);
-
-
    if (!do_copy_texsubimage(intel_context(ctx), target,
                             intel_texture_image(texImage),
                             internalFormat, 0, 0, x, y, width, height))
@@ -263,7 +274,7 @@ intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
 }
 
 
-void
+static void
 intelCopyTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
                        GLint xoffset, GLint x, GLint y, GLsizei width)
 {
@@ -288,8 +299,7 @@ intelCopyTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
 }
 
 
-
-void
+static void
 intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
                        GLint xoffset, GLint yoffset,
                        GLint x, GLint y, GLsizei width, GLsizei height)
@@ -302,7 +312,6 @@ intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
       _mesa_select_tex_image(ctx, texObj, target, level);
    GLenum internalFormat = texImage->InternalFormat;
 
-
    /* Need to check texture is compatible with source format. 
     */
 
@@ -317,3 +326,13 @@ intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
                                  xoffset, yoffset, x, y, width, height);
    }
 }
+
+
+void
+intelInitTextureCopyImageFuncs(struct dd_function_table *functions)
+{
+   functions->CopyTexImage1D = intelCopyTexImage1D;
+   functions->CopyTexImage2D = intelCopyTexImage2D;
+   functions->CopyTexSubImage1D = intelCopyTexSubImage1D;
+   functions->CopyTexSubImage2D = intelCopyTexSubImage2D;
+}
diff --git a/src/mesa/drivers/dri/intel/intel_tex_format.c b/src/mesa/drivers/dri/intel/intel_tex_format.c
index 5e418ac446..3322a71130 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_format.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_format.c
@@ -1,13 +1,18 @@
 #include "intel_context.h"
 #include "intel_tex.h"
+#include "intel_chipset.h"
 #include "main/texformat.h"
 #include "main/enums.h"
 
-/* It works out that this function is fine for all the supported
+
+/**
+ * Choose hardware texture format given the user's glTexImage parameters.
+ *
+ * It works out that this function is fine for all the supported
  * hardware.  However, there is still a need to map the formats onto
  * hardware descriptors.
- */
-/* Note that the i915 can actually support many more formats than
+ *
+ * Note that the i915 can actually support many more formats than
  * these if we take the step of simply swizzling the colors
  * immediately after sampling...
  */
@@ -16,7 +21,12 @@ intelChooseTextureFormat(GLcontext * ctx, GLint internalFormat,
                          GLenum format, GLenum type)
 {
    struct intel_context *intel = intel_context(ctx);
-   const GLboolean do32bpt = (intel->ctx.Visual.rgbBits == 32);
+   const GLboolean do32bpt = (intel->ctx.Visual.rgbBits >= 24);
+
+#if 0
+   printf("%s intFmt=0x%x format=0x%x type=0x%x\n",
+          __FUNCTION__, internalFormat, format, type);
+#endif
 
    switch (internalFormat) {
    case 4:
@@ -151,20 +161,36 @@ intelChooseTextureFormat(GLcontext * ctx, GLint internalFormat,
    case GL_SRGB8_EXT:
    case GL_SRGB_ALPHA_EXT:
    case GL_SRGB8_ALPHA8_EXT:
-   case GL_SLUMINANCE_EXT:
-   case GL_SLUMINANCE8_EXT:
-   case GL_SLUMINANCE_ALPHA_EXT:
-   case GL_SLUMINANCE8_ALPHA8_EXT:
    case GL_COMPRESSED_SRGB_EXT:
    case GL_COMPRESSED_SRGB_ALPHA_EXT:
    case GL_COMPRESSED_SLUMINANCE_EXT:
    case GL_COMPRESSED_SLUMINANCE_ALPHA_EXT:
-       return &_mesa_texformat_srgba8;
+      return &_mesa_texformat_sargb8;
+   case GL_SLUMINANCE_EXT:
+   case GL_SLUMINANCE8_EXT:
+      if (IS_G4X(intel->intelScreen->deviceID))
+         return &_mesa_texformat_sl8;
+      else
+         return &_mesa_texformat_sargb8;
+   case GL_SLUMINANCE_ALPHA_EXT:
+   case GL_SLUMINANCE8_ALPHA8_EXT:
+      if (IS_G4X(intel->intelScreen->deviceID))
+         return &_mesa_texformat_sla8;
+      else
+         return &_mesa_texformat_sargb8;
    case GL_COMPRESSED_SRGB_S3TC_DXT1_EXT:
    case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT:
    case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT:
    case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT:
       return &_mesa_texformat_srgb_dxt1;
+
+   /* i915 could also do this */
+   case GL_DUDV_ATI:
+   case GL_DU8DV8_ATI:
+      return &_mesa_texformat_dudv8;
+   case GL_RGBA_SNORM:
+   case GL_RGBA8_SNORM:
+      return &_mesa_texformat_signed_rgba8888_rev;
 #endif
 
    default:
diff --git a/src/mesa/drivers/dri/intel/intel_tex_image.c b/src/mesa/drivers/dri/intel/intel_tex_image.c
index 2ac7dceb0f..a206fe6805 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_image.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_image.c
@@ -1,17 +1,14 @@
 
-#include <stdlib.h>
-#include <stdio.h>
-
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "main/mtypes.h"
 #include "main/enums.h"
-#include "main/colortab.h"
+#include "main/bufferobj.h"
 #include "main/convolve.h"
 #include "main/context.h"
-#include "main/simple_list.h"
 #include "main/texcompress.h"
 #include "main/texformat.h"
+#include "main/texgetimage.h"
 #include "main/texobj.h"
 #include "main/texstore.h"
 #include "main/teximage.h"
@@ -62,7 +59,8 @@ logbase2(int n)
 static void
 guess_and_alloc_mipmap_tree(struct intel_context *intel,
                             struct intel_texture_object *intelObj,
-                            struct intel_texture_image *intelImage)
+                            struct intel_texture_image *intelImage,
+			    GLboolean expect_accelerated_upload)
 {
    GLuint firstLevel;
    GLuint lastLevel;
@@ -129,6 +127,7 @@ guess_and_alloc_mipmap_tree(struct intel_context *intel,
       comp_byte = intel_compressed_num_bytes(intelImage->base.TexFormat->MesaFormat);
    intelObj->mt = intel_miptree_create(intel,
                                        intelObj->base.Target,
+                                       intelImage->base._BaseFormat,
                                        intelImage->base.InternalFormat,
                                        firstLevel,
                                        lastLevel,
@@ -136,7 +135,8 @@ guess_and_alloc_mipmap_tree(struct intel_context *intel,
                                        height,
                                        depth,
                                        intelImage->base.TexFormat->TexelBytes,
-                                       comp_byte);
+                                       comp_byte,
+				       expect_accelerated_upload);
 
    DBG("%s - success\n", __FUNCTION__);
 }
@@ -202,14 +202,15 @@ try_pbo_upload(struct intel_context *intel,
    GLuint src_offset, src_stride;
    GLuint dst_offset, dst_stride;
 
-   if (!pbo ||
+   if (!_mesa_is_bufferobj(unpack->BufferObj) ||
        intel->ctx._ImageTransferState ||
        unpack->SkipPixels || unpack->SkipRows) {
-      _mesa_printf("%s: failure 1\n", __FUNCTION__);
+      DBG("%s: failure 1\n", __FUNCTION__);
       return GL_FALSE;
    }
 
-   src_offset = (GLuint) pixels;
+   /* note: potential 64-bit ptr to 32-bit int cast */
+   src_offset = (GLuint) (unsigned long) pixels;
 
    if (unpack->RowLength > 0)
       src_stride = unpack->RowLength;
@@ -231,12 +232,15 @@ try_pbo_upload(struct intel_context *intel,
 					       INTEL_WRITE_FULL);
 
 
-      intelEmitCopyBlit(intel,
-                        intelImage->mt->cpp,
-                        src_stride, src_buffer, src_offset, GL_FALSE,
-                        dst_stride, dst_buffer, dst_offset, GL_FALSE,
-                        0, 0, 0, 0, width, height,
-			GL_COPY);
+      if (!intelEmitCopyBlit(intel,
+			     intelImage->mt->cpp,
+			     src_stride, src_buffer, src_offset, GL_FALSE,
+			     dst_stride, dst_buffer, dst_offset, GL_FALSE,
+			     0, 0, 0, 0, width, height,
+			     GL_COPY)) {
+	 UNLOCK_HARDWARE(intel);
+	 return GL_FALSE;
+      }
    }
    UNLOCK_HARDWARE(intel);
 
@@ -244,7 +248,6 @@ try_pbo_upload(struct intel_context *intel,
 }
 
 
-
 static GLboolean
 try_pbo_zcopy(struct intel_context *intel,
               struct intel_texture_image *intelImage,
@@ -257,14 +260,15 @@ try_pbo_zcopy(struct intel_context *intel,
    GLuint src_offset, src_stride;
    GLuint dst_offset, dst_stride;
 
-   if (!pbo ||
+   if (!_mesa_is_bufferobj(unpack->BufferObj) ||
        intel->ctx._ImageTransferState ||
        unpack->SkipPixels || unpack->SkipRows) {
-      _mesa_printf("%s: failure 1\n", __FUNCTION__);
+      DBG("%s: failure 1\n", __FUNCTION__);
       return GL_FALSE;
    }
 
-   src_offset = (GLuint) pixels;
+   /* note: potential 64-bit ptr to 32-bit int cast */
+   src_offset = (GLuint) (unsigned long) pixels;
 
    if (unpack->RowLength > 0)
       src_stride = unpack->RowLength;
@@ -278,7 +282,7 @@ try_pbo_zcopy(struct intel_context *intel,
    dst_stride = intelImage->mt->pitch;
 
    if (src_stride != dst_stride || dst_offset != 0 || src_offset != 0) {
-      _mesa_printf("%s: failure 2\n", __FUNCTION__);
+      DBG("%s: failure 2\n", __FUNCTION__);
       return GL_FALSE;
    }
 
@@ -288,10 +292,6 @@ try_pbo_zcopy(struct intel_context *intel,
 }
 
 
-
-
-
-
 static void
 intelTexImage(GLcontext * ctx,
               GLint dims,
@@ -302,7 +302,8 @@ intelTexImage(GLcontext * ctx,
               GLenum format, GLenum type, const void *pixels,
               const struct gl_pixelstore_attrib *unpack,
               struct gl_texture_object *texObj,
-              struct gl_texture_image *texImage, GLsizei imageSize, int compressed)
+              struct gl_texture_image *texImage, GLsizei imageSize,
+              GLboolean compressed)
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_texture_object *intelObj = intel_texture_object(texObj);
@@ -310,8 +311,7 @@ intelTexImage(GLcontext * ctx,
    GLint postConvWidth = width;
    GLint postConvHeight = height;
    GLint texelBytes, sizeInBytes;
-   GLuint dstRowStride, srcRowStride = texImage->RowStride;
-
+   GLuint dstRowStride = 0, srcRowStride = texImage->RowStride;
 
    DBG("%s target %s level %d %dx%dx%d border %d\n", __FUNCTION__,
        _mesa_lookup_enum_by_nr(target), level, width, height, depth, border);
@@ -383,7 +383,7 @@ intelTexImage(GLcontext * ctx,
    }
 
    if (!intelObj->mt) {
-      guess_and_alloc_mipmap_tree(intel, intelObj, intelImage);
+      guess_and_alloc_mipmap_tree(intel, intelObj, intelImage, pixels == NULL);
       if (!intelObj->mt) {
 	 DBG("guess_and_alloc_mipmap_tree: failed\n");
       }
@@ -409,11 +409,13 @@ intelTexImage(GLcontext * ctx,
        * a miptree, so create one just for our level and store it in the image.
        * It'll get moved into the object miptree at validate time.
        */
-      intelImage->mt = intel_miptree_create(intel, target, internalFormat,
+      intelImage->mt = intel_miptree_create(intel, target,
+					    intelImage->base.TexFormat->BaseFormat,
+					    internalFormat,
 					    level, level,
 					    width, height, depth,
 					    intelImage->base.TexFormat->TexelBytes,
-					    comp_byte);
+					    comp_byte, pixels == NULL);
 
    }
 
@@ -421,7 +423,7 @@ intelTexImage(GLcontext * ctx,
     */
    if (dims <= 2 &&
        intelImage->mt &&
-       intel_buffer_object(unpack->BufferObj) &&
+       _mesa_is_bufferobj(unpack->BufferObj) &&
        check_pbo_format(internalFormat, format,
                         type, intelImage->base.TexFormat)) {
 
@@ -459,8 +461,6 @@ intelTexImage(GLcontext * ctx,
       DBG("pbo upload failed\n");
    }
 
-
-
    /* intelCopyTexImage calls this function with pixels == NULL, with
     * the expectation that the mipmap tree will be set up but nothing
     * more will be done.  This is where those calls return:
@@ -478,12 +478,13 @@ intelTexImage(GLcontext * ctx,
    LOCK_HARDWARE(intel);
 
    if (intelImage->mt) {
-      texImage->Data = intel_miptree_image_map(intel,
-                                               intelImage->mt,
-                                               intelImage->face,
-                                               intelImage->level,
-                                               &dstRowStride,
-                                               intelImage->base.ImageOffsets);
+      if (pixels != NULL)
+         texImage->Data = intel_miptree_image_map(intel,
+                                                  intelImage->mt,
+                                                  intelImage->face,
+                                                  intelImage->level,
+                                                  &dstRowStride,
+                                                  intelImage->base.ImageOffsets);
       texImage->RowStride = dstRowStride / intelImage->mt->cpp;
    }
    else {
@@ -503,8 +504,9 @@ intelTexImage(GLcontext * ctx,
    }
 
    DBG("Upload image %dx%dx%d row_len %d "
-       "pitch %d\n",
-       width, height, depth, width * texelBytes, dstRowStride);
+       "pitch %d pixels %d compressed %d\n",
+       width, height, depth, width * texelBytes, dstRowStride,
+       pixels ? 1 : 0, compressed);
 
    /* Copy data.  Would like to know when it's ok for us to eg. use
     * the blitter to copy.  Or, use the hardware to do the format
@@ -517,7 +519,7 @@ intelTexImage(GLcontext * ctx,
 	       _mesa_copy_rect(texImage->Data, dst->cpp, dst->pitch,
 			       0, 0,
 			       intelImage->mt->level[level].width,
-			       intelImage->mt->level[level].height/4,
+			       (intelImage->mt->level[level].height+3)/4,
 			       pixels,
 			       srcRowStride,
 			       0, 0);
@@ -535,22 +537,24 @@ intelTexImage(GLcontext * ctx,
        }
    }
 
-   /* GL_SGIS_generate_mipmap */
-   if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      intel_generate_mipmap(ctx, target, texObj);
-   }
-
    _mesa_unmap_teximage_pbo(ctx, unpack);
 
    if (intelImage->mt) {
-      intel_miptree_image_unmap(intel, intelImage->mt);
+      if (pixels != NULL)
+         intel_miptree_image_unmap(intel, intelImage->mt);
       texImage->Data = NULL;
    }
 
    UNLOCK_HARDWARE(intel);
+
+   /* GL_SGIS_generate_mipmap */
+   if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+      intel_generate_mipmap(ctx, target, texObj);
+   }
 }
 
-void
+
+static void
 intelTexImage3D(GLcontext * ctx,
                 GLenum target, GLint level,
                 GLint internalFormat,
@@ -563,11 +567,11 @@ intelTexImage3D(GLcontext * ctx,
 {
    intelTexImage(ctx, 3, target, level,
                  internalFormat, width, height, depth, border,
-                 format, type, pixels, unpack, texObj, texImage, 0, 0);
+                 format, type, pixels, unpack, texObj, texImage, 0, GL_FALSE);
 }
 
 
-void
+static void
 intelTexImage2D(GLcontext * ctx,
                 GLenum target, GLint level,
                 GLint internalFormat,
@@ -579,10 +583,11 @@ intelTexImage2D(GLcontext * ctx,
 {
    intelTexImage(ctx, 2, target, level,
                  internalFormat, width, height, 1, border,
-                 format, type, pixels, unpack, texObj, texImage, 0, 0);
+                 format, type, pixels, unpack, texObj, texImage, 0, GL_FALSE);
 }
 
-void
+
+static void
 intelTexImage1D(GLcontext * ctx,
                 GLenum target, GLint level,
                 GLint internalFormat,
@@ -594,21 +599,24 @@ intelTexImage1D(GLcontext * ctx,
 {
    intelTexImage(ctx, 1, target, level,
                  internalFormat, width, 1, 1, border,
-                 format, type, pixels, unpack, texObj, texImage, 0, 0);
+                 format, type, pixels, unpack, texObj, texImage, 0, GL_FALSE);
 }
 
-void intelCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
-				GLint internalFormat,
-				GLint width, GLint height, GLint border,
-				GLsizei imageSize, const GLvoid *data,
-				struct gl_texture_object *texObj,
-				struct gl_texture_image *texImage )
+
+static void
+intelCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
+                           GLint internalFormat,
+                           GLint width, GLint height, GLint border,
+                           GLsizei imageSize, const GLvoid *data,
+                           struct gl_texture_object *texObj,
+                           struct gl_texture_image *texImage )
 {
    intelTexImage(ctx, 2, target, level,
 		 internalFormat, width, height, 1, border,
-		 0, 0, data, &ctx->Unpack, texObj, texImage, imageSize, 1);
+		 0, 0, data, &ctx->Unpack, texObj, texImage, imageSize, GL_TRUE);
 }
 
+
 /**
  * Need to map texture image into memory before copying image data,
  * then unmap it.
@@ -617,11 +625,17 @@ static void
 intel_get_tex_image(GLcontext * ctx, GLenum target, GLint level,
 		    GLenum format, GLenum type, GLvoid * pixels,
 		    struct gl_texture_object *texObj,
-		    struct gl_texture_image *texImage, int compressed)
+		    struct gl_texture_image *texImage, GLboolean compressed)
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_texture_image *intelImage = intel_texture_image(texImage);
 
+   /* If we're reading from a texture that has been rendered to, need to
+    * make sure rendering is complete.
+    * We could probably predicate this on texObj->_RenderToTexture
+    */
+   intelFlush(ctx);
+
    /* Map */
    if (intelImage->mt) {
       /* Image is stored in hardware format in a buffer managed by the
@@ -665,28 +679,29 @@ intel_get_tex_image(GLcontext * ctx, GLenum target, GLint level,
    }
 }
 
-void
+
+static void
 intelGetTexImage(GLcontext * ctx, GLenum target, GLint level,
                  GLenum format, GLenum type, GLvoid * pixels,
                  struct gl_texture_object *texObj,
                  struct gl_texture_image *texImage)
 {
    intel_get_tex_image(ctx, target, level, format, type, pixels,
-		       texObj, texImage, 0);
-
-
+		       texObj, texImage, GL_FALSE);
 }
 
-void
+
+static void
 intelGetCompressedTexImage(GLcontext *ctx, GLenum target, GLint level,
 			   GLvoid *pixels,
 			   struct gl_texture_object *texObj,
 			   struct gl_texture_image *texImage)
 {
    intel_get_tex_image(ctx, target, level, 0, 0, pixels,
-		       texObj, texImage, 1);
+		       texObj, texImage, GL_TRUE);
 }
 
+
 void
 intelSetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 		  unsigned long long offset, GLint depth, GLuint pitch)
@@ -710,7 +725,9 @@ intelSetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 }
 
 void
-intelSetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
+intelSetTexBuffer2(__DRIcontext *pDRICtx, GLint target,
+		   GLint glx_texture_format,
+		   __DRIdrawable *dPriv)
 {
    struct intel_framebuffer *intel_fb = dPriv->driverPrivate;
    struct intel_context *intel = pDRICtx->driverPrivate;
@@ -741,7 +758,10 @@ intelSetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
 
    type = GL_BGRA;
    format = GL_UNSIGNED_BYTE;
-   internalFormat = (rb->region->cpp == 3 ? 3 : 4);
+   if (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT)
+      internalFormat = GL_RGB;
+   else
+      internalFormat = GL_RGBA;
 
    mt = intel_miptree_create_for_region(intel, target,
 					internalFormat,
@@ -751,16 +771,21 @@ intelSetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
 
    _mesa_lock_texture(&intel->ctx, texObj);
 
+   texImage = _mesa_get_tex_image(&intel->ctx, texObj, target, level);
+   intelImage = intel_texture_image(texImage);
+
+   if (intelImage->mt) {
+      intel_miptree_release(intel, &intelImage->mt);
+      assert(!texImage->Data);
+   }
    if (intelObj->mt)
       intel_miptree_release(intel, &intelObj->mt);
 
    intelObj->mt = mt;
-   texImage = _mesa_get_tex_image(&intel->ctx, texObj, target, level);
    _mesa_init_teximage_fields(&intel->ctx, target, texImage,
 			      rb->region->width, rb->region->height, 1,
 			      0, internalFormat);
 
-   intelImage = intel_texture_image(texImage);
    intelImage->face = target_to_face(target);
    intelImage->level = level;
    texImage->TexFormat = intelChooseTextureFormat(&intel->ctx, internalFormat,
@@ -776,3 +801,25 @@ intelSetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
 
    _mesa_unlock_texture(&intel->ctx, texObj);
 }
+
+void
+intelSetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
+{
+   /* The old interface didn't have the format argument, so copy our
+    * implementation's behavior at the time.
+    */
+   intelSetTexBuffer2(pDRICtx, target, GLX_TEXTURE_FORMAT_RGBA_EXT, dPriv);
+}
+
+
+void
+intelInitTextureImageFuncs(struct dd_function_table *functions)
+{
+   functions->TexImage1D = intelTexImage1D;
+   functions->TexImage2D = intelTexImage2D;
+   functions->TexImage3D = intelTexImage3D;
+   functions->GetTexImage = intelGetTexImage;
+
+   functions->CompressedTexImage2D = intelCompressedTexImage2D;
+   functions->GetCompressedTexImage = intelGetCompressedTexImage;
+}
diff --git a/src/mesa/drivers/dri/intel/intel_tex_layout.c b/src/mesa/drivers/dri/intel/intel_tex_layout.c
index e6f9a41779..7d69ea4484 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_layout.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_layout.c
@@ -35,26 +35,39 @@
 #include "intel_context.h"
 #include "main/macros.h"
 
-GLuint intel_compressed_alignment(GLenum internalFormat)
+void intel_get_texture_alignment_unit(GLenum internalFormat, GLuint *w, GLuint *h)
 {
-    GLuint alignment = 4;
-
     switch (internalFormat) {
     case GL_COMPRESSED_RGB_FXT1_3DFX:
     case GL_COMPRESSED_RGBA_FXT1_3DFX:
-        alignment = 8;
+        *w = 8;
+        *h = 4;
+        break;
+
+    case GL_RGB_S3TC:
+    case GL_RGB4_S3TC:
+    case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
+    case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
+    case GL_RGBA_S3TC:
+    case GL_RGBA4_S3TC:
+    case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
+    case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
+        *w = 4;
+        *h = 4;
         break;
 
     default:
+        *w = 4;
+        *h = 2;
         break;
     }
-
-    return alignment;
 }
 
-void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tree *mt )
+void i945_miptree_layout_2d( struct intel_context *intel,
+			     struct intel_mipmap_tree *mt,
+			     uint32_t tiling )
 {
-   GLint align_h = 2, align_w = 4;
+   GLuint align_h = 2, align_w = 4;
    GLuint level;
    GLuint x = 0;
    GLuint y = 0;
@@ -62,9 +75,9 @@ void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tr
    GLuint height = mt->height0;
 
    mt->pitch = mt->width0;
+   intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
 
    if (mt->compressed) {
-       align_w = intel_compressed_alignment(mt->internal_format);
        mt->pitch = ALIGN(mt->width0, align_w);
    }
 
@@ -92,7 +105,7 @@ void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tr
    /* Pitch must be a whole number of dwords, even though we
     * express it in texels.
     */
-   mt->pitch = intel_miptree_pitch_align (intel, mt, mt->pitch);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->pitch);
    mt->total_height = 0;
 
    for ( level = mt->first_level ; level <= mt->last_level ; level++ ) {
diff --git a/src/mesa/drivers/dri/intel/intel_tex_layout.h b/src/mesa/drivers/dri/intel/intel_tex_layout.h
index dbc90e6f9b..c9de9b5678 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_layout.h
+++ b/src/mesa/drivers/dri/intel/intel_tex_layout.h
@@ -38,5 +38,7 @@ static GLuint minify( GLuint d )
    return MAX2(1, d>>1);
 }
 
-extern void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tree *mt );
-extern GLuint intel_compressed_alignment(GLenum);
+extern void i945_miptree_layout_2d(struct intel_context *intel,
+				   struct intel_mipmap_tree *mt,
+				   uint32_t tiling);
+extern void intel_get_texture_alignment_unit(GLenum, GLuint *, GLuint *);
diff --git a/src/mesa/drivers/dri/intel/intel_tex_subimage.c b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
index f86de56897..89037073f8 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
@@ -44,10 +44,12 @@ intelTexSubimage(GLcontext * ctx,
                  GLenum target, GLint level,
                  GLint xoffset, GLint yoffset, GLint zoffset,
                  GLint width, GLint height, GLint depth,
+                 GLsizei imageSize,
                  GLenum format, GLenum type, const void *pixels,
                  const struct gl_pixelstore_attrib *packing,
                  struct gl_texture_object *texObj,
-                 struct gl_texture_image *texImage)
+                 struct gl_texture_image *texImage,
+                 GLboolean compressed)
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_texture_image *intelImage = intel_texture_image(texImage);
@@ -59,9 +61,14 @@ intelTexSubimage(GLcontext * ctx,
 
    intelFlush(ctx);
 
-   pixels =
-      _mesa_validate_pbo_teximage(ctx, dims, width, height, depth, format,
-                                  type, pixels, packing, "glTexSubImage2D");
+   if (compressed)
+      pixels = _mesa_validate_pbo_compressed_teximage(ctx, imageSize,
+                                                      pixels, packing,
+                                                      "glCompressedTexImage");
+   else
+      pixels = _mesa_validate_pbo_teximage(ctx, dims, width, height, depth,
+                                           format, type, pixels, packing,
+                                           "glTexSubImage");
    if (!pixels)
       return;
 
@@ -90,20 +97,28 @@ intelTexSubimage(GLcontext * ctx,
 
    assert(dstRowStride);
 
-   if (!texImage->TexFormat->StoreImage(ctx, dims, texImage->_BaseFormat,
-                                        texImage->TexFormat,
-                                        texImage->Data,
-                                        xoffset, yoffset, zoffset,
-                                        dstRowStride,
-                                        texImage->ImageOffsets,
-                                        width, height, depth,
-                                        format, type, pixels, packing)) {
-      _mesa_error(ctx, GL_OUT_OF_MEMORY, "intelTexSubImage");
+   if (compressed) {
+      if (intelImage->mt) {
+         struct intel_region *dst = intelImage->mt->region;
+         
+         _mesa_copy_rect(texImage->Data, dst->cpp, dst->pitch,
+                         xoffset, yoffset / 4,
+                         (width + 3)  & ~3, (height + 3) / 4,
+                         pixels, (width + 3) & ~3, 0, 0);
+      } else
+        memcpy(texImage->Data, pixels, imageSize);
    }
-
-   /* GL_SGIS_generate_mipmap */
-   if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      intel_generate_mipmap(ctx, target, texObj);
+   else {
+      if (!texImage->TexFormat->StoreImage(ctx, dims, texImage->_BaseFormat,
+                                           texImage->TexFormat,
+                                           texImage->Data,
+                                           xoffset, yoffset, zoffset,
+                                           dstRowStride,
+                                           texImage->ImageOffsets,
+                                           width, height, depth,
+                                           format, type, pixels, packing)) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "intelTexSubImage");
+      }
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -114,13 +129,15 @@ intelTexSubimage(GLcontext * ctx,
    }
 
    UNLOCK_HARDWARE(intel);
-}
-
-
 
+   /* GL_SGIS_generate_mipmap */
+   if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+      intel_generate_mipmap(ctx, target, texObj);
+   }
+}
 
 
-void
+static void
 intelTexSubImage3D(GLcontext * ctx,
                    GLenum target,
                    GLint level,
@@ -132,18 +149,15 @@ intelTexSubImage3D(GLcontext * ctx,
                    struct gl_texture_object *texObj,
                    struct gl_texture_image *texImage)
 {
-
    intelTexSubimage(ctx, 3,
                     target, level,
                     xoffset, yoffset, zoffset,
-                    width, height, depth,
-                    format, type, pixels, packing, texObj, texImage);
-
+                    width, height, depth, 0,
+                    format, type, pixels, packing, texObj, texImage, GL_FALSE);
 }
 
 
-
-void
+static void
 intelTexSubImage2D(GLcontext * ctx,
                    GLenum target,
                    GLint level,
@@ -155,17 +169,15 @@ intelTexSubImage2D(GLcontext * ctx,
                    struct gl_texture_object *texObj,
                    struct gl_texture_image *texImage)
 {
-
    intelTexSubimage(ctx, 2,
                     target, level,
                     xoffset, yoffset, 0,
-                    width, height, 1,
-                    format, type, pixels, packing, texObj, texImage);
-
+                    width, height, 1, 0,
+                    format, type, pixels, packing, texObj, texImage, GL_FALSE);
 }
 
 
-void
+static void
 intelTexSubImage1D(GLcontext * ctx,
                    GLenum target,
                    GLint level,
@@ -180,12 +192,11 @@ intelTexSubImage1D(GLcontext * ctx,
    intelTexSubimage(ctx, 1,
                     target, level,
                     xoffset, 0, 0,
-                    width, 1, 1,
-                    format, type, pixels, packing, texObj, texImage);
-
+                    width, 1, 1, 0,
+                    format, type, pixels, packing, texObj, texImage, GL_FALSE);
 }
 
-void
+static void
 intelCompressedTexSubImage2D(GLcontext * ctx,
 			     GLenum target,
 			     GLint level,
@@ -196,6 +207,20 @@ intelCompressedTexSubImage2D(GLcontext * ctx,
 			     struct gl_texture_object *texObj,
 			     struct gl_texture_image *texImage)
 {
-   fprintf(stderr, "stubbed CompressedTexSubImage2D: %dx%d@%dx%d\n",
-	   width, height, xoffset, yoffset);
+   intelTexSubimage(ctx, 2,
+                    target, level,
+                    xoffset, yoffset, 0,
+                    width, height, 1, imageSize,
+                    format, 0, pixels, &ctx->Unpack, texObj, texImage, GL_TRUE);
+}
+
+
+
+void
+intelInitTextureSubImageFuncs(struct dd_function_table *functions)
+{
+   functions->TexSubImage1D = intelTexSubImage1D;
+   functions->TexSubImage2D = intelTexSubImage2D;
+   functions->TexSubImage3D = intelTexSubImage3D;
+   functions->CompressedTexSubImage2D = intelCompressedTexSubImage2D;
 }
diff --git a/src/mesa/drivers/dri/intel/intel_tex_validate.c b/src/mesa/drivers/dri/intel/intel_tex_validate.c
index 820683d42e..a284d5475f 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_validate.c
@@ -199,6 +199,7 @@ intel_finalize_mipmap_tree(struct intel_context *intel, GLuint unit)
    if (!intelObj->mt) {
       intelObj->mt = intel_miptree_create(intel,
                                           intelObj->base.Target,
+                                          firstImage->base._BaseFormat,
                                           firstImage->base.InternalFormat,
                                           intelObj->firstLevel,
                                           intelObj->lastLevel,
@@ -206,7 +207,8 @@ intel_finalize_mipmap_tree(struct intel_context *intel, GLuint unit)
                                           firstImage->base.Height,
                                           firstImage->base.Depth,
                                           cpp,
-                                          comp_byte);
+                                          comp_byte,
+					  GL_TRUE);
    }
 
    /* Pull in any images not in the object's tree:
@@ -240,7 +242,7 @@ intel_tex_map_level_images(struct intel_context *intel,
       struct intel_texture_image *intelImage =
 	 intel_texture_image(intelObj->base.Image[face][level]);
 
-      if (intelImage->mt) {
+      if (intelImage && intelImage->mt) {
 	 intelImage->base.Data =
 	    intel_miptree_image_map(intel,
 				    intelImage->mt,
@@ -267,7 +269,7 @@ intel_tex_unmap_level_images(struct intel_context *intel,
       struct intel_texture_image *intelImage =
 	 intel_texture_image(intelObj->base.Image[face][level]);
 
-      if (intelImage->mt) {
+      if (intelImage && intelImage->mt) {
 	 intel_miptree_image_unmap(intel, intelImage->mt);
 	 intelImage->base.Data = NULL;
       }
diff --git a/src/mesa/drivers/dri/mach64/mach64_context.c b/src/mesa/drivers/dri/mach64/mach64_context.c
index 99abd209b6..9c7f513c6f 100644
--- a/src/mesa/drivers/dri/mach64/mach64_context.c
+++ b/src/mesa/drivers/dri/mach64/mach64_context.c
@@ -57,10 +57,6 @@
 #include "utils.h"
 #include "vblank.h"
 
-#define need_GL_ARB_multisample
-#define need_GL_ARB_vertex_buffer_object
-#include "extension_helper.h"
-
 #ifndef MACH64_DEBUG
 int MACH64_DEBUG = (0);
 #endif
@@ -82,9 +78,7 @@ static const struct dri_debug_control debug_control[] =
 
 const struct dri_extension card_extensions[] =
 {
-    { "GL_ARB_multisample",                GL_ARB_multisample_functions },
     { "GL_ARB_multitexture",               NULL },
-    { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
     { "GL_EXT_texture_edge_clamp",         NULL },
     { "GL_MESA_ycbcr_texture",             NULL },
     { "GL_SGIS_generate_mipmap",           NULL },
@@ -196,6 +190,7 @@ GLboolean mach64CreateContext( const __GLcontextModes *glVisual,
    ctx->Const.MaxTextureUnits = 2;
    ctx->Const.MaxTextureImageUnits = 2;
    ctx->Const.MaxTextureCoordUnits = 2;
+   ctx->Const.MaxDrawBuffers = 1;
 
    heap = mach64Screen->IsPCI ? MACH64_CARD_HEAP : MACH64_AGP_HEAP;
 
diff --git a/src/mesa/drivers/dri/mach64/mach64_screen.c b/src/mesa/drivers/dri/mach64/mach64_screen.c
index 6bfb4c32b1..6440027ca4 100644
--- a/src/mesa/drivers/dri/mach64/mach64_screen.c
+++ b/src/mesa/drivers/dri/mach64/mach64_screen.c
@@ -93,6 +93,7 @@ mach64FillInModes( __DRIscreenPrivate *psp,
 
     uint8_t depth_bits_array[2];
     uint8_t stencil_bits_array[2];
+    uint8_t msaa_samples_array[1];
 
     depth_bits_array[0] = depth_bits;
     depth_bits_array[1] = depth_bits;
@@ -104,6 +105,8 @@ mach64FillInModes( __DRIscreenPrivate *psp,
     stencil_bits_array[0] = 0;
     stencil_bits_array[1] = (stencil_bits == 0) ? 8 : stencil_bits;
 
+    msaa_samples_array[0] = 0;
+
     depth_buffer_factor = ((depth_bits != 0) || (stencil_bits != 0)) ? 2 : 1;
     back_buffer_factor  = (have_back_buffer) ? 2 : 1;
 
@@ -119,7 +122,8 @@ mach64FillInModes( __DRIscreenPrivate *psp,
     configs = driCreateConfigs(fb_format, fb_type,
 			       depth_bits_array, stencil_bits_array,
 			       depth_buffer_factor, back_buffer_modes,
-			       back_buffer_factor);
+			       back_buffer_factor,
+                               msaa_samples_array, 1);
     if (configs == NULL) {
        fprintf(stderr, "[%s:%u] Error creating FBConfig!\n",
 	       __func__, __LINE__);
@@ -370,7 +374,7 @@ mach64CreateBuffer( __DRIscreenPrivate *driScrnPriv,
 static void
 mach64DestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 {
-   _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
+   _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 
diff --git a/src/mesa/drivers/dri/mach64/mach64_tex.c b/src/mesa/drivers/dri/mach64/mach64_tex.c
index 1f9d3c57eb..225d23179e 100644
--- a/src/mesa/drivers/dri/mach64/mach64_tex.c
+++ b/src/mesa/drivers/dri/mach64/mach64_tex.c
@@ -99,7 +99,7 @@ static void mach64SetTexFilter( mach64TexObjPtr t,
    }
 }
 
-static void mach64SetTexBorderColor( mach64TexObjPtr t, GLubyte c[4] )
+static void mach64SetTexBorderColor( mach64TexObjPtr t, const GLfloat c[4] )
 {
 #if 0
    GLuint border = mach64PackColor( 4, c[0], c[1], c[2], c[3] );
@@ -131,7 +131,7 @@ mach64AllocTexObj( struct gl_texture_object *texObj )
 
    mach64SetTexWrap( t, texObj->WrapS, texObj->WrapT );
    mach64SetTexFilter( t, texObj->MinFilter, texObj->MagFilter );
-   mach64SetTexBorderColor( t, texObj->_BorderChan );
+   mach64SetTexBorderColor( t, texObj->BorderColor );
 
    return t;
 }
@@ -152,6 +152,7 @@ mach64ChooseTextureFormat( GLcontext *ctx, GLint internalFormat,
    case GL_ALPHA8:
    case GL_ALPHA12:
    case GL_ALPHA16:
+   case GL_COMPRESSED_ALPHA:
    case 2:
    case GL_LUMINANCE_ALPHA:
    case GL_LUMINANCE4_ALPHA4:
@@ -160,9 +161,11 @@ mach64ChooseTextureFormat( GLcontext *ctx, GLint internalFormat,
    case GL_LUMINANCE12_ALPHA4:
    case GL_LUMINANCE12_ALPHA12:
    case GL_LUMINANCE16_ALPHA16:
+   case GL_COMPRESSED_LUMINANCE_ALPHA:
    case 4:
    case GL_RGBA:
    case GL_RGBA2:
+   case GL_COMPRESSED_RGBA:
       if (mmesa->mach64Screen->cpp == 4)
          return &_mesa_texformat_argb8888;
       else
@@ -193,6 +196,7 @@ mach64ChooseTextureFormat( GLcontext *ctx, GLint internalFormat,
    case GL_RGB10:
    case GL_RGB12:
    case GL_RGB16:
+   case GL_COMPRESSED_RGB:
       if (mmesa->mach64Screen->cpp == 4)
          return &_mesa_texformat_argb8888;
       else
@@ -204,6 +208,7 @@ mach64ChooseTextureFormat( GLcontext *ctx, GLint internalFormat,
    case GL_LUMINANCE8:
    case GL_LUMINANCE12:
    case GL_LUMINANCE16:
+   case GL_COMPRESSED_LUMINANCE:
       if (mmesa->mach64Screen->cpp == 4)
          return &_mesa_texformat_argb8888; /* inefficient but accurate */
       else
@@ -214,6 +219,7 @@ mach64ChooseTextureFormat( GLcontext *ctx, GLint internalFormat,
    case GL_INTENSITY8:
    case GL_INTENSITY12:
    case GL_INTENSITY16:
+   case GL_COMPRESSED_INTENSITY:
       if (mmesa->mach64Screen->cpp == 4)
          return &_mesa_texformat_argb8888; /* inefficient but accurate */
       else
@@ -465,7 +471,7 @@ static void mach64DDTexParameter( GLcontext *ctx, GLenum target,
 
    case GL_TEXTURE_BORDER_COLOR:
       if ( t->base.bound ) FLUSH_BATCH( mmesa );
-      mach64SetTexBorderColor( t, tObj->_BorderChan );
+      mach64SetTexBorderColor( t, tObj->BorderColor );
       break;
 
    case GL_TEXTURE_BASE_LEVEL:
diff --git a/src/mesa/drivers/dri/mga/mga_texstate.c b/src/mesa/drivers/dri/mga/mga_texstate.c
index d4c5b6fd97..ad765d1dd7 100644
--- a/src/mesa/drivers/dri/mga/mga_texstate.c
+++ b/src/mesa/drivers/dri/mga/mga_texstate.c
@@ -206,8 +206,8 @@ static void mgaUpdateTextureEnvG200( GLcontext *ctx, GLuint unit )
    mgaTextureObjectPtr t = (mgaTextureObjectPtr) tObj->DriverData;
    GLenum format = tObj->Image[0][tObj->BaseLevel]->_BaseFormat;
 
-   if (tObj != ctx->Texture.Unit[0].Current2D &&
-       tObj != ctx->Texture.Unit[0].CurrentRect)
+   if (tObj != ctx->Texture.Unit[0].CurrentTex[TEXTURE_2D_INDEX] &&
+       tObj != ctx->Texture.Unit[0].CurrentTex[TEXTURE_RECT_INDEX])
       return;
 
 
@@ -635,8 +635,8 @@ static void mgaUpdateTextureEnvG400( GLcontext *ctx, GLuint unit )
    mgaTextureObjectPtr t = (mgaTextureObjectPtr) tObj->DriverData;
    GLenum format = tObj->Image[0][tObj->BaseLevel]->_BaseFormat;
 
-   if (tObj != ctx->Texture.Unit[source].Current2D &&
-       tObj != ctx->Texture.Unit[source].CurrentRect)
+   if (tObj != ctx->Texture.Unit[source].CurrentTex[TEXTURE_2D_INDEX] &&
+       tObj != ctx->Texture.Unit[source].CurrentTex[TEXTURE_RECT_INDEX])
       return;
 
    switch (ctx->Texture.Unit[source].EnvMode) {
diff --git a/src/mesa/drivers/dri/mga/mga_xmesa.c b/src/mesa/drivers/dri/mga/mga_xmesa.c
index 86da3a2cac..0dc76fea50 100644
--- a/src/mesa/drivers/dri/mga/mga_xmesa.c
+++ b/src/mesa/drivers/dri/mga/mga_xmesa.c
@@ -69,13 +69,9 @@
 
 #include "GL/internal/dri_interface.h"
 
-#define need_GL_ARB_multisample
-#define need_GL_ARB_texture_compression
-#define need_GL_ARB_vertex_buffer_object
 #define need_GL_ARB_vertex_program
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_gpu_program_parameters
-#define need_GL_EXT_multi_draw_arrays
 #define need_GL_EXT_secondary_color
 #if 0
 #define need_GL_EXT_paletted_texture
@@ -133,6 +129,7 @@ mgaFillInModes( __DRIscreenPrivate *psp,
 
     uint8_t depth_bits_array[3];
     uint8_t stencil_bits_array[3];
+    uint8_t msaa_samples_array[1];
 
 
     depth_bits_array[0] = 0;
@@ -147,6 +144,8 @@ mgaFillInModes( __DRIscreenPrivate *psp,
     stencil_bits_array[1] = 0;
     stencil_bits_array[2] = (stencil_bits == 0) ? 8 : stencil_bits;
 
+    msaa_samples_array[0] = 0;
+
     depth_buffer_factor = ((depth_bits != 0) || (stencil_bits != 0)) ? 3 : 1;
     back_buffer_factor  = (have_back_buffer) ? 2 : 1;
 
@@ -162,7 +161,8 @@ mgaFillInModes( __DRIscreenPrivate *psp,
     configs = driCreateConfigs(fb_format, fb_type,
 			       depth_bits_array, stencil_bits_array,
 			       depth_buffer_factor,
-			       back_buffer_modes, back_buffer_factor);
+			       back_buffer_modes, back_buffer_factor,
+                               msaa_samples_array, 1);
     if (configs == NULL) {
 	fprintf( stderr, "[%s:%u] Error creating FBConfig!\n",
 		 __func__, __LINE__ );
@@ -385,13 +385,9 @@ static const struct dri_extension g400_extensions[] =
 
 static const struct dri_extension card_extensions[] =
 {
-   { "GL_ARB_multisample",            GL_ARB_multisample_functions },
-   { "GL_ARB_texture_compression",    GL_ARB_texture_compression_functions },
    { "GL_ARB_texture_rectangle",      NULL },
-   { "GL_ARB_vertex_buffer_object",   GL_ARB_vertex_buffer_object_functions },
    { "GL_EXT_blend_logic_op",         NULL },
    { "GL_EXT_fog_coord",              GL_EXT_fog_coord_functions },
-   { "GL_EXT_multi_draw_arrays",      GL_EXT_multi_draw_arrays_functions },
    /* paletted_textures currently doesn't work, but we could fix them later */
 #if defined( need_GL_EXT_paletted_texture )
    { "GL_EXT_shared_texture_palette", NULL },
@@ -539,6 +535,8 @@ mgaCreateContext( const __GLcontextModes *mesaVis,
    ctx->Const.MaxLineWidthAA = 10.0;
    ctx->Const.LineWidthGranularity = 1.0;
 
+   ctx->Const.MaxDrawBuffers = 1;
+
    mmesa->texture_depth = driQueryOptioni (&mmesa->optionCache,
 					   "texture_depth");
    if (mmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
@@ -818,7 +816,7 @@ mgaCreateBuffer( __DRIscreenPrivate *driScrnPriv,
 static void
 mgaDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 {
-   _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
+   _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/mga/mgapixel.c b/src/mesa/drivers/dri/mga/mgapixel.c
index 9f90047ba5..977dfa0b76 100644
--- a/src/mesa/drivers/dri/mga/mgapixel.c
+++ b/src/mesa/drivers/dri/mga/mgapixel.c
@@ -133,7 +133,7 @@ check_color_per_fragment_ops( const GLcontext *ctx )
 		    ctx->Depth.Test ||
 		    ctx->Fog.Enabled ||
 		    ctx->Scissor.Enabled ||
-		    ctx->Stencil.Enabled ||
+		    ctx->Stencil._Enabled ||
 		    !ctx->Color.ColorMask[0] ||
 		    !ctx->Color.ColorMask[1] ||
 		    !ctx->Color.ColorMask[2] ||
diff --git a/src/mesa/drivers/dri/mga/mgatex.c b/src/mesa/drivers/dri/mga/mgatex.c
index 2392622b90..33eb0be449 100644
--- a/src/mesa/drivers/dri/mga/mgatex.c
+++ b/src/mesa/drivers/dri/mga/mgatex.c
@@ -153,10 +153,14 @@ mgaSetTexFilter( mgaTextureObjectPtr t, GLenum minf, GLenum magf )
    t->setup.texfilter |= val;
 }
 
-static void mgaSetTexBorderColor(mgaTextureObjectPtr t, GLubyte color[4])
+static void mgaSetTexBorderColor(mgaTextureObjectPtr t, const GLfloat color[4])
 {
-   t->setup.texbordercol = PACK_COLOR_8888(color[3], color[0], 
-					   color[1], color[2] );
+   GLubyte c[4];
+   CLAMPED_FLOAT_TO_UBYTE(c[0], color[0]);
+   CLAMPED_FLOAT_TO_UBYTE(c[1], color[1]);
+   CLAMPED_FLOAT_TO_UBYTE(c[2], color[2]);
+   CLAMPED_FLOAT_TO_UBYTE(c[3], color[3]);
+   t->setup.texbordercol = PACK_COLOR_8888(c[3], c[0], c[1], c[2] );
 }
 
 
@@ -329,7 +333,7 @@ mgaAllocTexObj( struct gl_texture_object *tObj )
 
       mgaSetTexWrapping( t, tObj->WrapS, tObj->WrapT );
       mgaSetTexFilter( t, tObj->MinFilter, tObj->MagFilter );
-      mgaSetTexBorderColor( t, tObj->_BorderChan );
+      mgaSetTexBorderColor( t, tObj->BorderColor );
    }
 
    return( t );
@@ -458,7 +462,7 @@ mgaTexParameter( GLcontext *ctx, GLenum target,
 
    case GL_TEXTURE_BORDER_COLOR:
       FLUSH_BATCH(mmesa);
-      mgaSetTexBorderColor(t, tObj->_BorderChan);
+      mgaSetTexBorderColor(t, tObj->BorderColor);
       break;
 
    case GL_TEXTURE_BASE_LEVEL:
diff --git a/src/mesa/drivers/dri/r128/r128_context.c b/src/mesa/drivers/dri/r128/r128_context.c
index 535a98cc01..f511a67bad 100644
--- a/src/mesa/drivers/dri/r128/r128_context.c
+++ b/src/mesa/drivers/dri/r128/r128_context.c
@@ -65,9 +65,6 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 int R128_DEBUG = 0;
 #endif
 
-#define need_GL_ARB_multisample
-#define need_GL_ARB_texture_compression
-#define need_GL_ARB_vertex_buffer_object
 #define need_GL_EXT_blend_minmax
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_secondary_color
@@ -75,12 +72,9 @@ int R128_DEBUG = 0;
 
 const struct dri_extension card_extensions[] =
 {
-    { "GL_ARB_multisample",                GL_ARB_multisample_functions },
     { "GL_ARB_multitexture",               NULL },
-    { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
     { "GL_ARB_texture_env_add",            NULL },
     { "GL_ARB_texture_mirrored_repeat",    NULL },
-    { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
     { "GL_EXT_blend_subtract",             GL_EXT_blend_minmax_functions },
     { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
     { "GL_EXT_texture_edge_clamp",         NULL },
@@ -229,6 +223,8 @@ GLboolean r128CreateContext( const __GLcontextModes *glVisual,
    ctx->Const.MaxLineWidthAA = 1.0;
    ctx->Const.LineWidthGranularity = 1.0;
 
+   ctx->Const.MaxDrawBuffers = 1;
+
 #if ENABLE_PERF_BOXES
    rmesa->boxes = driQueryOptionb(&rmesa->optionCache, "performance_boxes");
 #endif
diff --git a/src/mesa/drivers/dri/r128/r128_screen.c b/src/mesa/drivers/dri/r128/r128_screen.c
index cb3a147dba..f5bcc2f290 100644
--- a/src/mesa/drivers/dri/r128/r128_screen.c
+++ b/src/mesa/drivers/dri/r128/r128_screen.c
@@ -353,7 +353,7 @@ r128CreateBuffer( __DRIscreenPrivate *driScrnPriv,
 static void
 r128DestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 {
-   _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
+   _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 
@@ -422,7 +422,7 @@ r128FillInModes( __DRIscreenPrivate *psp,
 
     uint8_t depth_bits_array[2];
     uint8_t stencil_bits_array[2];
-
+    uint8_t msaa_samples_array[1];
 
     depth_bits_array[0] = depth_bits;
     depth_bits_array[1] = depth_bits;
@@ -434,6 +434,8 @@ r128FillInModes( __DRIscreenPrivate *psp,
     stencil_bits_array[0] = 0;
     stencil_bits_array[1] = (stencil_bits == 0) ? 8 : stencil_bits;
 
+    msaa_samples_array[0] = 0;
+
     depth_buffer_factor = ((depth_bits != 0) || (stencil_bits != 0)) ? 2 : 1;
     back_buffer_factor  = (have_back_buffer) ? 2 : 1;
 
@@ -446,26 +448,27 @@ r128FillInModes( __DRIscreenPrivate *psp,
         fb_type = GL_UNSIGNED_INT_8_8_8_8_REV;
     }
 
-   configs = driCreateConfigs(fb_format, fb_type,
-			      depth_bits_array, stencil_bits_array,
-			      depth_buffer_factor, back_buffer_modes,
-			      back_buffer_factor);
-   if (configs == NULL) {
-    fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
-              __LINE__);
-      return NULL;
-   }
+    configs = driCreateConfigs(fb_format, fb_type,
+                               depth_bits_array, stencil_bits_array,
+                               depth_buffer_factor, back_buffer_modes,
+                               back_buffer_factor,
+                               msaa_samples_array, 1);
+    if (configs == NULL) {
+        fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
+                __LINE__);
+        return NULL;
+    }
 
-   /* Mark the visual as slow if there are "fake" stencil bits.
-    */
-   for (i = 0; configs[i]; i++) {
-      m = &configs[i]->modes;
-      if ((m->stencilBits != 0) && (m->stencilBits != stencil_bits)) {
-         m->visualRating = GLX_SLOW_CONFIG;
-      }
-   }
+    /* Mark the visual as slow if there are "fake" stencil bits.
+     */
+    for (i = 0; configs[i]; i++) {
+        m = &configs[i]->modes;
+        if ((m->stencilBits != 0) && (m->stencilBits != stencil_bits)) {
+            m->visualRating = GLX_SLOW_CONFIG;
+        }
+    }
 
-   return (const __DRIconfig **) configs;
+    return (const __DRIconfig **) configs;
 }
 
 
diff --git a/src/mesa/drivers/dri/r128/r128_state.c b/src/mesa/drivers/dri/r128/r128_state.c
index 451dcd1b55..4ae7bf5b97 100644
--- a/src/mesa/drivers/dri/r128/r128_state.c
+++ b/src/mesa/drivers/dri/r128/r128_state.c
@@ -771,6 +771,11 @@ static void r128DDLightModelfv( GLcontext *ctx, GLenum pname,
       FLUSH_BATCH( rmesa );
       updateSpecularLighting(ctx);
    }
+
+   if ( pname == GL_LIGHT_MODEL_TWO_SIDE ) {
+      FLUSH_BATCH( rmesa );
+      r128ChooseRenderState( ctx );
+   }
 }
 
 static void r128DDShadeModel( GLcontext *ctx, GLenum mode )
diff --git a/src/mesa/drivers/dri/r128/r128_tex.c b/src/mesa/drivers/dri/r128/r128_tex.c
index 3fc9c06cfa..0920270d7b 100644
--- a/src/mesa/drivers/dri/r128/r128_tex.c
+++ b/src/mesa/drivers/dri/r128/r128_tex.c
@@ -135,8 +135,13 @@ static void r128SetTexFilter( r128TexObjPtr t, GLenum minf, GLenum magf )
    }
 }
 
-static void r128SetTexBorderColor( r128TexObjPtr t, GLubyte c[4] )
+static void r128SetTexBorderColor( r128TexObjPtr t, const GLfloat color[4] )
 {
+   GLubyte c[4];
+   CLAMPED_FLOAT_TO_UBYTE(c[0], color[0]);
+   CLAMPED_FLOAT_TO_UBYTE(c[1], color[1]);
+   CLAMPED_FLOAT_TO_UBYTE(c[2], color[2]);
+   CLAMPED_FLOAT_TO_UBYTE(c[3], color[3]);
    t->setup.tex_border_color = r128PackColor( 4, c[0], c[1], c[2], c[3] );
 }
 
@@ -165,7 +170,7 @@ static r128TexObjPtr r128AllocTexObj( struct gl_texture_object *texObj )
 
       r128SetTexWrap( t, texObj->WrapS, texObj->WrapT );
       r128SetTexFilter( t, texObj->MinFilter, texObj->MagFilter );
-      r128SetTexBorderColor( t, texObj->_BorderChan );
+      r128SetTexBorderColor( t, texObj->BorderColor );
    }
 
    return t;
@@ -531,7 +536,7 @@ static void r128TexParameter( GLcontext *ctx, GLenum target,
 
    case GL_TEXTURE_BORDER_COLOR:
       if ( t->base.bound ) FLUSH_BATCH( rmesa );
-      r128SetTexBorderColor( t, tObj->_BorderChan );
+      r128SetTexBorderColor( t, tObj->BorderColor );
       break;
 
    case GL_TEXTURE_BASE_LEVEL:
diff --git a/src/mesa/drivers/dri/r128/r128_tris.c b/src/mesa/drivers/dri/r128/r128_tris.c
index bcc9ffa651..5b91271d74 100644
--- a/src/mesa/drivers/dri/r128/r128_tris.c
+++ b/src/mesa/drivers/dri/r128/r128_tris.c
@@ -426,7 +426,7 @@ r128_fallback_point( r128ContextPtr rmesa,
 #define ANY_RASTER_FLAGS (DD_TRI_LIGHT_TWOSIDE|DD_TRI_OFFSET|DD_TRI_UNFILLED)
 #define _R128_NEW_RENDER_STATE (ANY_FALLBACK_FLAGS | ANY_RASTER_FLAGS)
 
-static void r128ChooseRenderState(GLcontext *ctx)
+void r128ChooseRenderState(GLcontext *ctx)
 {
    r128ContextPtr rmesa = R128_CONTEXT(ctx);
    GLuint flags = ctx->_TriangleCaps;
diff --git a/src/mesa/drivers/dri/r128/r128_tris.h b/src/mesa/drivers/dri/r128/r128_tris.h
index d90ca31534..c0667edb61 100644
--- a/src/mesa/drivers/dri/r128/r128_tris.h
+++ b/src/mesa/drivers/dri/r128/r128_tris.h
@@ -38,7 +38,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/mtypes.h"
 
 extern void r128InitTriFuncs( GLcontext *ctx );
-
+extern void r128ChooseRenderState( GLcontext *ctx );
 
 extern void r128Fallback( GLcontext *ctx, GLuint bit, GLboolean mode );
 #define FALLBACK( rmesa, bit, mode ) r128Fallback( rmesa->glCtx, bit, mode )
diff --git a/src/mesa/drivers/dri/r200/.gitignore b/src/mesa/drivers/dri/r200/.gitignore
deleted file mode 100644
index 3773d8ea73..0000000000
--- a/src/mesa/drivers/dri/r200/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-radeon_chipset.h
-radeon_screen.*
-server
diff --git a/src/mesa/drivers/dri/r200/Makefile b/src/mesa/drivers/dri/r200/Makefile
index e9144ac75c..42635bf9d9 100644
--- a/src/mesa/drivers/dri/r200/Makefile
+++ b/src/mesa/drivers/dri/r200/Makefile
@@ -3,6 +3,8 @@
 TOP = ../../../../..
 include $(TOP)/configs/current
 
+CFLAGS += $(RADEON_CFLAGS)
+
 LIBNAME = r200_dri.so
 
 MINIGLX_SOURCES = server/radeon_dri.c 
@@ -11,44 +13,52 @@ ifeq ($(USING_EGL), 1)
 EGL_SOURCES = server/radeon_egl.c
 endif
 
+ifeq ($(RADEON_LDFLAGS),)
+CS_SOURCES = radeon_cs_space_drm.c
+endif
+
+RADEON_COMMON_SOURCES = \
+	radeon_bo_legacy.c \
+	radeon_common_context.c \
+	radeon_common.c \
+	radeon_cs_legacy.c \
+	radeon_dma.c \
+	radeon_debug.c \
+	radeon_fbo.c \
+	radeon_lock.c \
+	radeon_mipmap_tree.c \
+	radeon_queryobj.c \
+	radeon_span.c \
+	radeon_texture.c
+
+
 DRIVER_SOURCES = r200_context.c \
 		 r200_ioctl.c \
-		 r200_lock.c \
 		 r200_state.c \
 		 r200_state_init.c \
 		 r200_cmdbuf.c \
 		 r200_pixel.c \
 		 r200_tex.c \
-		 r200_texmem.c \
 		 r200_texstate.c \
 		 r200_tcl.c \
 		 r200_swtcl.c \
-		 r200_span.c \
 		 r200_maos.c \
 		 r200_sanity.c \
 		 r200_fragshader.c \
 		 r200_vertprog.c \
 		 radeon_screen.c \
-		 $(EGL_SOURCES)
+		 $(EGL_SOURCES) \
+		 $(RADEON_COMMON_SOURCES) \
+		 $(CS_SOURCES)
 
 C_SOURCES = $(COMMON_SOURCES) $(DRIVER_SOURCES)
 
 X86_SOURCES = 
 
-DRIVER_DEFINES = -DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R200
-
-SYMLINKS = \
-	server/radeon_egl.c  \
-	server/radeon_dri.c  \
-	server/radeon_dri.h \
-	server/radeon.h \
-	server/radeon_macros.h \
-	server/radeon_reg.h
+DRIVER_DEFINES = -DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R200 \
+				 -Wall
 
-COMMON_SYMLINKS = \
-	radeon_chipset.h \
-	radeon_screen.c \
-	radeon_screen.h
+DRI_LIB_DEPS += $(RADEON_LDFLAGS)
 
 ##### TARGETS #####
 
@@ -57,15 +67,4 @@ include ../Makefile.template
 
 #INCLUDES += -I../radeon/server
 
-server:
-	mkdir -p server
-
-$(SYMLINKS): server
-	@[ -e $@ ] || ln -sf ../../radeon/$@ server/
-
-
-$(COMMON_SYMLINKS):
-	@[ -e $@ ] || ln -sf ../radeon/$@ ./
-
-symlinks: $(SYMLINKS) $(COMMON_SYMLINKS)
-
+symlinks:
diff --git a/src/mesa/drivers/dri/r200/r200_cmdbuf.c b/src/mesa/drivers/dri/r200/r200_cmdbuf.c
index e1633772a1..1d1bea6f5f 100644
--- a/src/mesa/drivers/dri/r200/r200_cmdbuf.c
+++ b/src/mesa/drivers/dri/r200/r200_cmdbuf.c
@@ -38,6 +38,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "swrast/swrast.h"
 #include "main/simple_list.h"
 
+#include "radeon_common.h"
 #include "r200_context.h"
 #include "r200_state.h"
 #include "r200_ioctl.h"
@@ -45,160 +46,71 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_sanity.h"
 #include "radeon_reg.h"
 
-static void print_state_atom( struct r200_state_atom *state )
-{
-   int i;
-
-   fprintf(stderr, "emit %s/%d\n", state->name, state->cmd_size);
-
-   if (0 & R200_DEBUG & DEBUG_VERBOSE) 
-      for (i = 0 ; i < state->cmd_size ; i++) 
-	 fprintf(stderr, "\t%s[%d]: %x\n", state->name, i, state->cmd[i]);
-
-}
-
 /* The state atoms will be emitted in the order they appear in the atom list,
  * so this step is important.
  */
+#define insert_at_tail_if(atom_list, atom) \
+   do { \
+      struct radeon_state_atom* __atom = (atom); \
+      if (__atom->check) \
+	 insert_at_tail((atom_list), __atom); \
+   } while(0)
+
 void r200SetUpAtomList( r200ContextPtr rmesa )
 {
    int i, mtu;
 
-   mtu = rmesa->glCtx->Const.MaxTextureUnits;
-
-   make_empty_list(&rmesa->hw.atomlist);
-   rmesa->hw.atomlist.name = "atom-list";
-
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.ctx );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.set );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.lin );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.msk );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpt );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vtx );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vap );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vte );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.msc );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.cst );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.zbs );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tcl );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.msl );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tcg );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.grd );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.fog );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tam );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tf );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.atf );
+   mtu = rmesa->radeon.glCtx->Const.MaxTextureUnits;
+
+   make_empty_list(&rmesa->radeon.hw.atomlist);
+   rmesa->radeon.hw.atomlist.name = "atom-list";
+
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.ctx );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.set );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.lin );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.msk );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.vpt );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.vtx );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.vap );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.vte );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.msc );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.cst );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.zbs );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.tcl );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.msl );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.tcg );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.grd );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.fog );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.tam );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.tf );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.atf );
    for (i = 0; i < mtu; ++i)
-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tex[i] );
+       insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.tex[i] );
    for (i = 0; i < mtu; ++i)
-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.cube[i] );
+       insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.cube[i] );
    for (i = 0; i < 6; ++i)
-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.pix[i] );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.afs[0] );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.afs[1] );
+       insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.pix[i] );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.afs[0] );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.afs[1] );
    for (i = 0; i < 8; ++i)
-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.lit[i] );
+       insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.lit[i] );
    for (i = 0; i < 3 + mtu; ++i)
-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.mat[i] );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.eye );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.glt );
+       insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.mat[i] );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.eye );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.glt );
    for (i = 0; i < 2; ++i)
-      insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.mtl[i] );
+      insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.mtl[i] );
    for (i = 0; i < 6; ++i)
-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.ucp[i] );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.spr );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.ptp );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.prf );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.pvs );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpp[0] );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpp[1] );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpi[0] );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpi[1] );
-}
-
-static void r200SaveHwState( r200ContextPtr rmesa )
-{
-   struct r200_state_atom *atom;
-   char * dest = rmesa->backup_store.cmd_buf;
-
-   if (R200_DEBUG & DEBUG_STATE)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   rmesa->backup_store.cmd_used = 0;
-
-   foreach( atom, &rmesa->hw.atomlist ) {
-      if ( atom->check( rmesa->glCtx, atom->idx ) ) {
-	 int size = atom->cmd_size * 4;
-	 memcpy( dest, atom->cmd, size);
-	 dest += size;
-	 rmesa->backup_store.cmd_used += size;
-	 if (R200_DEBUG & DEBUG_STATE)
-	    print_state_atom( atom );
-      }
-   }
-
-   assert( rmesa->backup_store.cmd_used <= R200_CMD_BUF_SZ );
-   if (R200_DEBUG & DEBUG_STATE)
-      fprintf(stderr, "Returning to r200EmitState\n");
-}
-
-void r200EmitState( r200ContextPtr rmesa )
-{
-   char *dest;
-   int mtu;
-   struct r200_state_atom *atom;
-
-   if (R200_DEBUG & (DEBUG_STATE|DEBUG_PRIMS))
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   if (rmesa->save_on_next_emit) {
-      r200SaveHwState(rmesa);
-      rmesa->save_on_next_emit = GL_FALSE;
-   }
-
-   if (!rmesa->hw.is_dirty && !rmesa->hw.all_dirty)
-      return;
-
-   mtu = rmesa->glCtx->Const.MaxTextureUnits;
-
-   /* To avoid going across the entire set of states multiple times, just check
-    * for enough space for the case of emitting all state, and inline the
-    * r200AllocCmdBuf code here without all the checks.
-    */
-   r200EnsureCmdBufSpace( rmesa, rmesa->hw.max_state_size );
-
-   /* we need to calculate dest after EnsureCmdBufSpace
-      as we may flush the buffer - airlied */
-   dest = rmesa->store.cmd_buf + rmesa->store.cmd_used;
-   if (R200_DEBUG & DEBUG_STATE) {
-      foreach( atom, &rmesa->hw.atomlist ) {
-	 if ( atom->dirty || rmesa->hw.all_dirty ) {
-	    if ( atom->check( rmesa->glCtx, atom->idx ) )
-	       print_state_atom( atom );
-	    else
-	       fprintf(stderr, "skip state %s\n", atom->name);
-	 }
-      }
-   }
-
-   foreach( atom, &rmesa->hw.atomlist ) {
-      if ( rmesa->hw.all_dirty )
-	 atom->dirty = GL_TRUE;
-      if ( atom->dirty ) {
-	 if ( atom->check( rmesa->glCtx, atom->idx ) ) {
-	    int size = atom->cmd_size * 4;
-	    memcpy( dest, atom->cmd, size);
-	    dest += size;
-	    rmesa->store.cmd_used += size;
-	    atom->dirty = GL_FALSE;
-	 }
-      }
-   }
-
-   assert( rmesa->store.cmd_used <= R200_CMD_BUF_SZ );
-
-   rmesa->hw.is_dirty = GL_FALSE;
-   rmesa->hw.all_dirty = GL_FALSE;
+       insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.ucp[i] );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.spr );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.ptp );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.prf );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.pvs );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.vpp[0] );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.vpp[1] );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.vpi[0] );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.vpi[1] );
+   insert_at_tail_if( &rmesa->radeon.hw.atomlist, &rmesa->hw.sci );
 }
 
 /* Fire a section of the retained (indexed_verts) buffer as a regular
@@ -208,51 +120,81 @@ void r200EmitVbufPrim( r200ContextPtr rmesa,
                        GLuint primitive,
                        GLuint vertex_nr )
 {
-   drm_radeon_cmd_header_t *cmd;
+   BATCH_LOCALS(&rmesa->radeon);
 
    assert(!(primitive & R200_VF_PRIM_WALK_IND));
    
-   r200EmitState( rmesa );
-   
-   if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_PRIMS))
-      fprintf(stderr, "%s cmd_used/4: %d prim %x nr %d\n", __FUNCTION__,
-	      rmesa->store.cmd_used/4, primitive, vertex_nr);
+   radeonEmitState(&rmesa->radeon);
    
-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, VBUF_BUFSZ,
-						  __FUNCTION__ );
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
-   cmd[1].i = R200_CP_CMD_3D_DRAW_VBUF_2;
-   cmd[2].i = (primitive | 
-	       R200_VF_PRIM_WALK_LIST |
-	       R200_VF_COLOR_ORDER_RGBA |
-	       (vertex_nr << R200_VF_VERTEX_NUMBER_SHIFT));
+   radeon_print(RADEON_RENDER|RADEON_SWRENDER,RADEON_VERBOSE,
+           "%s cmd_used/4: %d prim %x nr %d\n", __FUNCTION__,
+           rmesa->store.cmd_used/4, primitive, vertex_nr);
+ 
+   BEGIN_BATCH(3);
+   OUT_BATCH_PACKET3_CLIP(R200_CP_CMD_3D_DRAW_VBUF_2, 0);
+   OUT_BATCH(primitive | R200_VF_PRIM_WALK_LIST | R200_VF_COLOR_ORDER_RGBA |
+	     (vertex_nr << R200_VF_VERTEX_NUMBER_SHIFT));
+   END_BATCH();
 }
 
+static void r200FireEB(r200ContextPtr rmesa, int vertex_count, int type)
+{
+	BATCH_LOCALS(&rmesa->radeon);
+
+	if (vertex_count > 0) {
+		BEGIN_BATCH(8+2);
+		OUT_BATCH_PACKET3_CLIP(R200_CP_CMD_3D_DRAW_INDX_2, 0);
+		OUT_BATCH(R200_VF_PRIM_WALK_IND |
+			  R200_VF_COLOR_ORDER_RGBA | 
+			  ((vertex_count + 0) << 16) |
+			  type);
+		
+		if (!rmesa->radeon.radeonScreen->kernel_mm) {
+			OUT_BATCH_PACKET3(R200_CP_CMD_INDX_BUFFER, 2);
+			OUT_BATCH((0x80 << 24) | (0 << 16) | 0x810);
+			OUT_BATCH_RELOC(rmesa->radeon.tcl.elt_dma_offset,
+					rmesa->radeon.tcl.elt_dma_bo,
+					rmesa->radeon.tcl.elt_dma_offset,
+					RADEON_GEM_DOMAIN_GTT, 0, 0);
+			OUT_BATCH((vertex_count + 1)/2);
+		} else {
+			OUT_BATCH_PACKET3(R200_CP_CMD_INDX_BUFFER, 2);
+			OUT_BATCH((0x80 << 24) | (0 << 16) | 0x810);
+			OUT_BATCH(rmesa->radeon.tcl.elt_dma_offset);
+			OUT_BATCH((vertex_count + 1)/2);
+			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+					      rmesa->radeon.tcl.elt_dma_bo,
+					      RADEON_GEM_DOMAIN_GTT, 0, 0);
+		}
+		END_BATCH();
+	}
+}
 
-void r200FlushElts( r200ContextPtr rmesa )
+void r200FlushElts(GLcontext *ctx)
 {
-   int *cmd = (int *)(rmesa->store.cmd_buf + rmesa->store.elts_start);
-   int dwords;
-   int nr = (rmesa->store.cmd_used - (rmesa->store.elts_start + 12)) / 2;
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   int nr, elt_used = rmesa->tcl.elt_used;
+
+   radeon_print(RADEON_RENDER, RADEON_VERBOSE, "%s %x %d\n", __FUNCTION__, rmesa->tcl.hw_primitive, elt_used);
+
+   assert( rmesa->radeon.dma.flush == r200FlushElts );
+   rmesa->radeon.dma.flush = NULL;
 
-   if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_PRIMS))
-      fprintf(stderr, "%s\n", __FUNCTION__);
+   nr = elt_used / 2;
 
-   assert( rmesa->dma.flush == r200FlushElts );
-   rmesa->dma.flush = NULL;
+   radeon_bo_unmap(rmesa->radeon.tcl.elt_dma_bo);
 
-   /* Cope with odd number of elts:
-    */
-   rmesa->store.cmd_used = (rmesa->store.cmd_used + 2) & ~2;
-   dwords = (rmesa->store.cmd_used - rmesa->store.elts_start) / 4;
+   r200FireEB(rmesa, nr, rmesa->tcl.hw_primitive);
 
-   cmd[1] |= (dwords - 3) << 16;
-   cmd[2] |= nr << R200_VF_VERTEX_NUMBER_SHIFT;
+   radeon_bo_unref(rmesa->radeon.tcl.elt_dma_bo);
+   rmesa->radeon.tcl.elt_dma_bo = NULL;
 
-   if (R200_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
-      r200Finish( rmesa->glCtx );
+   if (R200_ELT_BUF_SZ > elt_used)
+     radeonReturnDmaRegion(&rmesa->radeon, R200_ELT_BUF_SZ - elt_used);
+
+   if (radeon_is_debug_enabled(RADEON_SYNC, RADEON_CRITICAL)) {
+      radeon_print(RADEON_SYNC, RADEON_NORMAL, "%s: Syncing\n", __FUNCTION__);
+      radeonFinish( rmesa->radeon.glCtx );
    }
 }
 
@@ -261,168 +203,153 @@ GLushort *r200AllocEltsOpenEnded( r200ContextPtr rmesa,
 				    GLuint primitive,
 				    GLuint min_nr )
 {
-   drm_radeon_cmd_header_t *cmd;
    GLushort *retval;
 
-   if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s %d prim %x\n", __FUNCTION__, min_nr, primitive);
+   radeon_print(RADEON_RENDER, RADEON_VERBOSE, "%s %d prim %x\n", __FUNCTION__, min_nr, primitive);
 
    assert((primitive & R200_VF_PRIM_WALK_IND));
    
-   r200EmitState( rmesa );
-   
-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, ELTS_BUFSZ(min_nr),
-						__FUNCTION__ );
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
-   cmd[1].i = R200_CP_CMD_3D_DRAW_INDX_2;
-   cmd[2].i = (primitive | 
-	       R200_VF_PRIM_WALK_IND |
-	       R200_VF_COLOR_ORDER_RGBA);
-
-   
-   retval = (GLushort *)(cmd+3);
+   radeonEmitState(&rmesa->radeon);
 
-   if (R200_DEBUG & DEBUG_PRIMS)
-      fprintf(stderr, "%s: header 0x%x prim %x \n",
-	      __FUNCTION__,
-	      cmd[1].i, primitive);
+   radeonAllocDmaRegion(&rmesa->radeon, &rmesa->radeon.tcl.elt_dma_bo,
+			&rmesa->radeon.tcl.elt_dma_offset, R200_ELT_BUF_SZ, 4);
+   rmesa->tcl.elt_used = min_nr * 2;
 
-   assert(!rmesa->dma.flush);
-   rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-   rmesa->dma.flush = r200FlushElts;
-
-   rmesa->store.elts_start = ((char *)cmd) - rmesa->store.cmd_buf;
+   radeon_bo_map(rmesa->radeon.tcl.elt_dma_bo, 1);
+   retval = rmesa->radeon.tcl.elt_dma_bo->ptr + rmesa->radeon.tcl.elt_dma_offset;
+   
+   assert(!rmesa->radeon.dma.flush);
+   rmesa->radeon.glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+   rmesa->radeon.dma.flush = r200FlushElts;
 
    return retval;
 }
 
+void r200EmitMaxVtxIndex(r200ContextPtr rmesa, int count)
+{
+   BATCH_LOCALS(&rmesa->radeon);
 
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+	   BEGIN_BATCH_NO_AUTOSTATE(2);
+	   OUT_BATCH(CP_PACKET0(R200_SE_VF_MAX_VTX_INDX, 0));
+	   OUT_BATCH(count);
+	   END_BATCH();
+   }
+}
 
 void r200EmitVertexAOS( r200ContextPtr rmesa,
-			  GLuint vertex_size,
-			  GLuint offset )
+			GLuint vertex_size,
+ 			struct radeon_bo *bo,
+			GLuint offset )
 {
-   drm_radeon_cmd_header_t *cmd;
+   BATCH_LOCALS(&rmesa->radeon);
 
-   if (R200_DEBUG & (DEBUG_PRIMS|DEBUG_IOCTL))
-      fprintf(stderr, "%s:  vertex_size 0x%x offset 0x%x \n",
+   radeon_print(RADEON_SWRENDER, RADEON_VERBOSE, "%s:  vertex_size 0x%x offset 0x%x \n",
 	      __FUNCTION__, vertex_size, offset);
 
-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, VERT_AOS_BUFSZ,
-						  __FUNCTION__ );
 
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
-   cmd[1].i = R200_CP_CMD_3D_LOAD_VBPNTR | (2 << 16);
-   cmd[2].i = 1;
-   cmd[3].i = vertex_size | (vertex_size << 8);
-   cmd[4].i = offset;
+   BEGIN_BATCH(7);
+   OUT_BATCH_PACKET3(R200_CP_CMD_3D_LOAD_VBPNTR, 2);
+   OUT_BATCH(1);
+   OUT_BATCH(vertex_size | (vertex_size << 8));
+   OUT_BATCH_RELOC(offset, bo, offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
+   END_BATCH();
 }
-		       
 
-void r200EmitAOS( r200ContextPtr rmesa,
-		    struct r200_dma_region **component,
-		    GLuint nr,
-		    GLuint offset )
+void r200EmitAOS(r200ContextPtr rmesa, GLuint nr, GLuint offset)
 {
-   drm_radeon_cmd_header_t *cmd;
-   int sz = AOS_BUFSZ(nr);
+   BATCH_LOCALS(&rmesa->radeon);
+   uint32_t voffset;
+   int sz = 1 + (nr >> 1) * 3 + (nr & 1) * 2;
    int i;
-   int *tmp;
-
-   if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s nr arrays: %d\n", __FUNCTION__, nr);
-
-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, sz, __FUNCTION__ );
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
-   cmd[1].i = R200_CP_CMD_3D_LOAD_VBPNTR | (((sz / sizeof(int)) - 3) << 16);
-   cmd[2].i = nr;
-   tmp = &cmd[0].i;
-   cmd += 3;
-
-   for (i = 0 ; i < nr ; i++) {
-      if (i & 1) {
-	 cmd[0].i |= ((component[i]->aos_stride << 24) | 
-		      (component[i]->aos_size << 16));
-	 cmd[2].i = (component[i]->aos_start + 
-		     offset * component[i]->aos_stride * 4);
-	 cmd += 3;
+   
+   radeon_print(RADEON_RENDER, RADEON_VERBOSE,
+           "%s: nr=%d, ofs=0x%08x\n",
+           __FUNCTION__, nr, offset);
+
+   BEGIN_BATCH(sz+2+ (nr*2));
+   OUT_BATCH_PACKET3(R200_CP_CMD_3D_LOAD_VBPNTR, sz - 1);
+   OUT_BATCH(nr);
+
+    
+   if (!rmesa->radeon.radeonScreen->kernel_mm) {
+      for (i = 0; i + 1 < nr; i += 2) {
+	 OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
+		   (rmesa->radeon.tcl.aos[i].stride << 8) |
+		   (rmesa->radeon.tcl.aos[i + 1].components << 16) |
+		   (rmesa->radeon.tcl.aos[i + 1].stride << 24));
+			
+	 voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+	 OUT_BATCH_RELOC(voffset,
+			 rmesa->radeon.tcl.aos[i].bo,
+			 voffset,
+			 RADEON_GEM_DOMAIN_GTT,
+			 0, 0);
+	 voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+	 OUT_BATCH_RELOC(voffset,
+			 rmesa->radeon.tcl.aos[i+1].bo,
+			 voffset,
+			 RADEON_GEM_DOMAIN_GTT,
+			 0, 0);
       }
-      else {
-	 cmd[0].i = ((component[i]->aos_stride << 8) | 
-		     (component[i]->aos_size << 0));
-	 cmd[1].i = (component[i]->aos_start + 
-		     offset * component[i]->aos_stride * 4);
+      
+      if (nr & 1) {
+	 OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
+		   (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
+	 voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+	 OUT_BATCH_RELOC(voffset,
+			 rmesa->radeon.tcl.aos[nr - 1].bo,
+			 voffset,
+			 RADEON_GEM_DOMAIN_GTT,
+			 0, 0);
+      }
+   } else {
+      for (i = 0; i + 1 < nr; i += 2) {
+	 OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
+		   (rmesa->radeon.tcl.aos[i].stride << 8) |
+		   (rmesa->radeon.tcl.aos[i + 1].components << 16) |
+		   (rmesa->radeon.tcl.aos[i + 1].stride << 24));
+	 
+	 voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+	 OUT_BATCH(voffset);
+	 voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+	 OUT_BATCH(voffset);
+      }
+      
+      if (nr & 1) {
+	 OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
+		   (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
+	 voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+	 OUT_BATCH(voffset);
+      }
+      for (i = 0; i + 1 < nr; i += 2) {
+	 voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+			       rmesa->radeon.tcl.aos[i+0].bo,
+			       RADEON_GEM_DOMAIN_GTT,
+			       0, 0);
+	 voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+			       rmesa->radeon.tcl.aos[i+1].bo,
+			       RADEON_GEM_DOMAIN_GTT,
+			       0, 0);
+      }
+      if (nr & 1) {
+	 voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+			       rmesa->radeon.tcl.aos[nr-1].bo,
+			       RADEON_GEM_DOMAIN_GTT,
+			       0, 0);
       }
    }
-
-   if (R200_DEBUG & DEBUG_VERTS) {
-      fprintf(stderr, "%s:\n", __FUNCTION__);
-      for (i = 0 ; i < sz ; i++)
-	 fprintf(stderr, "   %d: %x\n", i, tmp[i]);
-   }
-}
-
-void r200EmitBlit( r200ContextPtr rmesa,
-		   GLuint color_fmt,
-		   GLuint src_pitch,
-		   GLuint src_offset,
-		   GLuint dst_pitch,
-		   GLuint dst_offset,
-		   GLint srcx, GLint srcy,
-		   GLint dstx, GLint dsty,
-		   GLuint w, GLuint h )
-{
-   drm_radeon_cmd_header_t *cmd;
-
-   if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s src %x/%x %d,%d dst: %x/%x %d,%d sz: %dx%d\n",
-	      __FUNCTION__, 
-	      src_pitch, src_offset, srcx, srcy,
-	      dst_pitch, dst_offset, dstx, dsty,
-	      w, h);
-
-   assert( (src_pitch & 63) == 0 );
-   assert( (dst_pitch & 63) == 0 );
-   assert( (src_offset & 1023) == 0 );
-   assert( (dst_offset & 1023) == 0 );
-   assert( w < (1<<16) );
-   assert( h < (1<<16) );
-
-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, 8 * sizeof(int),
-						  __FUNCTION__ );
-
-
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
-   cmd[1].i = R200_CP_CMD_BITBLT_MULTI | (5 << 16);
-   cmd[2].i = (RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
-	       RADEON_GMC_DST_PITCH_OFFSET_CNTL |
-	       RADEON_GMC_BRUSH_NONE |
-	       (color_fmt << 8) |
-	       RADEON_GMC_SRC_DATATYPE_COLOR |
-	       RADEON_ROP3_S |
-	       RADEON_DP_SRC_SOURCE_MEMORY |
-	       RADEON_GMC_CLR_CMP_CNTL_DIS |
-	       RADEON_GMC_WR_MSK_DIS );
-
-   cmd[3].i = ((src_pitch/64)<<22) | (src_offset >> 10);
-   cmd[4].i = ((dst_pitch/64)<<22) | (dst_offset >> 10);
-   cmd[5].i = (srcx << 16) | srcy;
-   cmd[6].i = (dstx << 16) | dsty; /* dst */
-   cmd[7].i = (w << 16) | h;
-}
-
-
-void r200EmitWait( r200ContextPtr rmesa, GLuint flags )
-{
-   drm_radeon_cmd_header_t *cmd;
-
-   assert( !(flags & ~(RADEON_WAIT_2D|RADEON_WAIT_3D)) );
-
-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, 1 * sizeof(int),
-					   __FUNCTION__ );
-   cmd[0].i = 0;
-   cmd[0].wait.cmd_type = RADEON_CMD_WAIT;
-   cmd[0].wait.flags = flags;
+   END_BATCH();
 }
diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c
index 5531e0a739..3ddb5bf7d6 100644
--- a/src/mesa/drivers/dri/r200/r200_context.c
+++ b/src/mesa/drivers/dri/r200/r200_context.c
@@ -54,27 +54,27 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_context.h"
 #include "r200_ioctl.h"
 #include "r200_state.h"
-#include "r200_span.h"
 #include "r200_pixel.h"
 #include "r200_tex.h"
 #include "r200_swtcl.h"
 #include "r200_tcl.h"
 #include "r200_maos.h"
 #include "r200_vertprog.h"
+#include "radeon_queryobj.h"
 
-#define need_GL_ARB_multisample
-#define need_GL_ARB_texture_compression
-#define need_GL_ARB_vertex_buffer_object
+#include "radeon_span.h"
+
+#define need_GL_ARB_occlusion_query
 #define need_GL_ARB_vertex_program
 #define need_GL_ATI_fragment_shader
 #define need_GL_EXT_blend_minmax
 #define need_GL_EXT_fog_coord
-#define need_GL_EXT_multi_draw_arrays
 #define need_GL_EXT_secondary_color
 #define need_GL_EXT_blend_equation_separate
 #define need_GL_EXT_blend_func_separate
 #define need_GL_NV_vertex_program
 #define need_GL_ARB_point_parameters
+#define need_GL_EXT_framebuffer_object
 #include "extension_helper.h"
 
 #define DRIVER_DATE	"20060602"
@@ -82,9 +82,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "vblank.h"
 #include "utils.h"
 #include "xmlpool.h" /* for symbolic values of enum-type options */
-#ifndef R200_DEBUG
-int R200_DEBUG = (0);
-#endif
 
 /* Return various strings for glGetString().
  */
@@ -93,8 +90,8 @@ static const GLubyte *r200GetString( GLcontext *ctx, GLenum name )
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    static char buffer[128];
    unsigned   offset;
-   GLuint agp_mode = (rmesa->r200Screen->card_type == RADEON_CARD_PCI)? 0 :
-      rmesa->r200Screen->AGPMode;
+   GLuint agp_mode = (rmesa->radeon.radeonScreen->card_type == RADEON_CARD_PCI)? 0 :
+      rmesa->radeon.radeonScreen->AGPMode;
 
    switch ( name ) {
    case GL_VENDOR:
@@ -105,7 +102,7 @@ static const GLubyte *r200GetString( GLcontext *ctx, GLenum name )
 				     agp_mode );
 
       sprintf( & buffer[ offset ], " %sTCL",
-	       !(rmesa->TclFallback & R200_TCL_FALLBACK_TCL_DISABLE)
+	       !(rmesa->radeon.TclFallback & R200_TCL_FALLBACK_TCL_DISABLE)
 	       ? "" : "NO-" );
 
       return (GLubyte *)buffer;
@@ -120,20 +117,18 @@ static const GLubyte *r200GetString( GLcontext *ctx, GLenum name )
  */
 const struct dri_extension card_extensions[] =
 {
-    { "GL_ARB_multisample",                GL_ARB_multisample_functions },
     { "GL_ARB_multitexture",               NULL },
+    { "GL_ARB_occlusion_query",		   GL_ARB_occlusion_query_functions},
     { "GL_ARB_texture_border_clamp",       NULL },
-    { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
     { "GL_ARB_texture_env_add",            NULL },
     { "GL_ARB_texture_env_combine",        NULL },
     { "GL_ARB_texture_env_dot3",           NULL },
     { "GL_ARB_texture_env_crossbar",       NULL },
     { "GL_ARB_texture_mirrored_repeat",    NULL },
-    { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
     { "GL_EXT_blend_minmax",               GL_EXT_blend_minmax_functions },
     { "GL_EXT_blend_subtract",             NULL },
     { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
-    { "GL_EXT_multi_draw_arrays",          GL_EXT_multi_draw_arrays_functions },
+    { "GL_EXT_packed_depth_stencil",	   NULL},
     { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
     { "GL_EXT_stencil_wrap",               NULL },
     { "GL_EXT_texture_edge_clamp",         NULL },
@@ -175,6 +170,11 @@ const struct dri_extension point_extensions[] = {
     { NULL,                                NULL }
 };
 
+const struct dri_extension mm_extensions[] = {
+  { "GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions },
+  { NULL, NULL }
+};
+
 extern const struct tnl_pipeline_stage _r200_render_stage;
 extern const struct tnl_pipeline_stage _r200_tcl_stage;
 
@@ -221,26 +221,54 @@ static void r200InitDriverFuncs( struct dd_function_table *functions )
     functions->GetString		= r200GetString;
 }
 
-static const struct dri_debug_control debug_control[] =
+
+static void r200_get_lock(radeonContextPtr radeon)
 {
-    { "fall",  DEBUG_FALLBACKS },
-    { "tex",   DEBUG_TEXTURE },
-    { "ioctl", DEBUG_IOCTL },
-    { "prim",  DEBUG_PRIMS },
-    { "vert",  DEBUG_VERTS },
-    { "state", DEBUG_STATE },
-    { "code",  DEBUG_CODEGEN },
-    { "vfmt",  DEBUG_VFMT },
-    { "vtxf",  DEBUG_VFMT },
-    { "verb",  DEBUG_VERBOSE },
-    { "dri",   DEBUG_DRI },
-    { "dma",   DEBUG_DMA },
-    { "san",   DEBUG_SANITY },
-    { "sync",  DEBUG_SYNC },
-    { "pix",   DEBUG_PIXEL },
-    { "mem",   DEBUG_MEMORY },
-    { NULL,    0 }
-};
+   r200ContextPtr rmesa = (r200ContextPtr)radeon;
+   drm_radeon_sarea_t *sarea = radeon->sarea;
+
+   R200_STATECHANGE( rmesa, ctx );
+   if (rmesa->radeon.sarea->tiling_enabled) {
+      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
+   }
+   else rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] &= ~R200_COLOR_TILE_ENABLE;
+
+   if ( sarea->ctx_owner != rmesa->radeon.dri.hwContext ) {
+      sarea->ctx_owner = rmesa->radeon.dri.hwContext;
+      if (!radeon->radeonScreen->kernel_mm)
+         radeon_bo_legacy_texture_age(radeon->radeonScreen->bom);
+   }
+
+}
+
+static void r200_vtbl_emit_cs_header(struct radeon_cs *cs, radeonContextPtr rmesa)
+{
+}
+
+static void r200_emit_query_finish(radeonContextPtr radeon)
+{
+   BATCH_LOCALS(radeon);
+   struct radeon_query_object *query = radeon->query.current;
+
+   BEGIN_BATCH_NO_AUTOSTATE(4);
+   OUT_BATCH(CP_PACKET0(RADEON_RB3D_ZPASS_ADDR, 0));
+   OUT_BATCH_RELOC(0, query->bo, query->curr_offset, 0, RADEON_GEM_DOMAIN_GTT, 0);
+   END_BATCH();
+   query->curr_offset += sizeof(uint32_t);
+   assert(query->curr_offset < RADEON_QUERY_PAGE_SIZE);
+   query->emitted_begin = GL_FALSE;
+}
+
+static void r200_init_vtbl(radeonContextPtr radeon)
+{
+   radeon->vtbl.get_lock = r200_get_lock;
+   radeon->vtbl.update_viewport_offset = r200UpdateViewportOffset;
+   radeon->vtbl.emit_cs_header = r200_vtbl_emit_cs_header;
+   radeon->vtbl.swtcl_flush = r200_swtcl_flush;
+   radeon->vtbl.fallback = r200Fallback;
+   radeon->vtbl.update_scissor = r200_vtbl_update_scissor;
+   radeon->vtbl.emit_query_finish = r200_emit_query_finish;
+}
 
 
 /* Create the device specific rendering context.
@@ -253,9 +281,9 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    radeonScreenPtr screen = (radeonScreenPtr)(sPriv->private);
    struct dd_function_table functions;
    r200ContextPtr rmesa;
-   GLcontext *ctx, *shareCtx;
+   GLcontext *ctx;
    int i;
-   int tcl_mode, fthrottle_mode;
+   int tcl_mode;
 
    assert(glVisual);
    assert(driContextPriv);
@@ -265,7 +293,8 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    rmesa = (r200ContextPtr) CALLOC( sizeof(*rmesa) );
    if ( !rmesa )
       return GL_FALSE;
-      
+
+   r200_init_vtbl(&rmesa->radeon);
    /* init exp fog table data */
    r200InitStaticFogData();
 
@@ -273,12 +302,13 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
     * Do this here so that initialMaxAnisotropy is set before we create
     * the default textures.
     */
-   driParseConfigFiles (&rmesa->optionCache, &screen->optionCache,
+   driParseConfigFiles (&rmesa->radeon.optionCache, &screen->optionCache,
 			screen->driScreen->myNum, "r200");
-   rmesa->initialMaxAnisotropy = driQueryOptionf(&rmesa->optionCache,
-                                                 "def_max_anisotropy");
+   rmesa->radeon.initialMaxAnisotropy = driQueryOptionf(&rmesa->radeon.optionCache,
+							"def_max_anisotropy");
 
-   if ( driQueryOptionb( &rmesa->optionCache, "hyperz" ) ) {
+   if ( sPriv->drm_version.major == 1
+       && driQueryOptionb( &rmesa->radeon.optionCache, "hyperz" ) ) {
       if ( sPriv->drm_version.minor < 13 )
 	 fprintf( stderr, "DRM version 1.%d too old to support HyperZ, "
 			  "disabling.\n", sPriv->drm_version.minor );
@@ -295,63 +325,20 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    _mesa_init_driver_functions(&functions);
    r200InitDriverFuncs(&functions);
    r200InitIoctlFuncs(&functions);
-   r200InitStateFuncs(&functions);
+   r200InitStateFuncs(&functions, screen->kernel_mm);
    r200InitTextureFuncs(&functions);
    r200InitShaderFuncs(&functions); 
+   radeonInitQueryObjFunctions(&functions);
 
-   /* Allocate and initialize the Mesa context */
-   if (sharedContextPrivate)
-      shareCtx = ((r200ContextPtr) sharedContextPrivate)->glCtx;
-   else
-      shareCtx = NULL;
-   rmesa->glCtx = _mesa_create_context(glVisual, shareCtx,
-                                       &functions, (void *) rmesa);
-   if (!rmesa->glCtx) {
-      FREE(rmesa);
-      return GL_FALSE;
+   if (!radeonInitContext(&rmesa->radeon, &functions,
+			  glVisual, driContextPriv,
+			  sharedContextPrivate)) {
+     FREE(rmesa);
+     return GL_FALSE;
    }
-   driContextPriv->driverPrivate = rmesa;
-
-   /* Init r200 context data */
-   rmesa->dri.context = driContextPriv;
-   rmesa->dri.screen = sPriv;
-   rmesa->dri.drawable = NULL; /* Set by XMesaMakeCurrent */
-   rmesa->dri.hwContext = driContextPriv->hHWContext;
-   rmesa->dri.hwLock = &sPriv->pSAREA->lock;
-   rmesa->dri.fd = sPriv->fd;
-   rmesa->dri.drmMinor = sPriv->drm_version.minor;
-
-   rmesa->r200Screen = screen;
-   rmesa->sarea = (drm_radeon_sarea_t *)((GLubyte *)sPriv->pSAREA +
-				       screen->sarea_priv_offset);
-
-
-   rmesa->dma.buf0_address = rmesa->r200Screen->buffers->list[0].address;
-
-   (void) memset( rmesa->texture_heaps, 0, sizeof( rmesa->texture_heaps ) );
-   make_empty_list( & rmesa->swapped );
-
-   rmesa->nr_heaps = 1 /* screen->numTexHeaps */ ;
-   assert(rmesa->nr_heaps < RADEON_NR_TEX_HEAPS);
-   for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
-      rmesa->texture_heaps[i] = driCreateTextureHeap( i, rmesa,
-	    screen->texSize[i],
-	    12,
-	    RADEON_NR_TEX_REGIONS,
-	    (drmTextureRegionPtr)rmesa->sarea->tex_list[i],
-	    & rmesa->sarea->tex_age[i],
-	    & rmesa->swapped,
-	    sizeof( r200TexObj ),
-	    (destroy_texture_object_t *) r200DestroyTexObj );
-   }
-   rmesa->texture_depth = driQueryOptioni (&rmesa->optionCache,
-					   "texture_depth");
-   if (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
-      rmesa->texture_depth = ( screen->cpp == 4 ) ?
-	 DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
 
-   rmesa->swtcl.RenderIndex = ~0;
-   rmesa->hw.all_dirty = 1;
+   rmesa->radeon.swtcl.RenderIndex = ~0;
+   rmesa->radeon.hw.all_dirty = 1;
 
    /* Set the maximum texture size small enough that we can guarentee that
     * all texture units can bind a maximal texture and have all of them in
@@ -359,29 +346,20 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
     * setting allow larger textures.
     */
 
-   ctx = rmesa->glCtx;
-   ctx->Const.MaxTextureUnits = driQueryOptioni (&rmesa->optionCache,
+   ctx = rmesa->radeon.glCtx;
+   ctx->Const.MaxTextureUnits = driQueryOptioni (&rmesa->radeon.optionCache,
 						 "texture_units");
    ctx->Const.MaxTextureImageUnits = ctx->Const.MaxTextureUnits;
    ctx->Const.MaxTextureCoordUnits = ctx->Const.MaxTextureUnits;
 
-   i = driQueryOptioni( &rmesa->optionCache, "allow_large_textures");
-
-   driCalculateMaxTextureLevels( rmesa->texture_heaps,
-				 rmesa->nr_heaps,
-				 & ctx->Const,
-				 4,
-				 11, /* max 2D texture size is 2048x2048 */
-#if ENABLE_HW_3D_TEXTURE
-				 8,  /* max 3D texture size is 256^3 */
-#else
-				 0,  /* 3D textures unsupported */
-#endif
-				 11, /* max cube texture size is 2048x2048 */
-				 11, /* max texture rectangle size is 2048x2048 */
-				 12,
-				 GL_FALSE,
-				 i );
+   i = driQueryOptioni( &rmesa->radeon.optionCache, "allow_large_textures");
+
+   /* FIXME: When no memory manager is available we should set this 
+    * to some reasonable value based on texture memory pool size */
+   ctx->Const.MaxTextureLevels = 12;
+   ctx->Const.Max3DTextureLevels = 9;
+   ctx->Const.MaxCubeTextureLevels = 12;
+   ctx->Const.MaxTextureRectSize = 2048;
 
    ctx->Const.MaxTextureMaxAnisotropy = 16.0;
 
@@ -391,7 +369,7 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    ctx->Const.MinPointSizeAA = 1.0;
    ctx->Const.MaxPointSizeAA = 1.0;
    ctx->Const.PointSizeGranularity = 0.0625;
-   if (rmesa->r200Screen->drmSupportsPointSprites)
+   if (rmesa->radeon.radeonScreen->drmSupportsPointSprites)
       ctx->Const.MaxPointSize = 2047.0;
    else
       ctx->Const.MaxPointSize = 1.0;
@@ -411,6 +389,10 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    ctx->Const.VertexProgram.MaxNativeParameters = R200_VSF_MAX_PARAM;
    ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
 
+   ctx->Const.MaxDrawBuffers = 1;
+
+   _mesa_set_mvp_with_dp4( ctx, GL_TRUE );
+
    /* Initialize the software rasterizer and helper modules.
     */
    _swrast_CreateContext( ctx );
@@ -445,33 +427,39 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    _math_matrix_set_identity( &rmesa->tmpmat );
 
    driInitExtensions( ctx, card_extensions, GL_TRUE );
-   if (!(rmesa->r200Screen->chip_flags & R200_CHIPSET_YCBCR_BROKEN)) {
+
+   if (rmesa->radeon.radeonScreen->kernel_mm)
+     driInitExtensions(ctx, mm_extensions, GL_FALSE);
+   if (!(rmesa->radeon.radeonScreen->chip_flags & R200_CHIPSET_YCBCR_BROKEN)) {
      /* yuv textures don't work with some chips - R200 / rv280 okay so far
 	others get the bit ordering right but don't actually do YUV-RGB conversion */
       _mesa_enable_extension( ctx, "GL_MESA_ycbcr_texture" );
    }
-   if (rmesa->glCtx->Mesa_DXTn) {
+   if (rmesa->radeon.glCtx->Mesa_DXTn) {
       _mesa_enable_extension( ctx, "GL_EXT_texture_compression_s3tc" );
       _mesa_enable_extension( ctx, "GL_S3_s3tc" );
    }
-   else if (driQueryOptionb (&rmesa->optionCache, "force_s3tc_enable")) {
+   else if (driQueryOptionb (&rmesa->radeon.optionCache, "force_s3tc_enable")) {
       _mesa_enable_extension( ctx, "GL_EXT_texture_compression_s3tc" );
    }
 
-   if (rmesa->r200Screen->drmSupportsCubeMapsR200)
+   if (rmesa->radeon.radeonScreen->drmSupportsCubeMapsR200)
       _mesa_enable_extension( ctx, "GL_ARB_texture_cube_map" );
-   if (rmesa->r200Screen->drmSupportsBlendColor) {
+   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor) {
        driInitExtensions( ctx, blend_extensions, GL_FALSE );
    }
-   if(rmesa->r200Screen->drmSupportsVertexProgram)
+   if(rmesa->radeon.radeonScreen->drmSupportsVertexProgram)
       driInitSingleExtension( ctx, ARB_vp_extension );
-   if(driQueryOptionb(&rmesa->optionCache, "nv_vertex_program"))
+   if(driQueryOptionb(&rmesa->radeon.optionCache, "nv_vertex_program"))
       driInitSingleExtension( ctx, NV_vp_extension );
 
-   if ((ctx->Const.MaxTextureUnits == 6) && rmesa->r200Screen->drmSupportsFragShader)
+   if ((ctx->Const.MaxTextureUnits == 6) && rmesa->radeon.radeonScreen->drmSupportsFragShader)
       driInitSingleExtension( ctx, ATI_fs_extension );
-   if (rmesa->r200Screen->drmSupportsPointSprites)
+   if (rmesa->radeon.radeonScreen->drmSupportsPointSprites)
       driInitExtensions( ctx, point_extensions, GL_FALSE );
+
+   if (!rmesa->radeon.radeonScreen->kernel_mm)
+      _mesa_disable_extension(ctx, "GL_ARB_occlusion_query");
 #if 0
    r200InitDriverFuncs( ctx );
    r200InitIoctlFuncs( ctx );
@@ -480,236 +468,43 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
 #endif
    /* plug in a few more device driver functions */
    /* XXX these should really go right after _mesa_init_driver_functions() */
+   radeon_fbo_init(&rmesa->radeon);
+   radeonInitSpanFuncs( ctx );
    r200InitPixelFuncs( ctx );
-   r200InitSpanFuncs( ctx );
    r200InitTnlFuncs( ctx );
    r200InitState( rmesa );
    r200InitSwtcl( ctx );
 
-   fthrottle_mode = driQueryOptioni(&rmesa->optionCache, "fthrottle_mode");
-   rmesa->iw.irq_seq = -1;
-   rmesa->irqsEmitted = 0;
-   rmesa->do_irqs = (fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS &&
-		     rmesa->r200Screen->irq);
-
-   rmesa->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
-
-   if (!rmesa->do_irqs)
-      fprintf(stderr,
-	      "IRQ's not enabled, falling back to %s: %d %d\n",
-	      rmesa->do_usleeps ? "usleeps" : "busy waits",
-	      fthrottle_mode,
-	      rmesa->r200Screen->irq);
-
    rmesa->prefer_gart_client_texturing = 
       (getenv("R200_GART_CLIENT_TEXTURES") != 0);
 
-   (*sPriv->systemTime->getUST)( & rmesa->swap_ust );
-
-
-#if DO_DEBUG
-   R200_DEBUG  = driParseDebugString( getenv( "R200_DEBUG" ),
-				      debug_control );
-   R200_DEBUG |= driParseDebugString( getenv( "RADEON_DEBUG" ),
-				      debug_control );
-#endif
-
-   tcl_mode = driQueryOptioni(&rmesa->optionCache, "tcl_mode");
-   if (driQueryOptionb(&rmesa->optionCache, "no_rast")) {
+   tcl_mode = driQueryOptioni(&rmesa->radeon.optionCache, "tcl_mode");
+   if (driQueryOptionb(&rmesa->radeon.optionCache, "no_rast")) {
       fprintf(stderr, "disabling 3D acceleration\n");
       FALLBACK(rmesa, R200_FALLBACK_DISABLE, 1);
    }
    else if (tcl_mode == DRI_CONF_TCL_SW || getenv("R200_NO_TCL") ||
-	    !(rmesa->r200Screen->chip_flags & RADEON_CHIPSET_TCL)) {
-      if (rmesa->r200Screen->chip_flags & RADEON_CHIPSET_TCL) {
-	 rmesa->r200Screen->chip_flags &= ~RADEON_CHIPSET_TCL;
+	    !(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
+      if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+	 rmesa->radeon.radeonScreen->chip_flags &= ~RADEON_CHIPSET_TCL;
 	 fprintf(stderr, "Disabling HW TCL support\n");
       }
-      TCL_FALLBACK(rmesa->glCtx, R200_TCL_FALLBACK_TCL_DISABLE, 1);
+      TCL_FALLBACK(rmesa->radeon.glCtx, R200_TCL_FALLBACK_TCL_DISABLE, 1);
    }
 
    return GL_TRUE;
 }
 
 
-/* Destroy the device specific context.
- */
-/* Destroy the Mesa and driver specific context data.
- */
 void r200DestroyContext( __DRIcontextPrivate *driContextPriv )
 {
-   GET_CURRENT_CONTEXT(ctx);
-   r200ContextPtr rmesa = (r200ContextPtr) driContextPriv->driverPrivate;
-   r200ContextPtr current = ctx ? R200_CONTEXT(ctx) : NULL;
-
-   /* check if we're deleting the currently bound context */
-   if (rmesa == current) {
-      R200_FIREVERTICES( rmesa );
-      _mesa_make_current(NULL, NULL, NULL);
-   }
-
-   /* Free r200 context resources */
-   assert(rmesa); /* should never be null */
-   if ( rmesa ) {
-      GLboolean   release_texture_heaps;
-
-
-      release_texture_heaps = (rmesa->glCtx->Shared->RefCount == 1);
-      _swsetup_DestroyContext( rmesa->glCtx );
-      _tnl_DestroyContext( rmesa->glCtx );
-      _vbo_DestroyContext( rmesa->glCtx );
-      _swrast_DestroyContext( rmesa->glCtx );
-
-      r200DestroySwtcl( rmesa->glCtx );
-      r200ReleaseArrays( rmesa->glCtx, ~0 );
-
-      if (rmesa->dma.current.buf) {
-	 r200ReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
-	 r200FlushCmdBuf( rmesa, __FUNCTION__ );
-      }
-
-      if (rmesa->state.scissor.pClipRects) {
-	 FREE(rmesa->state.scissor.pClipRects);
-	 rmesa->state.scissor.pClipRects = NULL;
-      }
-
-      if ( release_texture_heaps ) {
-         /* This share group is about to go away, free our private
-          * texture object data.
-          */
-         int i;
-
-         for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
-	    driDestroyTextureHeap( rmesa->texture_heaps[ i ] );
-	    rmesa->texture_heaps[ i ] = NULL;
-         }
-
-	 assert( is_empty_list( & rmesa->swapped ) );
-      }
-
-      /* free the Mesa context */
-      rmesa->glCtx->DriverCtx = NULL;
-      _mesa_destroy_context( rmesa->glCtx );
-
-      /* free the option cache */
-      driDestroyOptionCache (&rmesa->optionCache);
-
-      FREE( rmesa );
-   }
-}
-
-
-
-
-void
-r200SwapBuffers( __DRIdrawablePrivate *dPriv )
-{
-   if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-      r200ContextPtr rmesa;
-      GLcontext *ctx;
-      rmesa = (r200ContextPtr) dPriv->driContextPriv->driverPrivate;
-      ctx = rmesa->glCtx;
-      if (ctx->Visual.doubleBufferMode) {
-         _mesa_notifySwapBuffers( ctx );  /* flush pending rendering comands */
-         if ( rmesa->doPageFlip ) {
-            r200PageFlip( dPriv );
-         }
-         else {
-	     r200CopyBuffer( dPriv, NULL );
-         }
-      }
-   }
-   else {
-      /* XXX this shouldn't be an error but we can't handle it for now */
-      _mesa_problem(NULL, "%s: drawable has no context!", __FUNCTION__);
-   }
-}
-
-void
-r200CopySubBuffer( __DRIdrawablePrivate *dPriv,
-		   int x, int y, int w, int h )
-{
-   if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-      r200ContextPtr rmesa;
-      GLcontext *ctx;
-      rmesa = (r200ContextPtr) dPriv->driContextPriv->driverPrivate;
-      ctx = rmesa->glCtx;
-      if (ctx->Visual.doubleBufferMode) {
-	 drm_clip_rect_t rect;
-	 rect.x1 = x + dPriv->x;
-	 rect.y1 = (dPriv->h - y - h) + dPriv->y;
-	 rect.x2 = rect.x1 + w;
-	 rect.y2 = rect.y1 + h;
-         _mesa_notifySwapBuffers( ctx );  /* flush pending rendering comands */
-	 r200CopyBuffer( dPriv, &rect );
-      }
-   }
-   else {
-      /* XXX this shouldn't be an error but we can't handle it for now */
-      _mesa_problem(NULL, "%s: drawable has no context!", __FUNCTION__);
-   }
-}
-
-/* Force the context `c' to be the current context and associate with it
- * buffer `b'.
- */
-GLboolean
-r200MakeCurrent( __DRIcontextPrivate *driContextPriv,
-                   __DRIdrawablePrivate *driDrawPriv,
-                   __DRIdrawablePrivate *driReadPriv )
-{
-   if ( driContextPriv ) {
-      r200ContextPtr newCtx = 
-	 (r200ContextPtr) driContextPriv->driverPrivate;
-
-      if (R200_DEBUG & DEBUG_DRI)
-	 fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *)newCtx->glCtx);
-
-      newCtx->dri.readable = driReadPriv;
-
-      if ( newCtx->dri.drawable != driDrawPriv ||
-           newCtx->lastStamp != driDrawPriv->lastStamp ) {
-	 if (driDrawPriv->swap_interval == (unsigned)-1) {
-	    driDrawPriv->vblFlags = (newCtx->r200Screen->irq != 0)
-	       ? driGetDefaultVBlankFlags(&newCtx->optionCache)
-	       : VBLANK_FLAG_NO_IRQ;
-
-	    driDrawableInitVBlank( driDrawPriv );
-	 }
-
-	 newCtx->dri.drawable = driDrawPriv;
-
-	 r200SetCliprects(newCtx);
-	 r200UpdateViewportOffset( newCtx->glCtx );
-      }
-
-      _mesa_make_current( newCtx->glCtx,
-			  (GLframebuffer *) driDrawPriv->driverPrivate,
-			  (GLframebuffer *) driReadPriv->driverPrivate );
-
-      _mesa_update_state( newCtx->glCtx );
-      r200ValidateState( newCtx->glCtx );
-
-   } else {
-      if (R200_DEBUG & DEBUG_DRI)
-	 fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
-      _mesa_make_current( NULL, NULL, NULL );
-   }
-
-   if (R200_DEBUG & DEBUG_DRI)
-      fprintf(stderr, "End %s\n", __FUNCTION__);
-   return GL_TRUE;
-}
-
-/* Force the context `c' to be unbound from its buffer.
- */
-GLboolean
-r200UnbindContext( __DRIcontextPrivate *driContextPriv )
-{
-   r200ContextPtr rmesa = (r200ContextPtr) driContextPriv->driverPrivate;
-
-   if (R200_DEBUG & DEBUG_DRI)
-      fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *)rmesa->glCtx);
-
-   return GL_TRUE;
+	int i;
+	r200ContextPtr rmesa = (r200ContextPtr)driContextPriv->driverPrivate;
+	if (rmesa)
+	{
+		for ( i = 0 ; i < R200_MAX_TEXTURE_UNITS ; i++ ) {
+			_math_matrix_dtr( &rmesa->TexGenMatrix[i] );
+		}
+	}
+	radeonDestroyContext(driContextPriv);
 }
diff --git a/src/mesa/drivers/dri/r200/r200_context.h b/src/mesa/drivers/dri/r200/r200_context.h
index 14a1dda46a..246f98c6dc 100644
--- a/src/mesa/drivers/dri/r200/r200_context.h
+++ b/src/mesa/drivers/dri/r200/r200_context.h
@@ -53,51 +53,17 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #error This driver requires a newer libdrm to compile
 #endif
 
+#include "radeon_screen.h"
+#include "radeon_common.h"
+
+#include "radeon_lock.h"
+
 struct r200_context;
 typedef struct r200_context r200ContextRec;
 typedef struct r200_context *r200ContextPtr;
 
-/* This union is used to avoid warnings/miscompilation
-   with float to uint32_t casts due to strict-aliasing */
-typedef union { GLfloat f; uint32_t ui32; } float_ui32_type;
-
-#include "r200_lock.h"
-#include "radeon_screen.h"
 #include "main/mm.h"
 
-/* Flags for software fallback cases */
-/* See correponding strings in r200_swtcl.c */
-#define R200_FALLBACK_TEXTURE           0x01
-#define R200_FALLBACK_DRAW_BUFFER       0x02
-#define R200_FALLBACK_STENCIL           0x04
-#define R200_FALLBACK_RENDER_MODE       0x08
-#define R200_FALLBACK_DISABLE           0x10
-#define R200_FALLBACK_BORDER_MODE       0x20
-
-/* The blit width for texture uploads
- */
-#define BLIT_WIDTH_BYTES 1024
-
-/* Use the templated vertex format:
- */
-#define COLOR_IS_RGBA
-#define TAG(x) r200##x
-#include "tnl_dd/t_dd_vertex.h"
-#undef TAG
-
-typedef void (*r200_tri_func)( r200ContextPtr,
-				 r200Vertex *,
-				 r200Vertex *,
-				 r200Vertex * );
-
-typedef void (*r200_line_func)( r200ContextPtr,
-				  r200Vertex *,
-				  r200Vertex * );
-
-typedef void (*r200_point_func)( r200ContextPtr,
-				   r200Vertex * );
-
-
 struct r200_vertex_program {
         struct gl_vertex_program mesa_program; /* Must be first */
         int translated;
@@ -112,93 +78,11 @@ struct r200_vertex_program {
         int fogmode;
 };
 
-struct r200_colorbuffer_state {
-   GLuint clear;
-#if 000
-   GLint drawOffset, drawPitch;
-#endif
-   int roundEnable;
-};
-
-
-struct r200_depthbuffer_state {
-   GLuint clear;
-   GLfloat scale;
-};
-
-#if 000
-struct r200_pixel_state {
-   GLint readOffset, readPitch;
-};
-#endif
-
-struct r200_scissor_state {
-   drm_clip_rect_t rect;
-   GLboolean enabled;
-
-   GLuint numClipRects;			/* Cliprects active */
-   GLuint numAllocedClipRects;		/* Cliprects available */
-   drm_clip_rect_t *pClipRects;
-};
-
-struct r200_stencilbuffer_state {
-   GLboolean hwBuffer;
-   GLuint clear;			/* rb3d_stencilrefmask value */
-};
-
-struct r200_stipple_state {
-   GLuint mask[32];
-};
-
-
-
-#define TEX_0   0x1
-#define TEX_1   0x2
-#define TEX_2	0x4
-#define TEX_3	0x8
-#define TEX_4	0x10
-#define TEX_5	0x20
-#define TEX_ALL 0x3f
-
-typedef struct r200_tex_obj r200TexObj, *r200TexObjPtr;
-
-/* Texture object in locally shared texture space.
- */
-struct r200_tex_obj {
-   driTextureObject   base;
-
-   GLuint bufAddr;			/* Offset to start of locally
-					   shared texture block */
-
-   GLuint dirty_state;		        /* Flags (1 per texunit) for
-					   whether or not this texobj
-					   has dirty hardware state
-					   (pp_*) that needs to be
-					   brought into the
-					   texunit. */
-
-   drm_radeon_tex_image_t image[6][RADEON_MAX_TEXTURE_LEVELS];
-					/* Six, for the cube faces */
-   GLboolean image_override;		/* Image overridden by GLX_EXT_tfp */
-
-   GLuint pp_txfilter;		        /* hardware register values */
-   GLuint pp_txformat;
-   GLuint pp_txformat_x;
-   GLuint pp_txoffset;		        /* Image location in texmem.
-					   All cube faces follow. */
-   GLuint pp_txsize;		        /* npot only */
-   GLuint pp_txpitch;		        /* npot only */
-   GLuint pp_border_color;
-   GLuint pp_cubic_faces;	        /* cube face 1,2,3,4 log2 sizes */
-
-   GLboolean  border_fallback;
-
-   GLuint tile_bits;			/* hw texture tile bits used on this texture */
-};
+#define R200_TEX_ALL 0x3f
 
 
 struct r200_texture_env_state {
-   r200TexObjPtr texobj;
+   radeonTexObjPtr texobj;
    GLuint outputreg;
    GLuint unitneeded;
 };
@@ -210,19 +94,6 @@ struct r200_texture_state {
 };
 
 
-struct r200_state_atom {
-   struct r200_state_atom *next, *prev;
-   const char *name;		         /* for debug */
-   int cmd_size;		         /* size in bytes */
-   GLuint idx;
-   int *cmd;			         /* one or more cmd's */
-   int *lastcmd;			 /* one or more cmd's */
-   GLboolean dirty;
-   GLboolean (*check)( GLcontext *, int );    /* is this state active? */
-};
-   
-
-
 /* Trying to keep these relatively short as the variables are becoming
  * extravagently long.  Drop the driver name prefix off the front of
  * everything - I think we know which driver we're in by now, and keep the
@@ -596,182 +467,96 @@ struct r200_state_atom {
 #define PRF_STATE_SIZE    3
 
 
-struct r200_hw_state {
-   /* Head of the linked list of state atoms. */
-   struct r200_state_atom atomlist;
+#define SCI_CMD_0         0
+#define SCI_RE_AUX        1
+#define SCI_CMD_1         2
+#define SCI_XY_1          3
+#define SCI_CMD_2         4
+#define SCI_XY_2          5
+#define SCI_STATE_SIZE    6
+
+#define R200_QUERYOBJ_CMD_0  0
+#define R200_QUERYOBJ_DATA_0 1
+#define R200_QUERYOBJ_CMDSIZE  2
+
+#define STP_CMD_0 0
+#define STP_DATA_0 1
+#define STP_CMD_1 2
+#define STP_STATE_SIZE 35
 
+struct r200_hw_state {
    /* Hardware state, stored as cmdbuf commands:  
     *   -- Need to doublebuffer for
     *           - reviving state after loss of context
     *           - eliding noop statechange loops? (except line stipple count)
     */
-   struct r200_state_atom ctx;
-   struct r200_state_atom set;
-   struct r200_state_atom vte;
-   struct r200_state_atom lin;
-   struct r200_state_atom msk;
-   struct r200_state_atom vpt;
-   struct r200_state_atom vap;
-   struct r200_state_atom vtx;
-   struct r200_state_atom tcl;
-   struct r200_state_atom msl;
-   struct r200_state_atom tcg;
-   struct r200_state_atom msc;
-   struct r200_state_atom cst;
-   struct r200_state_atom tam;
-   struct r200_state_atom tf;
-   struct r200_state_atom tex[6];
-   struct r200_state_atom cube[6];
-   struct r200_state_atom zbs;
-   struct r200_state_atom mtl[2];
-   struct r200_state_atom mat[9];
-   struct r200_state_atom lit[8]; /* includes vec, scl commands */
-   struct r200_state_atom ucp[6];
-   struct r200_state_atom pix[6]; /* pixshader stages */
-   struct r200_state_atom eye; /* eye pos */
-   struct r200_state_atom grd; /* guard band clipping */
-   struct r200_state_atom fog;
-   struct r200_state_atom glt;
-   struct r200_state_atom prf;
-   struct r200_state_atom afs[2];
-   struct r200_state_atom pvs;
-   struct r200_state_atom vpi[2];
-   struct r200_state_atom vpp[2];
-   struct r200_state_atom atf;
-   struct r200_state_atom spr;
-   struct r200_state_atom ptp;
-
-   int max_state_size;	/* Number of bytes necessary for a full state emit. */
-   GLboolean is_dirty, all_dirty;
+   struct radeon_state_atom ctx;
+   struct radeon_state_atom set;
+   struct radeon_state_atom sci;
+   struct radeon_state_atom vte;
+   struct radeon_state_atom lin;
+   struct radeon_state_atom msk;
+   struct radeon_state_atom vpt;
+   struct radeon_state_atom vap;
+   struct radeon_state_atom vtx;
+   struct radeon_state_atom tcl;
+   struct radeon_state_atom msl;
+   struct radeon_state_atom tcg;
+   struct radeon_state_atom msc;
+   struct radeon_state_atom cst;
+   struct radeon_state_atom tam;
+   struct radeon_state_atom tf;
+   struct radeon_state_atom tex[6];
+   struct radeon_state_atom cube[6];
+   struct radeon_state_atom zbs;
+   struct radeon_state_atom mtl[2];
+   struct radeon_state_atom mat[9];
+   struct radeon_state_atom lit[8]; /* includes vec, scl commands */
+   struct radeon_state_atom ucp[6];
+   struct radeon_state_atom pix[6]; /* pixshader stages */
+   struct radeon_state_atom eye; /* eye pos */
+   struct radeon_state_atom grd; /* guard band clipping */
+   struct radeon_state_atom fog;
+   struct radeon_state_atom glt;
+   struct radeon_state_atom prf;
+   struct radeon_state_atom afs[2];
+   struct radeon_state_atom pvs;
+   struct radeon_state_atom vpi[2];
+   struct radeon_state_atom vpp[2];
+   struct radeon_state_atom atf;
+   struct radeon_state_atom spr;
+   struct radeon_state_atom ptp;
+   struct radeon_state_atom stp;
 };
 
 struct r200_state {
    /* Derived state for internal purposes:
     */
-   struct r200_colorbuffer_state color;
-   struct r200_depthbuffer_state depth;
-#if 00
-   struct r200_pixel_state pixel;
-#endif
-   struct r200_scissor_state scissor;
-   struct r200_stencilbuffer_state stencil;
-   struct r200_stipple_state stipple;
    struct r200_texture_state texture;
    GLuint envneeded;
 };
 
-/* Need refcounting on dma buffers:
- */
-struct r200_dma_buffer {
-   int refcount;		/* the number of retained regions in buf */
-   drmBufPtr buf;
-};
-
-#define GET_START(rvb) (rmesa->r200Screen->gart_buffer_offset +		\
-			(rvb)->address - rmesa->dma.buf0_address +	\
-			(rvb)->start)
-
-/* A retained region, eg vertices for indexed vertices.
- */
-struct r200_dma_region {
-   struct r200_dma_buffer *buf;
-   char *address;		/* == buf->address */
-   int start, end, ptr;		/* offsets from start of buf */
-   int aos_start;
-   int aos_stride;
-   int aos_size;
-};
-
-
-struct r200_dma {
-   /* Active dma region.  Allocations for vertices and retained
-    * regions come from here.  Also used for emitting random vertices,
-    * these may be flushed by calling flush_current();
-    */
-   struct r200_dma_region current;
-   
-   void (*flush)( r200ContextPtr );
-
-   char *buf0_address;		/* start of buf[0], for index calcs */
-   GLuint nr_released_bufs;	/* flush after so many buffers released */
-};
-
-struct r200_dri_mirror {
-   __DRIcontextPrivate	*context;	/* DRI context */
-   __DRIscreenPrivate	*screen;	/* DRI screen */
-   __DRIdrawablePrivate	*drawable;	/* DRI drawable bound to this ctx */
-   __DRIdrawablePrivate	*readable;	/* DRI readable bound to this ctx */
-
-   drm_context_t hwContext;
-   drm_hw_lock_t *hwLock;
-   int fd;
-   int drmMinor;
-};
-
-
 #define R200_CMD_BUF_SZ  (16*1024) 
 
-struct r200_store {
-   GLuint statenr;
-   GLuint primnr;
-   char cmd_buf[R200_CMD_BUF_SZ];
-   int cmd_used;   
-   int elts_start;
-};
-
-
+#define R200_ELT_BUF_SZ  (16*1024) 
 /* r200_tcl.c
  */
 struct r200_tcl_info {
    GLuint hw_primitive;
 
-/* hw can handle 12 components max */
-   struct r200_dma_region *aos_components[12];
-   GLuint nr_aos_components;
-
-   GLuint *Elts;
+   int elt_used;
 
-   struct r200_dma_region indexed_verts;
-   struct r200_dma_region vertex_data[15];
 };
 
 
 /* r200_swtcl.c
  */
 struct r200_swtcl_info {
-   GLuint RenderIndex;
-   
-   /**
-    * Size of a hardware vertex.  This is calculated when \c ::vertex_attrs is
-    * installed in the Mesa state vector.
-    */
-   GLuint vertex_size;
 
-   /**
-    * Attributes instructing the Mesa TCL pipeline where / how to put vertex
-    * data in the hardware buffer.
-    */
-   struct tnl_attr_map vertex_attrs[VERT_ATTRIB_MAX];
 
-   /**
-    * Number of elements of \c ::vertex_attrs that are actually used.
-    */
-   GLuint vertex_attr_count;
-
-   /**
-    * Cached pointer to the buffer where Mesa will store vertex data.
-    */
-   GLubyte *verts;
-
-   /* Fallback rasterization functions
-    */
-   r200_point_func draw_point;
-   r200_line_func draw_line;
-   r200_tri_func draw_tri;
-
-   GLuint hw_primitive;
-   GLenum render_primitive;
-   GLuint numverts;
+   radeon_point_func draw_point;
+   radeon_line_func draw_line;
+   radeon_tri_func draw_tri;
 
    /**
     * Offset of the 4UB color data within a hardware (swtcl) vertex.
@@ -787,27 +572,10 @@ struct r200_swtcl_info {
     * Should Mesa project vertex data or will the hardware do it?
     */
    GLboolean needproj;
-
-   struct r200_dma_region indexed_verts;
-};
-
-
-struct r200_ioctl {
-   GLuint vertex_offset;
-   GLuint vertex_size;
 };
 
 
 
-#define R200_MAX_PRIMS 64
-
-
-
-struct r200_prim {
-   GLuint start;
-   GLuint end;
-   GLuint prim;
-};
 
    /* A maximum total of 29 elements per vertex:  3 floats for position, 3
     * floats for normal, 4 floats for color, 4 bytes for secondary color,
@@ -822,9 +590,8 @@ struct r200_prim {
 
 #define R200_MAX_VERTEX_SIZE ((3*6)+11)
 
-
 struct r200_context {
-   GLcontext *glCtx;			/* Mesa context */
+   struct radeon_context radeon;
 
    /* Driver and hardware state management
     */
@@ -832,56 +599,15 @@ struct r200_context {
    struct r200_state state;
    struct r200_vertex_program *curr_vp_hw;
 
-   /* Texture object bookkeeping
-    */
-   unsigned              nr_heaps;
-   driTexHeap          * texture_heaps[ RADEON_NR_TEX_HEAPS ];
-   driTextureObject      swapped;
-   int                   texture_depth;
-   float                 initialMaxAnisotropy;
-
-   /* Rasterization and vertex state:
-    */
-   GLuint TclFallback;
-   GLuint Fallback;
-   GLuint NewGLState;
-   DECLARE_RENDERINPUTS(tnl_index_bitset);	/* index of bits for last tnl_install_attrs */
-
    /* Vertex buffers
     */
-   struct r200_ioctl ioctl;
-   struct r200_dma dma;
-   struct r200_store store;
-   /* A full state emit as of the first state emit in the main store, in case
-    * the context is lost.
-    */
-   struct r200_store backup_store;
-
-   /* Page flipping
-    */
-   GLuint doPageFlip;
-
-   /* Busy waiting
-    */
-   GLuint do_usleeps;
-   GLuint do_irqs;
-   GLuint irqsEmitted;
-   drm_radeon_irq_wait_t iw;
+   struct radeon_ioctl ioctl;
+   struct radeon_store store;
 
    /* Clientdata textures;
     */
    GLuint prefer_gart_client_texturing;
 
-   /* Drawable, cliprect and scissor information
-    */
-   GLuint numClipRects;			/* Cliprects for the draw buffer */
-   drm_clip_rect_t *pClipRects;
-   unsigned int lastStamp;
-   GLboolean lost_context;
-   GLboolean save_on_next_emit;
-   radeonScreenPtr r200Screen;	/* Screen private DRI data */
-   drm_radeon_sarea_t *sarea;		/* Private SAREA data */
-
    /* TCL stuff
     */
    GLmatrix TexGenMatrix[R200_MAX_TEXTURE_UNITS];
@@ -893,15 +619,6 @@ struct r200_context {
    GLuint TexGenCompSel;
    GLmatrix tmpmat;
 
-   /* buffer swap
-    */
-   int64_t swap_ust;
-   int64_t swap_missed_ust;
-
-   GLuint swap_count;
-   GLuint swap_missed_count;
-
-
    /* r200_tcl.c
     */
    struct r200_tcl_info tcl;
@@ -910,14 +627,6 @@ struct r200_context {
     */
    struct r200_swtcl_info swtcl;
 
-   /* Mirrors of some DRI state
-    */
-   struct r200_dri_mirror dri;
-
-   /* Configuration cache
-    */
-   driOptionCache optionCache;
-
    GLboolean using_hyperz;
    GLboolean texmicrotile;
 
@@ -927,28 +636,10 @@ struct r200_context {
 #define R200_CONTEXT(ctx)		((r200ContextPtr)(ctx->DriverCtx))
 
 
-static INLINE GLuint r200PackColor( GLuint cpp,
-					GLubyte r, GLubyte g,
-					GLubyte b, GLubyte a )
-{
-   switch ( cpp ) {
-   case 2:
-      return PACK_COLOR_565( r, g, b );
-   case 4:
-      return PACK_COLOR_8888( a, r, g, b );
-   default:
-      return 0;
-   }
-}
-
-
 extern void r200DestroyContext( __DRIcontextPrivate *driContextPriv );
 extern GLboolean r200CreateContext( const __GLcontextModes *glVisual,
 				    __DRIcontextPrivate *driContextPriv,
 				    void *sharedContextPrivate);
-extern void r200SwapBuffers( __DRIdrawablePrivate *dPriv );
-extern void r200CopySubBuffer( __DRIdrawablePrivate * dPriv,
-			       int x, int y, int w, int h );
 extern GLboolean r200MakeCurrent( __DRIcontextPrivate *driContextPriv,
 				  __DRIdrawablePrivate *driDrawPriv,
 				  __DRIdrawablePrivate *driReadPriv );
@@ -957,28 +648,9 @@ extern GLboolean r200UnbindContext( __DRIcontextPrivate *driContextPriv );
 /* ================================================================
  * Debugging:
  */
-#define DO_DEBUG		1
 
-#if DO_DEBUG
-extern int R200_DEBUG;
-#else
-#define R200_DEBUG		0
-#endif
+#define R200_DEBUG RADEON_DEBUG
+
 
-#define DEBUG_TEXTURE	0x001
-#define DEBUG_STATE	0x002
-#define DEBUG_IOCTL	0x004
-#define DEBUG_PRIMS	0x008
-#define DEBUG_VERTS	0x010
-#define DEBUG_FALLBACKS	0x020
-#define DEBUG_VFMT	0x040
-#define DEBUG_CODEGEN	0x080
-#define DEBUG_VERBOSE	0x100
-#define DEBUG_DRI       0x200
-#define DEBUG_DMA       0x400
-#define DEBUG_SANITY    0x800
-#define DEBUG_SYNC      0x1000
-#define DEBUG_PIXEL     0x2000
-#define DEBUG_MEMORY    0x4000
 
 #endif /* __R200_CONTEXT_H__ */
diff --git a/src/mesa/drivers/dri/r200/r200_fragshader.c b/src/mesa/drivers/dri/r200/r200_fragshader.c
index d514b28219..85c1b7bdd1 100644
--- a/src/mesa/drivers/dri/r200/r200_fragshader.c
+++ b/src/mesa/drivers/dri/r200/r200_fragshader.c
@@ -522,7 +522,7 @@ static void r200UpdateFSConstants( GLcontext *ctx )
 	 CLAMPED_FLOAT_TO_UBYTE(con_byte[2], ctx->ATIFragmentShader.GlobalConstants[i][2]);
 	 CLAMPED_FLOAT_TO_UBYTE(con_byte[3], ctx->ATIFragmentShader.GlobalConstants[i][3]);
       }
-      rmesa->hw.atf.cmd[ATF_TFACTOR_0 + i] = r200PackColor (
+      rmesa->hw.atf.cmd[ATF_TFACTOR_0 + i] = radeonPackColor (
 	 4, con_byte[0], con_byte[1], con_byte[2], con_byte[3] );
    }
 }
diff --git a/src/mesa/drivers/dri/r200/r200_ioctl.c b/src/mesa/drivers/dri/r200/r200_ioctl.c
index 0741e57af7..b238adb972 100644
--- a/src/mesa/drivers/dri/r200/r200_ioctl.c
+++ b/src/mesa/drivers/dri/r200/r200_ioctl.c
@@ -31,7 +31,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * Authors:
  *   Keith Whitwell <keith@tungstengraphics.com>
  */
- 
+
 #include <sched.h>
 #include <errno.h>
 
@@ -41,6 +41,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/context.h"
 #include "swrast/swrast.h"
 
+
+
+#include "radeon_common.h"
+#include "radeon_lock.h"
 #include "r200_context.h"
 #include "r200_state.h"
 #include "r200_ioctl.h"
@@ -54,635 +58,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R200_TIMEOUT             512
 #define R200_IDLE_RETRY           16
 
-
-static void r200WaitForIdle( r200ContextPtr rmesa );
-
-
-/* At this point we were in FlushCmdBufLocked but we had lost our context, so
- * we need to unwire our current cmdbuf, hook the one with the saved state in
- * it, flush it, and then put the current one back.  This is so commands at the
- * start of a cmdbuf can rely on the state being kept from the previous one.
- */
-static void r200BackUpAndEmitLostStateLocked( r200ContextPtr rmesa )
-{
-   GLuint nr_released_bufs;
-   struct r200_store saved_store;
-
-   if (rmesa->backup_store.cmd_used == 0)
-      return;
-
-   if (R200_DEBUG & DEBUG_STATE)
-      fprintf(stderr, "Emitting backup state on lost context\n");
-
-   rmesa->lost_context = GL_FALSE;
-
-   nr_released_bufs = rmesa->dma.nr_released_bufs;
-   saved_store = rmesa->store;
-   rmesa->dma.nr_released_bufs = 0;
-   rmesa->store = rmesa->backup_store;
-   r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
-   rmesa->dma.nr_released_bufs = nr_released_bufs;
-   rmesa->store = saved_store;
-}
-
-int r200FlushCmdBufLocked( r200ContextPtr rmesa, const char * caller )
-{
-   int ret, i;
-   drm_radeon_cmd_buffer_t cmd;
-
-   if (rmesa->lost_context)
-      r200BackUpAndEmitLostStateLocked( rmesa );
-
-   if (R200_DEBUG & DEBUG_IOCTL) {
-      fprintf(stderr, "%s from %s\n", __FUNCTION__, caller); 
-
-      if (0 & R200_DEBUG & DEBUG_VERBOSE) 
-	 for (i = 0 ; i < rmesa->store.cmd_used ; i += 4 )
-	    fprintf(stderr, "%d: %x\n", i/4, 
-		    *(int *)(&rmesa->store.cmd_buf[i]));
-   }
-
-   if (R200_DEBUG & DEBUG_DMA)
-      fprintf(stderr, "%s: Releasing %d buffers\n", __FUNCTION__,
-	      rmesa->dma.nr_released_bufs);
-
-
-   if (R200_DEBUG & DEBUG_SANITY) {
-      if (rmesa->state.scissor.enabled) 
-	 ret = r200SanityCmdBuffer( rmesa, 
-				    rmesa->state.scissor.numClipRects,
-				    rmesa->state.scissor.pClipRects);
-      else
-	 ret = r200SanityCmdBuffer( rmesa, 
-				    rmesa->numClipRects,
-				    rmesa->pClipRects);
-      if (ret) {
-	 fprintf(stderr, "drmSanityCommandWrite: %d\n", ret);	 
-	 goto out;
-      }
-   }
-
-
-   if (R200_DEBUG & DEBUG_MEMORY) {
-      if (! driValidateTextureHeaps( rmesa->texture_heaps, rmesa->nr_heaps,
-				     & rmesa->swapped ) ) {
-	 fprintf( stderr, "%s: texture memory is inconsistent - expect "
-		  "mangled textures\n", __FUNCTION__ );
-      }
-   }
-
-
-   cmd.bufsz = rmesa->store.cmd_used;
-   cmd.buf = rmesa->store.cmd_buf;
-
-   if (rmesa->state.scissor.enabled) {
-      cmd.nbox = rmesa->state.scissor.numClipRects;
-      cmd.boxes = (drm_clip_rect_t *)rmesa->state.scissor.pClipRects;
-   } else {
-      cmd.nbox = rmesa->numClipRects;
-      cmd.boxes = (drm_clip_rect_t *)rmesa->pClipRects;
-   }
-
-   ret = drmCommandWrite( rmesa->dri.fd,
-			  DRM_RADEON_CMDBUF,
-			  &cmd, sizeof(cmd) );
-
-   if (ret)
-      fprintf(stderr, "drmCommandWrite: %d\n", ret);
-
-   if (R200_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "\nSyncing in %s\n\n", __FUNCTION__);
-      r200WaitForIdleLocked( rmesa );
-   }
-
-
- out:
-   rmesa->store.primnr = 0;
-   rmesa->store.statenr = 0;
-   rmesa->store.cmd_used = 0;
-   rmesa->dma.nr_released_bufs = 0;
-   rmesa->save_on_next_emit = 1;
-
-   return ret;
-}
-
-
-/* Note: does not emit any commands to avoid recursion on
- * r200AllocCmdBuf.
- */
-void r200FlushCmdBuf( r200ContextPtr rmesa, const char *caller )
-{
-   int ret;
-
-   LOCK_HARDWARE( rmesa );
-
-   ret = r200FlushCmdBufLocked( rmesa, caller );
-
-   UNLOCK_HARDWARE( rmesa );
-
-   if (ret) {
-      fprintf(stderr, "drmRadeonCmdBuffer: %d (exiting)\n", ret);
-      exit(ret);
-   }
-}
-
-
-/* =============================================================
- * Hardware vertex buffer handling
- */
-
-
-void r200RefillCurrentDmaRegion( r200ContextPtr rmesa )
-{
-   struct r200_dma_buffer *dmabuf;
-   int fd = rmesa->dri.fd;
-   int index = 0;
-   int size = 0;
-   drmDMAReq dma;
-   int ret;
-
-   if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_DMA))
-      fprintf(stderr, "%s\n", __FUNCTION__);  
-
-   if (rmesa->dma.flush) {
-      rmesa->dma.flush( rmesa );
-   }
-
-   if (rmesa->dma.current.buf)
-      r200ReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
-
-   if (rmesa->dma.nr_released_bufs > 4)
-      r200FlushCmdBuf( rmesa, __FUNCTION__ );
-
-   dma.context = rmesa->dri.hwContext;
-   dma.send_count = 0;
-   dma.send_list = NULL;
-   dma.send_sizes = NULL;
-   dma.flags = 0;
-   dma.request_count = 1;
-   dma.request_size = RADEON_BUFFER_SIZE;
-   dma.request_list = &index;
-   dma.request_sizes = &size;
-   dma.granted_count = 0;
-
-   LOCK_HARDWARE(rmesa);	/* no need to validate */
-
-   while (1) {
-      ret = drmDMA( fd, &dma );
-      if (ret == 0)
-	 break;
-   
-      if (rmesa->dma.nr_released_bufs) {
-	 r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
-      }
-
-      if (rmesa->do_usleeps) {
-	 UNLOCK_HARDWARE( rmesa );
-	 DO_USLEEP( 1 );
-	 LOCK_HARDWARE( rmesa );
-      }
-   }
-
-   UNLOCK_HARDWARE(rmesa);
-
-   if (R200_DEBUG & DEBUG_DMA)
-      fprintf(stderr, "Allocated buffer %d\n", index);
-
-   dmabuf = CALLOC_STRUCT( r200_dma_buffer );
-   dmabuf->buf = &rmesa->r200Screen->buffers->list[index];
-   dmabuf->refcount = 1;
-
-   rmesa->dma.current.buf = dmabuf;
-   rmesa->dma.current.address = dmabuf->buf->address;
-   rmesa->dma.current.end = dmabuf->buf->total;
-   rmesa->dma.current.start = 0;
-   rmesa->dma.current.ptr = 0;
-}
-
-void r200ReleaseDmaRegion( r200ContextPtr rmesa,
-			     struct r200_dma_region *region,
-			     const char *caller )
-{
-   if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s from %s\n", __FUNCTION__, caller); 
-   
-   if (!region->buf)
-      return;
-
-   if (rmesa->dma.flush)
-      rmesa->dma.flush( rmesa );
-
-   if (--region->buf->refcount == 0) {
-      drm_radeon_cmd_header_t *cmd;
-
-      if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_DMA))
-	 fprintf(stderr, "%s -- DISCARD BUF %d\n", __FUNCTION__,
-		 region->buf->buf->idx);  
-      
-      cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, sizeof(*cmd), 
-						     __FUNCTION__ );
-      cmd->dma.cmd_type = RADEON_CMD_DMA_DISCARD;
-      cmd->dma.buf_idx = region->buf->buf->idx;
-      FREE(region->buf);
-      rmesa->dma.nr_released_bufs++;
-   }
-
-   region->buf = NULL;
-   region->start = 0;
-}
-
-/* Allocates a region from rmesa->dma.current.  If there isn't enough
- * space in current, grab a new buffer (and discard what was left of current)
- */
-void r200AllocDmaRegion( r200ContextPtr rmesa, 
-			   struct r200_dma_region *region,
-			   int bytes,
-			   int alignment )
-{
-   if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
-
-   if (rmesa->dma.flush)
-      rmesa->dma.flush( rmesa );
-
-   if (region->buf)
-      r200ReleaseDmaRegion( rmesa, region, __FUNCTION__ );
-
-   alignment--;
-   rmesa->dma.current.start = rmesa->dma.current.ptr = 
-      (rmesa->dma.current.ptr + alignment) & ~alignment;
-
-   if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
-      r200RefillCurrentDmaRegion( rmesa );
-
-   region->start = rmesa->dma.current.start;
-   region->ptr = rmesa->dma.current.start;
-   region->end = rmesa->dma.current.start + bytes;
-   region->address = rmesa->dma.current.address;
-   region->buf = rmesa->dma.current.buf;
-   region->buf->refcount++;
-
-   rmesa->dma.current.ptr += bytes; /* bug - if alignment > 7 */
-   rmesa->dma.current.start = 
-      rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;  
-
-   assert( rmesa->dma.current.ptr <= rmesa->dma.current.end );
-}
-
-/* ================================================================
- * SwapBuffers with client-side throttling
- */
-
-static uint32_t r200GetLastFrame(r200ContextPtr rmesa)
-{
-   drm_radeon_getparam_t gp;
-   int ret;
-   uint32_t frame;
-
-   gp.param = RADEON_PARAM_LAST_FRAME;
-   gp.value = (int *)&frame;
-   ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_GETPARAM,
-			      &gp, sizeof(gp) );
-   if ( ret ) {
-      fprintf( stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__, ret );
-      exit(1);
-   }
-
-   return frame;
-}
-
-static void r200EmitIrqLocked( r200ContextPtr rmesa )
-{
-   drm_radeon_irq_emit_t ie;
-   int ret;
-
-   ie.irq_seq = &rmesa->iw.irq_seq;
-   ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_IRQ_EMIT, 
-			      &ie, sizeof(ie) );
-   if ( ret ) {
-      fprintf( stderr, "%s: drmRadeonIrqEmit: %d\n", __FUNCTION__, ret );
-      exit(1);
-   }
-}
-
-
-static void r200WaitIrq( r200ContextPtr rmesa )
-{
-   int ret;
-
-   do {
-      ret = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_IRQ_WAIT,
-			     &rmesa->iw, sizeof(rmesa->iw) );
-   } while (ret && (errno == EINTR || errno == EBUSY));
-
-   if ( ret ) {
-      fprintf( stderr, "%s: drmRadeonIrqWait: %d\n", __FUNCTION__, ret );
-      exit(1);
-   }
-}
-
-
-static void r200WaitForFrameCompletion( r200ContextPtr rmesa )
-{
-   drm_radeon_sarea_t *sarea = rmesa->sarea;
-
-   if (rmesa->do_irqs) {
-      if (r200GetLastFrame(rmesa) < sarea->last_frame) {
-	 if (!rmesa->irqsEmitted) {
-	    while (r200GetLastFrame (rmesa) < sarea->last_frame)
-	       ;
-	 }
-	 else {
-	    UNLOCK_HARDWARE( rmesa ); 
-	    r200WaitIrq( rmesa );	
-	    LOCK_HARDWARE( rmesa ); 
-	 }
-	 rmesa->irqsEmitted = 10;
-      }
-
-      if (rmesa->irqsEmitted) {
-	 r200EmitIrqLocked( rmesa );
-	 rmesa->irqsEmitted--;
-      }
-   } 
-   else {
-      while (r200GetLastFrame (rmesa) < sarea->last_frame) {
-	 UNLOCK_HARDWARE( rmesa ); 
-	 if (rmesa->do_usleeps) 
-	    DO_USLEEP( 1 );
-	 LOCK_HARDWARE( rmesa ); 
-      }
-   }
-}
-
-
-
-/* Copy the back color buffer to the front color buffer.
- */
-void r200CopyBuffer( __DRIdrawablePrivate *dPriv,
-		      const drm_clip_rect_t	 *rect)
-{
-   r200ContextPtr rmesa;
-   GLint nbox, i, ret;
-   GLboolean   missed_target;
-   int64_t ust;
-   __DRIscreenPrivate *psp = dPriv->driScreenPriv;
-
-   assert(dPriv);
-   assert(dPriv->driContextPriv);
-   assert(dPriv->driContextPriv->driverPrivate);
-
-   rmesa = (r200ContextPtr) dPriv->driContextPriv->driverPrivate;
-
-   if ( R200_DEBUG & DEBUG_IOCTL ) {
-      fprintf( stderr, "\n%s( %p )\n\n", __FUNCTION__, (void *)rmesa->glCtx );
-   }
-
-   R200_FIREVERTICES( rmesa );
-
-   LOCK_HARDWARE( rmesa );
-
-
-   /* Throttle the frame rate -- only allow one pending swap buffers
-    * request at a time.
-    */
-   r200WaitForFrameCompletion( rmesa );
-   if (!rect)
-   {
-       UNLOCK_HARDWARE( rmesa );
-       driWaitForVBlank( dPriv, & missed_target );
-       LOCK_HARDWARE( rmesa );
-   }
-
-   nbox = dPriv->numClipRects; /* must be in locked region */
-
-   for ( i = 0 ; i < nbox ; ) {
-      GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS , nbox );
-      drm_clip_rect_t *box = dPriv->pClipRects;
-      drm_clip_rect_t *b = rmesa->sarea->boxes;
-      GLint n = 0;
-
-      for ( ; i < nr ; i++ ) {
-
-	  *b = box[i];
-
-	  if (rect)
-	  {
-	     if (rect->x1 > b->x1)
-		 b->x1 = rect->x1;
-	     if (rect->y1 > b->y1)
-		 b->y1 = rect->y1;
-	     if (rect->x2 < b->x2)
-		 b->x2 = rect->x2;
-	     if (rect->y2 < b->y2)
-		 b->y2 = rect->y2;
-
-	     if (b->x1 >= b->x2 || b->y1 >= b->y2)
-		 continue;
-	  }
-
-	  b++;
-	  n++;
-      }
-      rmesa->sarea->nbox = n;
-
-      if (!n)
-	 continue;
-
-      ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_SWAP );
-
-      if ( ret ) {
-	 fprintf( stderr, "DRM_R200_SWAP_BUFFERS: return = %d\n", ret );
-	 UNLOCK_HARDWARE( rmesa );
-	 exit( 1 );
-      }
-   }
-
-   UNLOCK_HARDWARE( rmesa );
-   if (!rect)
-   {
-       rmesa->hw.all_dirty = GL_TRUE;
-
-       rmesa->swap_count++;
-       (*psp->systemTime->getUST)( & ust );
-       if ( missed_target ) {
-	   rmesa->swap_missed_count++;
-	   rmesa->swap_missed_ust = ust - rmesa->swap_ust;
-       }
-
-       rmesa->swap_ust = ust;
-
-       sched_yield();
-   }
-}
-
-void r200PageFlip( __DRIdrawablePrivate *dPriv )
-{
-   r200ContextPtr rmesa;
-   GLint ret;
-   GLboolean   missed_target;
-   __DRIscreenPrivate *psp = dPriv->driScreenPriv;
-
-   assert(dPriv);
-   assert(dPriv->driContextPriv);
-   assert(dPriv->driContextPriv->driverPrivate);
-
-   rmesa = (r200ContextPtr) dPriv->driContextPriv->driverPrivate;
-
-   if ( R200_DEBUG & DEBUG_IOCTL ) {
-      fprintf(stderr, "%s: pfCurrentPage: %d\n", __FUNCTION__,
-	      rmesa->sarea->pfCurrentPage);
-   }
-
-   R200_FIREVERTICES( rmesa );
-   LOCK_HARDWARE( rmesa );
-
-   if (!dPriv->numClipRects) {
-      UNLOCK_HARDWARE( rmesa );
-      usleep( 10000 );		/* throttle invisible client 10ms */
-      return;
-   }
-
-   /* Need to do this for the perf box placement:
-    */
-   {
-      drm_clip_rect_t *box = dPriv->pClipRects;
-      drm_clip_rect_t *b = rmesa->sarea->boxes;
-      b[0] = box[0];
-      rmesa->sarea->nbox = 1;
-   }
-
-   /* Throttle the frame rate -- only allow a few pending swap buffers
-    * request at a time.
-    */
-   r200WaitForFrameCompletion( rmesa );
-   UNLOCK_HARDWARE( rmesa );
-   driWaitForVBlank( dPriv, & missed_target );
-   if ( missed_target ) {
-      rmesa->swap_missed_count++;
-      (void) (*psp->systemTime->getUST)( & rmesa->swap_missed_ust );
-   }
-   LOCK_HARDWARE( rmesa );
-
-   ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_FLIP );
-
-   UNLOCK_HARDWARE( rmesa );
-
-   if ( ret ) {
-      fprintf( stderr, "DRM_RADEON_FLIP: return = %d\n", ret );
-      exit( 1 );
-   }
-
-   rmesa->swap_count++;
-   (void) (*psp->systemTime->getUST)( & rmesa->swap_ust );
-
-#if 000
-   if ( rmesa->sarea->pfCurrentPage == 1 ) {
-	 rmesa->state.color.drawOffset = rmesa->r200Screen->frontOffset;
-	 rmesa->state.color.drawPitch  = rmesa->r200Screen->frontPitch;
-   } else {
-	 rmesa->state.color.drawOffset = rmesa->r200Screen->backOffset;
-	 rmesa->state.color.drawPitch  = rmesa->r200Screen->backPitch;
-   }
-
-   R200_STATECHANGE( rmesa, ctx );
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] = rmesa->state.color.drawOffset
-					   + rmesa->r200Screen->fbLocation;
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH]  = rmesa->state.color.drawPitch;
-   if (rmesa->sarea->tiling_enabled) {
-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
-   }
-#else
-   /* Get ready for drawing next frame.  Update the renderbuffers'
-    * flippedOffset/Pitch fields so we draw into the right place.
-    */
-   driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
-                        rmesa->sarea->pfCurrentPage);
-
-
-   r200UpdateDrawBuffer(rmesa->glCtx);
-#endif
-}
-
-
-/* ================================================================
- * Buffer clear
- */
-static void r200Clear( GLcontext *ctx, GLbitfield mask )
+static void r200KernelClear(GLcontext *ctx, GLuint flags)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
-   GLuint flags = 0;
-   GLuint color_mask = 0;
-   GLint ret, i;
-   GLint cx, cy, cw, ch;
-
-   if ( R200_DEBUG & DEBUG_IOCTL ) {
-      fprintf( stderr, "r200Clear\n");
-   }
-
-   {
-      LOCK_HARDWARE( rmesa );
-      UNLOCK_HARDWARE( rmesa );
-      if ( dPriv->numClipRects == 0 ) 
-	 return;
-   }
-
-   r200Flush( ctx );
-
-   if ( mask & BUFFER_BIT_FRONT_LEFT ) {
-      flags |= RADEON_FRONT;
-      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
-      mask &= ~BUFFER_BIT_FRONT_LEFT;
-   }
-
-   if ( mask & BUFFER_BIT_BACK_LEFT ) {
-      flags |= RADEON_BACK;
-      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
-      mask &= ~BUFFER_BIT_BACK_LEFT;
-   }
-
-   if ( mask & BUFFER_BIT_DEPTH ) {
-      flags |= RADEON_DEPTH;
-      mask &= ~BUFFER_BIT_DEPTH;
-   }
-
-   if ( (mask & BUFFER_BIT_STENCIL) && rmesa->state.stencil.hwBuffer ) {
-      flags |= RADEON_STENCIL;
-      mask &= ~BUFFER_BIT_STENCIL;
-   }
-
-   if ( mask ) {
-      if (R200_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr, "%s: swrast clear, mask: %x\n", __FUNCTION__, mask);
-      _swrast_Clear( ctx, mask );
-   }
-
-   if ( !flags ) 
-      return;
-
-   if (rmesa->using_hyperz) {
-      flags |= RADEON_USE_COMP_ZBUF;
-/*      if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200)
-	 flags |= RADEON_USE_HIERZ; */
-      if (!(rmesa->state.stencil.hwBuffer) ||
-	 ((flags & RADEON_DEPTH) && (flags & RADEON_STENCIL) &&
-	    ((rmesa->state.stencil.clear & R200_STENCIL_WRITE_MASK) == R200_STENCIL_WRITE_MASK))) {
-	  flags |= RADEON_CLEAR_FASTZ;
-      }
-   }
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   GLint cx, cy, cw, ch, ret;
+   GLuint i;
 
-   LOCK_HARDWARE( rmesa );
-
-   /* compute region after locking: */
-   cx = ctx->DrawBuffer->_Xmin;
-   cy = ctx->DrawBuffer->_Ymin;
-   cw = ctx->DrawBuffer->_Xmax - cx;
-   ch = ctx->DrawBuffer->_Ymax - cy;
-
-   /* Flip top to bottom */
-   cx += dPriv->x;
-   cy  = dPriv->y + dPriv->h - cy - ch;
+   LOCK_HARDWARE( &rmesa->radeon );
 
    /* Throttle the number of clear ioctls we do.
     */
@@ -693,7 +76,7 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask )
 
       gp.param = RADEON_PARAM_LAST_CLEAR;
       gp.value = (int *)&clear;
-      ret = drmCommandWriteRead( rmesa->dri.fd,
+      ret = drmCommandWriteRead( rmesa->radeon.dri.fd,
 		      DRM_RADEON_GETPARAM, &gp, sizeof(gp) );
 
       if ( ret ) {
@@ -703,24 +86,34 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask )
 
       /* Clear throttling needs more thought.
        */
-      if ( rmesa->sarea->last_clear - clear <= 25 ) {
+      if ( rmesa->radeon.sarea->last_clear - clear <= 25 ) {
 	 break;
       }
-      
-      if (rmesa->do_usleeps) {
-	 UNLOCK_HARDWARE( rmesa );
+
+      if (rmesa->radeon.do_usleeps) {
+	 UNLOCK_HARDWARE( &rmesa->radeon );
 	 DO_USLEEP( 1 );
-	 LOCK_HARDWARE( rmesa );
+	 LOCK_HARDWARE( &rmesa->radeon );
       }
    }
 
    /* Send current state to the hardware */
-   r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+   rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
+
+
+  /* compute region after locking: */
+   cx = ctx->DrawBuffer->_Xmin;
+   cy = ctx->DrawBuffer->_Ymin;
+   cw = ctx->DrawBuffer->_Xmax - cx;
+   ch = ctx->DrawBuffer->_Ymax - cy;
 
+   /* Flip top to bottom */
+   cx += dPriv->x;
+   cy  = dPriv->y + dPriv->h - cy - ch;
    for ( i = 0 ; i < dPriv->numClipRects ; ) {
       GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS, dPriv->numClipRects );
       drm_clip_rect_t *box = dPriv->pClipRects;
-      drm_clip_rect_t *b = rmesa->sarea->boxes;
+      drm_clip_rect_t *b = rmesa->radeon.sarea->boxes;
       drm_radeon_clear_t clear;
       drm_radeon_clear_rect_t depth_boxes[RADEON_NR_SAREA_CLIPRECTS];
       GLint n = 0;
@@ -755,17 +148,17 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask )
 	 }
       }
 
-      rmesa->sarea->nbox = n;
+      rmesa->radeon.sarea->nbox = n;
 
       clear.flags       = flags;
-      clear.clear_color = rmesa->state.color.clear;
-      clear.clear_depth = rmesa->state.depth.clear;	/* needed for hyperz */
+      clear.clear_color = rmesa->radeon.state.color.clear;
+      clear.clear_depth = rmesa->radeon.state.depth.clear;	/* needed for hyperz */
       clear.color_mask  = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
-      clear.depth_mask  = rmesa->state.stencil.clear;
+      clear.depth_mask  = rmesa->radeon.state.stencil.clear;
       clear.depth_boxes = depth_boxes;
 
       n--;
-      b = rmesa->sarea->boxes;
+      b = rmesa->radeon.sarea->boxes;
       for ( ; n >= 0 ; n-- ) {
 	 depth_boxes[n].f[CLEAR_X1] = (float)b[n].x1;
 	 depth_boxes[n].f[CLEAR_Y1] = (float)b[n].y1;
@@ -774,84 +167,94 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask )
 	 depth_boxes[n].f[CLEAR_DEPTH] = ctx->Depth.Clear;
       }
 
-      ret = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_CLEAR,
+      ret = drmCommandWrite( rmesa->radeon.dri.fd, DRM_RADEON_CLEAR,
 			     &clear, sizeof(clear));
 
 
       if ( ret ) {
-	 UNLOCK_HARDWARE( rmesa );
+	 UNLOCK_HARDWARE( &rmesa->radeon );
 	 fprintf( stderr, "DRM_RADEON_CLEAR: return = %d\n", ret );
 	 exit( 1 );
       }
    }
-
-   UNLOCK_HARDWARE( rmesa );
-   rmesa->hw.all_dirty = GL_TRUE;
+   UNLOCK_HARDWARE( &rmesa->radeon );
 }
+/* ================================================================
+ * Buffer clear
+ */
+static void r200Clear( GLcontext *ctx, GLbitfield mask )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   GLuint flags = 0;
+   GLuint color_mask = 0;
+   GLuint orig_mask = mask;
 
+   if ( R200_DEBUG & RADEON_IOCTL ) {
+	   if (rmesa->radeon.sarea)
+	       fprintf( stderr, "r200Clear %x %d\n", mask, rmesa->radeon.sarea->pfCurrentPage);
+	   else
+	       fprintf( stderr, "r200Clear %x radeon->sarea is NULL\n", mask);
+   }
 
-void r200WaitForIdleLocked( r200ContextPtr rmesa )
-{
-    int ret;
-    int i = 0;
-    
-    do {
-       ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_CP_IDLE);
-       if (ret) 
-	  DO_USLEEP( 1 );
-    } while (ret && ++i < 100);
-    
-    if ( ret < 0 ) {
-       UNLOCK_HARDWARE( rmesa );
-       fprintf( stderr, "Error: R200 timed out... exiting\n" );
-       exit( -1 );
-    }
-}
+   {
+      LOCK_HARDWARE( &rmesa->radeon );
+      UNLOCK_HARDWARE( &rmesa->radeon );
+      if ( dPriv->numClipRects == 0 )
+	 return;
+   }
 
+   radeonFlush( ctx );
 
-static void r200WaitForIdle( r200ContextPtr rmesa )
-{
-   LOCK_HARDWARE(rmesa);
-   r200WaitForIdleLocked( rmesa );
-   UNLOCK_HARDWARE(rmesa);
-}
+   if ( mask & BUFFER_BIT_FRONT_LEFT ) {
+      flags |= RADEON_FRONT;
+      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
+      mask &= ~BUFFER_BIT_FRONT_LEFT;
+   }
 
+   if ( mask & BUFFER_BIT_BACK_LEFT ) {
+      flags |= RADEON_BACK;
+      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
+      mask &= ~BUFFER_BIT_BACK_LEFT;
+   }
 
-void r200Flush( GLcontext *ctx )
-{
-   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+   if ( mask & BUFFER_BIT_DEPTH ) {
+      flags |= RADEON_DEPTH;
+      mask &= ~BUFFER_BIT_DEPTH;
+   }
 
-   if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
+   if ( (mask & BUFFER_BIT_STENCIL) ) {
+      flags |= RADEON_STENCIL;
+      mask &= ~BUFFER_BIT_STENCIL;
+   }
 
-   if (rmesa->dma.flush)
-      rmesa->dma.flush( rmesa );
+   if ( mask ) {
+      if (R200_DEBUG & RADEON_FALLBACKS)
+	 fprintf(stderr, "%s: swrast clear, mask: %x\n", __FUNCTION__, mask);
+      _swrast_Clear( ctx, mask );
+   }
 
-   r200EmitState( rmesa );
-   
-   if (rmesa->store.cmd_used)
-      r200FlushCmdBuf( rmesa, __FUNCTION__ );
-}
+   if ( !flags )
+      return;
 
-/* Make sure all commands have been sent to the hardware and have
- * completed processing.
- */
-void r200Finish( GLcontext *ctx )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   r200Flush( ctx );
+   if (rmesa->using_hyperz) {
+      flags |= RADEON_USE_COMP_ZBUF;
+/*      if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R200)
+	 flags |= RADEON_USE_HIERZ; */
+      if (!((flags & RADEON_DEPTH) && (flags & RADEON_STENCIL) &&
+	    ((rmesa->radeon.state.stencil.clear & R200_STENCIL_WRITE_MASK) == R200_STENCIL_WRITE_MASK))) {
+	  flags |= RADEON_CLEAR_FASTZ;
+      }
+   }
 
-   if (rmesa->do_irqs) {
-      LOCK_HARDWARE( rmesa );
-      r200EmitIrqLocked( rmesa );
-      UNLOCK_HARDWARE( rmesa );
-      r200WaitIrq( rmesa );
+   if (rmesa->radeon.radeonScreen->kernel_mm)
+      radeonUserClear(ctx, orig_mask);
+   else {
+      r200KernelClear(ctx, flags);
+      rmesa->radeon.hw.all_dirty = GL_TRUE;
    }
-   else 
-      r200WaitForIdle( rmesa );
 }
 
-
 /* This version of AllocateMemoryMESA allocates only GART memory, and
  * only does so after the point at which the driver has been
  * initialized.
@@ -862,7 +265,7 @@ void r200Finish( GLcontext *ctx )
  * device fd.
  */
 void *r200AllocateMemoryMESA(__DRIscreen *screen, GLsizei size,
-			     GLfloat readfreq, GLfloat writefreq, 
+			     GLfloat readfreq, GLfloat writefreq,
 			     GLfloat priority)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -871,11 +274,11 @@ void *r200AllocateMemoryMESA(__DRIscreen *screen, GLsizei size,
    drm_radeon_mem_alloc_t alloc;
    int ret;
 
-   if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s sz %d %f/%f/%f\n", __FUNCTION__, size, readfreq, 
+   if (R200_DEBUG & RADEON_IOCTL)
+      fprintf(stderr, "%s sz %d %f/%f/%f\n", __FUNCTION__, size, readfreq,
 	      writefreq, priority);
 
-   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->r200Screen->gartTextures.map)
+   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->radeon.radeonScreen->gartTextures.map)
       return NULL;
 
    if (getenv("R200_NO_ALLOC"))
@@ -886,17 +289,17 @@ void *r200AllocateMemoryMESA(__DRIscreen *screen, GLsizei size,
    alloc.size = size;
    alloc.region_offset = &region_offset;
 
-   ret = drmCommandWriteRead( rmesa->r200Screen->driScreen->fd,
+   ret = drmCommandWriteRead( rmesa->radeon.radeonScreen->driScreen->fd,
 			      DRM_RADEON_ALLOC,
 			      &alloc, sizeof(alloc));
-   
+
    if (ret) {
       fprintf(stderr, "%s: DRM_RADEON_ALLOC ret %d\n", __FUNCTION__, ret);
       return NULL;
    }
-   
+
    {
-      char *region_start = (char *)rmesa->r200Screen->gartTextures.map;
+      char *region_start = (char *)rmesa->radeon.radeonScreen->gartTextures.map;
       return (void *)(region_start + region_offset);
    }
 }
@@ -911,31 +314,31 @@ void r200FreeMemoryMESA(__DRIscreen *screen, GLvoid *pointer)
    drm_radeon_mem_free_t memfree;
    int ret;
 
-   if (R200_DEBUG & DEBUG_IOCTL)
+   if (R200_DEBUG & RADEON_IOCTL)
       fprintf(stderr, "%s %p\n", __FUNCTION__, pointer);
 
-   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->r200Screen->gartTextures.map) {
+   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->radeon.radeonScreen->gartTextures.map) {
       fprintf(stderr, "%s: no context\n", __FUNCTION__);
       return;
    }
 
-   region_offset = (char *)pointer - (char *)rmesa->r200Screen->gartTextures.map;
+   region_offset = (char *)pointer - (char *)rmesa->radeon.radeonScreen->gartTextures.map;
 
-   if (region_offset < 0 || 
-       region_offset > rmesa->r200Screen->gartTextures.size) {
+   if (region_offset < 0 ||
+       region_offset > rmesa->radeon.radeonScreen->gartTextures.size) {
       fprintf(stderr, "offset %d outside range 0..%d\n", region_offset,
-	      rmesa->r200Screen->gartTextures.size);
+	      rmesa->radeon.radeonScreen->gartTextures.size);
       return;
    }
 
    memfree.region = RADEON_MEM_REGION_GART;
    memfree.region_offset = region_offset;
-   
-   ret = drmCommandWrite( rmesa->r200Screen->driScreen->fd,
+
+   ret = drmCommandWrite( rmesa->radeon.radeonScreen->driScreen->fd,
 			  DRM_RADEON_FREE,
 			  &memfree, sizeof(memfree));
-   
-   if (ret) 
+
+   if (ret)
       fprintf(stderr, "%s: DRM_RADEON_FREE ret %d\n", __FUNCTION__, ret);
 }
 
@@ -956,32 +359,32 @@ GLuint r200GetMemoryOffsetMESA(__DRIscreen *screen, const GLvoid *pointer)
 
    card_offset = r200GartOffsetFromVirtual( rmesa, pointer );
 
-   return card_offset - rmesa->r200Screen->gart_base;
+   return card_offset - rmesa->radeon.radeonScreen->gart_base;
 }
 
 GLboolean r200IsGartMemory( r200ContextPtr rmesa, const GLvoid *pointer,
 			   GLint size )
 {
-   ptrdiff_t offset = (char *)pointer - (char *)rmesa->r200Screen->gartTextures.map;
+   ptrdiff_t offset = (char *)pointer - (char *)rmesa->radeon.radeonScreen->gartTextures.map;
    int valid = (size >= 0 &&
 		offset >= 0 &&
-		offset + size < rmesa->r200Screen->gartTextures.size);
+		offset + size < rmesa->radeon.radeonScreen->gartTextures.size);
 
-   if (R200_DEBUG & DEBUG_IOCTL)
+   if (R200_DEBUG & RADEON_IOCTL)
       fprintf(stderr, "r200IsGartMemory( %p ) : %d\n", pointer, valid );
-   
+
    return valid;
 }
 
 
 GLuint r200GartOffsetFromVirtual( r200ContextPtr rmesa, const GLvoid *pointer )
 {
-   ptrdiff_t offset = (char *)pointer - (char *)rmesa->r200Screen->gartTextures.map;
+   ptrdiff_t offset = (char *)pointer - (char *)rmesa->radeon.radeonScreen->gartTextures.map;
 
-   if (offset < 0 || offset > rmesa->r200Screen->gartTextures.size)
+   if (offset < 0 || offset > rmesa->radeon.radeonScreen->gartTextures.size)
       return ~0;
    else
-      return rmesa->r200Screen->gart_texture_offset + offset;
+      return rmesa->radeon.radeonScreen->gart_texture_offset + offset;
 }
 
 
@@ -989,7 +392,7 @@ GLuint r200GartOffsetFromVirtual( r200ContextPtr rmesa, const GLvoid *pointer )
 void r200InitIoctlFuncs( struct dd_function_table *functions )
 {
     functions->Clear = r200Clear;
-    functions->Finish = r200Finish;
-    functions->Flush = r200Flush;
+    functions->Finish = radeonFinish;
+    functions->Flush = radeonFlush;
 }
 
diff --git a/src/mesa/drivers/dri/r200/r200_ioctl.h b/src/mesa/drivers/dri/r200/r200_ioctl.h
index f7458e4a0e..8d51aefa04 100644
--- a/src/mesa/drivers/dri/r200/r200_ioctl.h
+++ b/src/mesa/drivers/dri/r200/r200_ioctl.h
@@ -37,65 +37,31 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "main/simple_list.h"
 #include "radeon_dri.h"
-#include "r200_lock.h"
+
+#include "radeon_bocs_wrapper.h"
 
 #include "xf86drm.h"
 #include "drm.h"
 #include "radeon_drm.h"
 
-extern void r200EmitState( r200ContextPtr rmesa );
+extern void r200EmitMaxVtxIndex(r200ContextPtr rmesa, int count);
 extern void r200EmitVertexAOS( r200ContextPtr rmesa,
-				 GLuint vertex_size,
-				 GLuint offset );
+			       GLuint vertex_size,
+			       struct radeon_bo *bo,
+			       GLuint offset );
 
 extern void r200EmitVbufPrim( r200ContextPtr rmesa,
 				GLuint primitive,
 				GLuint vertex_nr );
 
-extern void r200FlushElts( r200ContextPtr rmesa );
+extern void r200FlushElts(GLcontext *ctx);
 
 extern GLushort *r200AllocEltsOpenEnded( r200ContextPtr rmesa,
 					   GLuint primitive,
 					   GLuint min_nr );
 
-extern void r200EmitAOS( r200ContextPtr rmesa,
-			   struct r200_dma_region **regions,
-			   GLuint n,
-			   GLuint offset );
-
-extern void r200EmitBlit( r200ContextPtr rmesa,
-			  GLuint color_fmt,
-			  GLuint src_pitch,
-			  GLuint src_offset,
-			  GLuint dst_pitch,
-			  GLuint dst_offset,
-			  GLint srcx, GLint srcy,
-			  GLint dstx, GLint dsty,
-			  GLuint w, GLuint h );
-
-extern void r200EmitWait( r200ContextPtr rmesa, GLuint flags );
-
-extern void r200FlushCmdBuf( r200ContextPtr rmesa, const char * );
-extern int r200FlushCmdBufLocked( r200ContextPtr rmesa, const char * caller );
-
-extern void r200RefillCurrentDmaRegion( r200ContextPtr rmesa );
-
-extern void r200AllocDmaRegion( r200ContextPtr rmesa,
-				  struct r200_dma_region *region,
-				  int bytes, 
-				  int alignment );
-
-extern void r200ReleaseDmaRegion( r200ContextPtr rmesa,
-				    struct r200_dma_region *region,
-				    const char *caller );
-
-extern void r200CopyBuffer( __DRIdrawablePrivate *drawable,
-			    const drm_clip_rect_t      *rect);
-extern void r200PageFlip( __DRIdrawablePrivate *drawable );
-extern void r200Flush( GLcontext *ctx );
-extern void r200Finish( GLcontext *ctx );
-extern void r200WaitForIdleLocked( r200ContextPtr rmesa );
-extern void r200WaitForVBlank( r200ContextPtr rmesa );
+extern void r200EmitAOS(r200ContextPtr rmesa, GLuint nr, GLuint offset);
+
 extern void r200InitIoctlFuncs( struct dd_function_table *functions );
 
 extern void *r200AllocateMemoryMESA( __DRIscreen *screen, GLsizei size, GLfloat readfreq,
@@ -119,8 +85,8 @@ void r200SetUpAtomList( r200ContextPtr rmesa );
  */
 #define R200_NEWPRIM( rmesa )			\
 do {						\
-   if ( rmesa->dma.flush )			\
-      rmesa->dma.flush( rmesa );	\
+   if ( rmesa->radeon.dma.flush )			\
+      rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	\
 } while (0)
 
 /* Can accomodate several state changes and primitive changes without
@@ -130,22 +96,32 @@ do {						\
 do {								\
    R200_NEWPRIM( rmesa );					\
    rmesa->hw.ATOM.dirty = GL_TRUE;				\
-   rmesa->hw.is_dirty = GL_TRUE;				\
+   rmesa->radeon.hw.is_dirty = GL_TRUE;				\
 } while (0)
 
+#define R200_SET_STATE( rmesa, ATOM, index, newvalue ) 	\
+  do {	\
+    uint32_t __index = (index); \
+    uint32_t __dword = (newvalue); \
+    if (__dword != (rmesa)->hw.ATOM.cmd[__index]) { \
+      R200_STATECHANGE( (rmesa), ATOM ); \
+      (rmesa)->hw.ATOM.cmd[__index] = __dword; \
+    } \
+  } while(0)
+
 #define R200_DB_STATE( ATOM )			        \
    memcpy( rmesa->hw.ATOM.lastcmd, rmesa->hw.ATOM.cmd,	\
 	   rmesa->hw.ATOM.cmd_size * 4)
 
 static INLINE int R200_DB_STATECHANGE( 
    r200ContextPtr rmesa,
-   struct r200_state_atom *atom )
+   struct radeon_state_atom *atom )
 {
    if (memcmp(atom->cmd, atom->lastcmd, atom->cmd_size*4)) {
-      int *tmp;
+      GLuint *tmp;
       R200_NEWPRIM( rmesa );
       atom->dirty = GL_TRUE;
-      rmesa->hw.is_dirty = GL_TRUE;
+      rmesa->radeon.hw.is_dirty = GL_TRUE;
       tmp = atom->cmd; 
       atom->cmd = atom->lastcmd;
       atom->lastcmd = tmp;
@@ -156,54 +132,47 @@ static INLINE int R200_DB_STATECHANGE(
 }
 
 
-/* Fire the buffered vertices no matter what.
- */
-#define R200_FIREVERTICES( rmesa )			\
-do {							\
-   if ( rmesa->store.cmd_used || rmesa->dma.flush ) {	\
-      r200Flush( rmesa->glCtx );			\
-   }							\
-} while (0)
-
 /* Command lengths.  Note that any time you ensure ELTS_BUFSZ or VBUF_BUFSZ
  * are available, you will also be adding an rmesa->state.max_state_size because
  * r200EmitState is called from within r200EmitVbufPrim and r200FlushElts.
  */
-#define AOS_BUFSZ(nr)	((3 + ((nr / 2) * 3) + ((nr & 1) * 2)) * sizeof(int))
-#define VERT_AOS_BUFSZ	(5 * sizeof(int))
+#define AOS_BUFSZ(nr)	((3 + ((nr / 2) * 3) + ((nr & 1) * 2) + nr*2))
+#define VERT_AOS_BUFSZ	(5)
 #define ELTS_BUFSZ(nr)	(12 + nr * 2)
-#define VBUF_BUFSZ	(3 * sizeof(int))
-
-/* Ensure that a minimum amount of space is available in the command buffer.
- * This is used to ensure atomicity of state updates with the rendering requests
- * that rely on them.
- *
- * An alternative would be to implement a "soft lock" such that when the buffer
- * wraps at an inopportune time, we grab the lock, flush the current buffer,
- * and hang on to the lock until the critical section is finished and we flush
- * the buffer again and unlock.
- */
-static INLINE void r200EnsureCmdBufSpace( r200ContextPtr rmesa, int bytes )
-{
-   if (rmesa->store.cmd_used + bytes > R200_CMD_BUF_SZ)
-      r200FlushCmdBuf( rmesa, __FUNCTION__ );
-   assert( bytes <= R200_CMD_BUF_SZ );
-}
+#define VBUF_BUFSZ	(3)
+#define SCISSOR_BUFSZ	(8)
+#define INDEX_BUFSZ	(8+2)
 
-/* Alloc space in the command buffer
- */
-static INLINE char *r200AllocCmdBuf( r200ContextPtr rmesa,
-					 int bytes, const char *where )
+static inline uint32_t cmdpacket3(int cmd_type)
 {
-   char * head;
+  drm_radeon_cmd_header_t cmd;
 
-   if (rmesa->store.cmd_used + bytes > R200_CMD_BUF_SZ)
-      r200FlushCmdBuf( rmesa, where );
+  cmd.i = 0;
+  cmd.header.cmd_type = cmd_type;
+
+  return (uint32_t)cmd.i;
 
-   head = rmesa->store.cmd_buf + rmesa->store.cmd_used;
-   rmesa->store.cmd_used += bytes;
-   assert( rmesa->store.cmd_used <= R200_CMD_BUF_SZ );
-   return head;
 }
 
+#define OUT_BATCH_PACKET3(packet, num_extra) do {	      \
+    if (!b_l_rmesa->radeonScreen->kernel_mm) {		      \
+      OUT_BATCH(cmdpacket3(RADEON_CMD_PACKET3));				      \
+      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
+    } else {						      \
+      OUT_BATCH(CP_PACKET2);				      \
+      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
+    }							      \
+  } while(0)
+
+#define OUT_BATCH_PACKET3_CLIP(packet, num_extra) do {	      \
+    if (!b_l_rmesa->radeonScreen->kernel_mm) {		      \
+      OUT_BATCH(cmdpacket3(RADEON_CMD_PACKET3_CLIP));	      \
+      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
+    } else {						      \
+      OUT_BATCH(CP_PACKET2);				      \
+      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
+    }							      \
+  } while(0)
+
+
 #endif /* __R200_IOCTL_H__ */
diff --git a/src/mesa/drivers/dri/r200/r200_lock.c b/src/mesa/drivers/dri/r200/r200_lock.c
deleted file mode 100644
index 99661a4bfb..0000000000
--- a/src/mesa/drivers/dri/r200/r200_lock.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell <keith@tungstengraphics.com>
- */
- 
-#include "r200_context.h"
-#include "r200_lock.h"
-#include "r200_tex.h"
-#include "r200_state.h"
-#include "r200_ioctl.h"
-
-#include "drirenderbuffer.h"
-
-
-#if DEBUG_LOCKING
-char *prevLockFile = NULL;
-int prevLockLine = 0;
-#endif
-
-/* Turn on/off page flipping according to the flags in the sarea:
- */
-static void
-r200UpdatePageFlipping( r200ContextPtr rmesa )
-{
-   rmesa->doPageFlip = rmesa->sarea->pfState;
-   if (rmesa->glCtx->WinSysDrawBuffer) {
-      driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
-                           rmesa->sarea->pfCurrentPage);
-   }
-}
-
-
-
-/* Update the hardware state.  This is called if another main/context.has
- * grabbed the hardware lock, which includes the X server.  This
- * function also updates the driver's window state after the X server
- * moves, resizes or restacks a window -- the change will be reflected
- * in the drawable position and clip rects.  Since the X server grabs
- * the hardware lock when it changes the window state, this routine will
- * automatically be called after such a change.
- */
-void r200GetLock( r200ContextPtr rmesa, GLuint flags )
-{
-   __DRIdrawablePrivate *drawable = rmesa->dri.drawable;
-   __DRIdrawablePrivate *readable = rmesa->dri.readable;
-   __DRIscreenPrivate *sPriv = rmesa->dri.screen;
-   drm_radeon_sarea_t *sarea = rmesa->sarea;
-   int i;
-
-   drmGetLock( rmesa->dri.fd, rmesa->dri.hwContext, flags );
-
-   /* The window might have moved, so we might need to get new clip
-    * rects.
-    *
-    * NOTE: This releases and regrabs the hw lock to allow the X server
-    * to respond to the DRI protocol request for new drawable info.
-    * Since the hardware state depends on having the latest drawable
-    * clip rects, all state checking must be done _after_ this call.
-    */
-   DRI_VALIDATE_DRAWABLE_INFO( sPriv, drawable );
-   if (drawable != readable) {
-      DRI_VALIDATE_DRAWABLE_INFO( sPriv, readable );
-   }
-
-   if ( rmesa->lastStamp != drawable->lastStamp ) {
-      r200UpdatePageFlipping( rmesa );
-      r200SetCliprects( rmesa );
-      r200UpdateViewportOffset( rmesa->glCtx );
-      driUpdateFramebufferSize(rmesa->glCtx, drawable);
-   }
-
-   R200_STATECHANGE( rmesa, ctx );
-   if (rmesa->sarea->tiling_enabled) {
-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
-   }
-   else rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] &= ~R200_COLOR_TILE_ENABLE;
-
-   if ( sarea->ctx_owner != rmesa->dri.hwContext ) {
-      sarea->ctx_owner = rmesa->dri.hwContext;
-   }
-
-   for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
-      DRI_AGE_TEXTURES( rmesa->texture_heaps[ i ] );
-   }
-
-   rmesa->lost_context = GL_TRUE;
-}
diff --git a/src/mesa/drivers/dri/r200/r200_lock.h b/src/mesa/drivers/dri/r200/r200_lock.h
deleted file mode 100644
index 4ff98907fb..0000000000
--- a/src/mesa/drivers/dri/r200/r200_lock.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell <keith@tungstengraphics.com>
- */
-
-#ifndef __R200_LOCK_H__
-#define __R200_LOCK_H__
-
-extern void r200GetLock( r200ContextPtr rmesa, GLuint flags );
-
-/* Turn DEBUG_LOCKING on to find locking conflicts.
- */
-#define DEBUG_LOCKING	0
-
-#if DEBUG_LOCKING
-extern char *prevLockFile;
-extern int prevLockLine;
-
-#define DEBUG_LOCK()							\
-   do {									\
-      prevLockFile = (__FILE__);					\
-      prevLockLine = (__LINE__);					\
-   } while (0)
-
-#define DEBUG_RESET()							\
-   do {									\
-      prevLockFile = 0;							\
-      prevLockLine = 0;							\
-   } while (0)
-
-#define DEBUG_CHECK_LOCK()						\
-   do {									\
-      if ( prevLockFile ) {						\
-	 fprintf( stderr,						\
-		  "LOCK SET!\n\tPrevious %s:%d\n\tCurrent: %s:%d\n",	\
-		  prevLockFile, prevLockLine, __FILE__, __LINE__ );	\
-	 exit( 1 );							\
-      }									\
-   } while (0)
-
-#else
-
-#define DEBUG_LOCK()
-#define DEBUG_RESET()
-#define DEBUG_CHECK_LOCK()
-
-#endif
-
-/*
- * !!! We may want to separate locks from locks with validation.  This
- * could be used to improve performance for those things commands that
- * do not do any drawing !!!
- */
-
-
-/* Lock the hardware and validate our state.
- */
-#define LOCK_HARDWARE( rmesa )					\
-   do {								\
-      char __ret = 0;						\
-      DEBUG_CHECK_LOCK();					\
-      DRM_CAS( rmesa->dri.hwLock, rmesa->dri.hwContext,		\
-	       (DRM_LOCK_HELD | rmesa->dri.hwContext), __ret );	\
-      if ( __ret )						\
-	 r200GetLock( rmesa, 0 );				\
-      DEBUG_LOCK();						\
-   } while (0)
-
-#define UNLOCK_HARDWARE( rmesa )					\
-   do {									\
-      DRM_UNLOCK( rmesa->dri.fd,					\
-		  rmesa->dri.hwLock,					\
-		  rmesa->dri.hwContext );				\
-      DEBUG_RESET();							\
-   } while (0)
-
-#endif /* __R200_LOCK_H__ */
diff --git a/src/mesa/drivers/dri/r200/r200_maos.h b/src/mesa/drivers/dri/r200/r200_maos.h
index d3ed06d402..16a70475e1 100644
--- a/src/mesa/drivers/dri/r200/r200_maos.h
+++ b/src/mesa/drivers/dri/r200/r200_maos.h
@@ -38,6 +38,5 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_context.h"
 
 extern void r200EmitArrays( GLcontext *ctx, GLubyte *vimap_rev );
-extern void r200ReleaseArrays( GLcontext *ctx, GLuint newinputs );
 
 #endif
diff --git a/src/mesa/drivers/dri/r200/r200_maos_arrays.c b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
index 8512b9af47..383a0c4b0d 100644
--- a/src/mesa/drivers/dri/r200/r200_maos_arrays.c
+++ b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
@@ -50,110 +50,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_maos.h"
 #include "r200_tcl.h"
 
-
-#if 0
-/* Usage:
- *   - from r200_tcl_render
- *   - call r200EmitArrays to ensure uptodate arrays in dma
- *   - emit primitives (new type?) which reference the data
- *       -- need to use elts for lineloop, quads, quadstrip/flat
- *       -- other primitives are all well-formed (need tristrip-1,fake-poly)
- *
- */
-static void emit_ubyte_rgba3( GLcontext *ctx,
-		       struct r200_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
-{
-   int i;
-   r200_color_t *out = (r200_color_t *)(rvb->start + rvb->address);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d out %p\n",
-	      __FUNCTION__, count, stride, (void *)out);
-
-   for (i = 0; i < count; i++) {
-      out->red   = *data;
-      out->green = *(data+1);
-      out->blue  = *(data+2);
-      out->alpha = 0xFF;
-      out++;
-      data += stride;
-   }
-}
-
-static void emit_ubyte_rgba4( GLcontext *ctx,
-			      struct r200_dma_region *rvb,
-			      char *data,
-			      int stride,
-			      int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   if (stride == 4) {
-      for (i = 0; i < count; i++)
-	 ((int *)out)[i] = LE32_TO_CPU(((int *)data)[i]);
-   } else {
-      for (i = 0; i < count; i++) {
-	 *(int *)out++ = LE32_TO_CPU(*(int *)data);
-	 data += stride;
-      }
-   }
-}
-
-
-static void emit_ubyte_rgba( GLcontext *ctx,
-			     struct r200_dma_region *rvb,
-			     char *data,
-			     int size,
-			     int stride,
-			     int count )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s %d/%d\n", __FUNCTION__, count, size);
-
-   assert (!rvb->buf);
-
-   if (stride == 0) {
-      r200AllocDmaRegion( rmesa, rvb, 4, 4 );
-      count = 1;
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 0;
-      rvb->aos_size = 1;
-   }
-   else {
-      r200AllocDmaRegion( rmesa, rvb, 4 * count, 4 );	/* alignment? */
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 1;
-      rvb->aos_size = 1;
-   }
-
-   /* Emit the data
-    */
-   switch (size) {
-   case 3:
-      emit_ubyte_rgba3( ctx, rvb, data, stride, count );
-      break;
-   case 4:
-      emit_ubyte_rgba4( ctx, rvb, data, stride, count );
-      break;
-   default:
-      assert(0);
-      exit(1);
-      break;
-   }
-}
-#endif
-
-
 #if defined(USE_X86_ASM)
 #define COPY_DWORDS( dst, src, nr )					\
 do {									\
@@ -174,204 +70,34 @@ do {						\
 } while (0)
 #endif
 
-
-static void emit_vecfog( GLcontext *ctx,
-			 struct r200_dma_region *rvb,
-			 char *data,
-			 int stride,
-			 int count )
+static void r200_emit_vecfog(GLcontext *ctx, struct radeon_aos *aos,
+			     GLvoid *data, int stride, int count)
 {
-   int i;
-   GLfloat *out;
-
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   assert (!rvb->buf);
-
-   if (stride == 0) {
-      r200AllocDmaRegion( rmesa, rvb, 4, 4 );
-      count = 1;
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 0;
-      rvb->aos_size = 1;
-   }
-   else {
-      r200AllocDmaRegion( rmesa, rvb, count * 4, 4 );	/* alignment? */
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 1;
-      rvb->aos_size = 1;
-   }
-
-   /* Emit the data
-    */
-   out = (GLfloat *)(rvb->address + rvb->start);
-   for (i = 0; i < count; i++) {
-      out[0] = r200ComputeFogBlendFactor( ctx, *(GLfloat *)data );
-      out++;
-      data += stride;
-   }
-
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	GLfloat *out;	/* the removed emit_vecfog stored these factors through a GLfloat pointer */
+	int i;
+	int size = 1;	/* one dword (the fog blend factor) per vertex */
+	/* Emit fog blend factors: one element if stride is 0, else 'count' elements. */
+	if (stride == 0) {
+		radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * 4, 32);
+		count = 1;
+		aos->stride = 0;
+	} else {
+		radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * count * 4, 32);
+		aos->stride = size;
+	}
+
+	aos->components = size;
+	aos->count = count;
+	/* The loop writes 'count' floats, which is why the strided allocation scales by count. */
+	out = (GLfloat *)((char *)aos->bo->ptr + aos->offset);
+	for (i = 0; i < count; i++) {
+		out[0] = r200ComputeFogBlendFactor( ctx, *(GLfloat *)data );
+		out++;
+		data = (char *)data + stride;	/* byte step without void* arithmetic */
+	}
 }
 
-
-static void emit_vec4( GLcontext *ctx,
-		       struct r200_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   if (stride == 4)
-      COPY_DWORDS( out, data, count );
-   else
-      for (i = 0; i < count; i++) {
-	 out[0] = *(int *)data;
-	 out++;
-	 data += stride;
-      }
-}
-
-
-static void emit_vec8( GLcontext *ctx,
-		       struct r200_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   if (stride == 8)
-      COPY_DWORDS( out, data, count*2 );
-   else
-      for (i = 0; i < count; i++) {
-	 out[0] = *(int *)data;
-	 out[1] = *(int *)(data+4);
-	 out += 2;
-	 data += stride;
-      }
-}
-
-static void emit_vec12( GLcontext *ctx,
-		       struct r200_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-	      __FUNCTION__, count, stride, (void *)out, (void *)data);
-
-   if (stride == 12)
-      COPY_DWORDS( out, data, count*3 );
-   else
-      for (i = 0; i < count; i++) {
-	 out[0] = *(int *)data;
-	 out[1] = *(int *)(data+4);
-	 out[2] = *(int *)(data+8);
-	 out += 3;
-	 data += stride;
-      }
-}
-
-static void emit_vec16( GLcontext *ctx,
-			struct r200_dma_region *rvb,
-			char *data,
-			int stride,
-			int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   if (stride == 16)
-      COPY_DWORDS( out, data, count*4 );
-   else
-      for (i = 0; i < count; i++) {
-	 out[0] = *(int *)data;
-	 out[1] = *(int *)(data+4);
-	 out[2] = *(int *)(data+8);
-	 out[3] = *(int *)(data+12);
-	 out += 4;
-	 data += stride;
-      }
-}
-
-
-static void emit_vector( GLcontext *ctx,
-			 struct r200_dma_region *rvb,
-			 char *data,
-			 int size,
-			 int stride,
-			 int count )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d size %d stride %d\n",
-	      __FUNCTION__, count, size, stride);
-
-   assert (!rvb->buf);
-
-   if (stride == 0) {
-      r200AllocDmaRegion( rmesa, rvb, size * 4, 4 );
-      count = 1;
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 0;
-      rvb->aos_size = size;
-   }
-   else {
-      r200AllocDmaRegion( rmesa, rvb, size * count * 4, 4 );	/* alignment? */
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = size;
-      rvb->aos_size = size;
-   }
-
-   /* Emit the data
-    */
-   switch (size) {
-   case 1:
-      emit_vec4( ctx, rvb, data, stride, count );
-      break;
-   case 2:
-      emit_vec8( ctx, rvb, data, stride, count );
-      break;
-   case 3:
-      emit_vec12( ctx, rvb, data, stride, count );
-      break;
-   case 4:
-      emit_vec16( ctx, rvb, data, stride, count );
-      break;
-   default:
-      assert(0);
-      exit(1);
-      break;
-   }
-
-}
-
-
-
 /* Emit any changed arrays to new GART memory, re-emit a packet to
  * update the arrays.  
  */
@@ -379,12 +105,12 @@ void r200EmitArrays( GLcontext *ctx, GLubyte *vimap_rev )
 {
    r200ContextPtr rmesa = R200_CONTEXT( ctx );
    struct vertex_buffer *VB = &TNL_CONTEXT( ctx )->vb;
-   struct r200_dma_region **component = rmesa->tcl.aos_components;
    GLuint nr = 0;
    GLuint vfmt0 = 0, vfmt1 = 0;
    GLuint count = VB->Count;
    GLuint i, emitsize;
 
+   //   fprintf(stderr,"emit arrays\n");
    for ( i = 0; i < 15; i++ ) {
       GLubyte attrib = vimap_rev[i];
       if (attrib != 255) {
@@ -416,20 +142,20 @@ void r200EmitArrays( GLcontext *ctx, GLubyte *vimap_rev )
 	 case 3:
 	    /* special handling to fix up fog. Will get us into trouble with vbos...*/
 	    assert(attrib == VERT_ATTRIB_FOG);
-	    if (!rmesa->tcl.vertex_data[i].buf) {
+	    if (!rmesa->radeon.tcl.aos[i].bo) {
 	       if (ctx->VertexProgram._Enabled)
-		  emit_vector( ctx,
-			 &(rmesa->tcl.vertex_data[i]),
-			 (char *)VB->AttribPtr[attrib]->data,
-			 1,
-			 VB->AttribPtr[attrib]->stride,
-			 count);
+		  rcommon_emit_vector( ctx,
+				       &(rmesa->radeon.tcl.aos[nr]),
+				       (char *)VB->AttribPtr[attrib]->data,
+				       1,
+				       VB->AttribPtr[attrib]->stride,
+				       count);
 	       else
-		  emit_vecfog( ctx,
-			 &(rmesa->tcl.vertex_data[i]),
-			 (char *)VB->AttribPtr[attrib]->data,
-			 VB->AttribPtr[attrib]->stride,
-			 count);
+		 r200_emit_vecfog( ctx,
+				   &(rmesa->radeon.tcl.aos[nr]),
+				   (char *)VB->AttribPtr[attrib]->data,
+				   VB->AttribPtr[attrib]->stride,
+				   count);
 	    }
 	    vfmt0 |= R200_VTX_DISCRETE_FOG;
 	    goto after_emit;
@@ -473,17 +199,17 @@ void r200EmitArrays( GLcontext *ctx, GLubyte *vimap_rev )
 	 default:
 	    assert(0);
 	 }
-	 if (!rmesa->tcl.vertex_data[i].buf) {
-	    emit_vector( ctx,
-			 &(rmesa->tcl.vertex_data[i]),
-			 (char *)VB->AttribPtr[attrib]->data,
-			 emitsize,
-			 VB->AttribPtr[attrib]->stride,
-			 count );
+	 if (!rmesa->radeon.tcl.aos[nr].bo) {
+	   rcommon_emit_vector( ctx,
+				&(rmesa->radeon.tcl.aos[nr]),
+				(char *)VB->AttribPtr[attrib]->data,
+				emitsize,
+				VB->AttribPtr[attrib]->stride,
+				count );
 	 }
 after_emit:
 	 assert(nr < 12);
-	 component[nr++] = &rmesa->tcl.vertex_data[i];
+	 nr++;
       }
    }
 
@@ -494,19 +220,6 @@ after_emit:
       rmesa->hw.vtx.cmd[VTX_VTXFMT_1] = vfmt1;
    }
 
-   rmesa->tcl.nr_aos_components = nr;
+   rmesa->radeon.tcl.aos_count = nr;
 }
 
-
-void r200ReleaseArrays( GLcontext *ctx, GLuint newinputs )
-{
-   r200ContextPtr rmesa = R200_CONTEXT( ctx );
-
-   /* only do it for changed inputs ? */
-   int i;
-   for (i = 0; i < 15; i++) {
-      if (newinputs & (1 << i))
-	 r200ReleaseDmaRegion( rmesa,
-	    &rmesa->tcl.vertex_data[i], __FUNCTION__ );
-   }
-}
diff --git a/src/mesa/drivers/dri/r200/r200_pixel.c b/src/mesa/drivers/dri/r200/r200_pixel.c
index be68821dc1..95773871e0 100644
--- a/src/mesa/drivers/dri/r200/r200_pixel.c
+++ b/src/mesa/drivers/dri/r200/r200_pixel.c
@@ -51,29 +51,29 @@ check_color( const GLcontext *ctx, GLenum type, GLenum format,
 	     const void *pixels, GLint sz, GLint pitch )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   GLuint cpp = rmesa->r200Screen->cpp;
+   GLuint cpp = rmesa->radeon.radeonScreen->cpp;
 
-   if (R200_DEBUG & DEBUG_PIXEL)
+   if (R200_DEBUG & RADEON_PIXEL)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
    if (	(pitch & 63) ||
 	ctx->_ImageTransferState ||
 	packing->SwapBytes ||
 	packing->LsbFirst) {
-      if (R200_DEBUG & DEBUG_PIXEL)
+      if (R200_DEBUG & RADEON_PIXEL)
 	 fprintf(stderr, "%s: failed 1\n", __FUNCTION__);
       return GL_FALSE;
    }
 
-   if ( type == GL_UNSIGNED_INT_8_8_8_8_REV && 
-	cpp == 4 && 
+   if ( type == GL_UNSIGNED_INT_8_8_8_8_REV &&
+	cpp == 4 &&
 	format == GL_BGRA ) {
-      if (R200_DEBUG & DEBUG_PIXEL)
+      if (R200_DEBUG & RADEON_PIXEL)
 	 fprintf(stderr, "%s: passed 2\n", __FUNCTION__);
       return GL_TRUE;
    }
 
-   if (R200_DEBUG & DEBUG_PIXEL)
+   if (R200_DEBUG & RADEON_PIXEL)
       fprintf(stderr, "%s: failed\n", __FUNCTION__);
 
    return GL_FALSE;
@@ -83,11 +83,11 @@ static GLboolean
 check_color_per_fragment_ops( const GLcontext *ctx )
 {
    int result;
-   result = (!(     ctx->Color.AlphaEnabled || 
+   result = (!(     ctx->Color.AlphaEnabled ||
 		    ctx->Depth.Test ||
 		    ctx->Fog.Enabled ||
 		    ctx->Scissor.Enabled ||
-		    ctx->Stencil.Enabled ||
+		    ctx->Stencil._Enabled ||
 		    !ctx->Color.ColorMask[0] ||
 		    !ctx->Color.ColorMask[1] ||
 		    !ctx->Color.ColorMask[2] ||
@@ -96,12 +96,12 @@ check_color_per_fragment_ops( const GLcontext *ctx )
 		    ctx->Texture._EnabledUnits
            ) &&
 	   ctx->Current.RasterPosValid);
-   
+
    return result;
 }
 
 
-
+#if 0
 static GLboolean
 clip_pixelrect( const GLcontext *ctx,
 		const GLframebuffer *buffer,
@@ -137,11 +137,12 @@ clip_pixelrect( const GLcontext *ctx,
    if (*height <= 0)
       return GL_FALSE;
 
-   *size = ((*y + *height - 1) * rmesa->r200Screen->frontPitch +
-	    (*x + *width - 1) * rmesa->r200Screen->cpp);
+   *size = ((*y + *height - 1) * rmesa->radeon.radeonScreen->frontPitch +
+	    (*x + *width - 1) * rmesa->radeon.radeonScreen->cpp);
 
    return GL_TRUE;
 }
+#endif
 
 static GLboolean
 r200TryReadPixels( GLcontext *ctx,
@@ -150,29 +151,30 @@ r200TryReadPixels( GLcontext *ctx,
 		  const struct gl_pixelstore_attrib *pack,
 		  GLvoid *pixels )
 {
+   return GL_FALSE;
+#if 0
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    GLint pitch = pack->RowLength ? pack->RowLength : width;
    GLint blit_format;
-   GLuint cpp = rmesa->r200Screen->cpp;
+   GLuint cpp = rmesa->radeon.radeonScreen->cpp;
    GLint size = width * height * cpp;
 
-   if (R200_DEBUG & DEBUG_PIXEL)
+   if (R200_DEBUG & RADEON_PIXEL)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
    /* Only accelerate reading to GART buffers.
     */
-   if ( !r200IsGartMemory(rmesa, pixels, 
-			 pitch * height * rmesa->r200Screen->cpp ) ) {
-      if (R200_DEBUG & DEBUG_PIXEL)
+   if ( !r200IsGartMemory(rmesa, pixels,
+			 pitch * height * rmesa->radeon.radeonScreen->cpp ) ) {
+      if (R200_DEBUG & RADEON_PIXEL)
 	 fprintf(stderr, "%s: dest not GART\n", __FUNCTION__);
-      return GL_FALSE;
    }
 
    /* Need GL_PACK_INVERT_MESA to cope with upsidedown results from
     * blitter:
     */
    if (!pack->Invert) {
-      if (R200_DEBUG & DEBUG_PIXEL)
+      if (R200_DEBUG & RADEON_PIXEL)
 	 fprintf(stderr, "%s: MESA_PACK_INVERT not set\n", __FUNCTION__);
       return GL_FALSE;
    }
@@ -180,7 +182,7 @@ r200TryReadPixels( GLcontext *ctx,
    if (!check_color(ctx, type, format, pack, pixels, size, pitch))
       return GL_FALSE;
 
-   switch ( rmesa->r200Screen->cpp ) {
+   switch ( rmesa->radeon.radeonScreen->cpp ) {
    case 4:
       blit_format = R200_CP_COLOR_FORMAT_ARGB8888;
       break;
@@ -197,40 +199,40 @@ r200TryReadPixels( GLcontext *ctx,
     * a full command buffer expects to be called unlocked.  As a
     * workaround, immediately flush the buffer on aquiring the lock.
     */
-   LOCK_HARDWARE( rmesa );
+   LOCK_HARDWARE( &rmesa->radeon );
 
    if (rmesa->store.cmd_used)
-      r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+      rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
 
    if (!clip_pixelrect(ctx, ctx->ReadBuffer, &x, &y, &width, &height,
 		       &size)) {
-      UNLOCK_HARDWARE( rmesa );
-      if (R200_DEBUG & DEBUG_PIXEL)
+      UNLOCK_HARDWARE( &rmesa->radeon );
+      if (R200_DEBUG & RADEON_PIXEL)
 	 fprintf(stderr, "%s totally clipped -- nothing to do\n",
 		 __FUNCTION__);
       return GL_TRUE;
    }
 
    {
-      __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+      __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
       driRenderbuffer *drb = (driRenderbuffer *) ctx->ReadBuffer->_ColorReadBuffer;
       int nbox = dPriv->numClipRects;
       int src_offset = drb->offset
-		     + rmesa->r200Screen->fbLocation;
+		     + rmesa->radeon.radeonScreen->fbLocation;
       int src_pitch = drb->pitch * drb->cpp;
       int dst_offset = r200GartOffsetFromVirtual( rmesa, pixels );
-      int dst_pitch = pitch * rmesa->r200Screen->cpp;
+      int dst_pitch = pitch * rmesa->radeon.radeonScreen->cpp;
       drm_clip_rect_t *box = dPriv->pClipRects;
       int i;
 
-      r200EmitWait( rmesa, RADEON_WAIT_3D ); 
+      r200EmitWait( rmesa, RADEON_WAIT_3D );
 
       y = dPriv->h - y - height;
       x += dPriv->x;
       y += dPriv->y;
 
 
-      if (R200_DEBUG & DEBUG_PIXEL)
+      if (R200_DEBUG & RADEON_PIXEL)
 	 fprintf(stderr, "readpixel blit src_pitch %d dst_pitch %d\n",
 		 src_pitch, dst_pitch);
 
@@ -240,7 +242,7 @@ r200TryReadPixels( GLcontext *ctx,
 	 GLint by = box[i].y1;
 	 GLint bw = box[i].x2 - bx;
 	 GLint bh = box[i].y2 - by;
-	 
+
 	 if (bx < x) bw -= x - bx, bx = x;
 	 if (by < y) bh -= y - by, by = y;
 	 if (bx + bw > x + width) bw = x + width - bx;
@@ -257,12 +259,12 @@ r200TryReadPixels( GLcontext *ctx,
 		       bw, bh );
       }
 
-      r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+      rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
    }
-   UNLOCK_HARDWARE( rmesa );
-
-   r200Finish( ctx ); /* required by GL */
+   UNLOCK_HARDWARE( &rmesa->radeon );
 
+   radeonFinish( ctx ); /* required by GL */
+#endif
    return GL_TRUE;
 }
 
@@ -273,12 +275,12 @@ r200ReadPixels( GLcontext *ctx,
 		 const struct gl_pixelstore_attrib *pack,
 		 GLvoid *pixels )
 {
-   if (R200_DEBUG & DEBUG_PIXEL)
+   if (R200_DEBUG & RADEON_PIXEL)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
-   if (!r200TryReadPixels( ctx, x, y, width, height, format, type, pack, 
+   if (!r200TryReadPixels( ctx, x, y, width, height, format, type, pack,
 			   pixels))
-      _swrast_ReadPixels( ctx, x, y, width, height, format, type, pack, 
+      _swrast_ReadPixels( ctx, x, y, width, height, format, type, pack,
 			  pixels);
 }
 
@@ -291,8 +293,12 @@ static void do_draw_pix( GLcontext *ctx,
 			 const void *pixels,
 			 GLuint planemask)
 {
+   if (R200_DEBUG & RADEON_PIXEL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+#if 0
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
    drm_clip_rect_t *box = dPriv->pClipRects;
    struct gl_renderbuffer *rb = ctx->ReadBuffer->_ColorDrawBuffers[0];
    driRenderbuffer *drb = (driRenderbuffer *) rb;
@@ -301,12 +307,9 @@ static void do_draw_pix( GLcontext *ctx,
    int blit_format;
    int size;
    int src_offset = r200GartOffsetFromVirtual( rmesa, pixels );
-   int src_pitch = pitch * rmesa->r200Screen->cpp;
+   int src_pitch = pitch * rmesa->radeon.radeonScreen->cpp;
 
-   if (R200_DEBUG & DEBUG_PIXEL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   switch ( rmesa->r200Screen->cpp ) {
+   switch ( rmesa->radeon.radeonScreen->cpp ) {
    case 2:
       blit_format = R200_CP_COLOR_FORMAT_RGB565;
       break;
@@ -318,17 +321,17 @@ static void do_draw_pix( GLcontext *ctx,
    }
 
 
-   LOCK_HARDWARE( rmesa );
+   LOCK_HARDWARE( &rmesa->radeon );
 
    if (rmesa->store.cmd_used)
-      r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+      rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
 
    y -= height;			/* cope with pixel zoom */
-   
+
    if (!clip_pixelrect(ctx, ctx->DrawBuffer,
 		       &x, &y, &width, &height,
 		       &size)) {
-      UNLOCK_HARDWARE( rmesa );
+      UNLOCK_HARDWARE( &rmesa->radeon );
       return;
    }
 
@@ -357,15 +360,16 @@ static void do_draw_pix( GLcontext *ctx,
 		    blit_format,
 		    src_pitch, src_offset,
 		    drb->pitch * drb->cpp,
-		    drb->offset + rmesa->r200Screen->fbLocation,
+		    drb->offset + rmesa->radeon.radeonScreen->fbLocation,
 		    bx - x, by - y,
 		    bx, by,
 		    bw, bh );
    }
 
-   r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
-   r200WaitForIdleLocked( rmesa ); /* required by GL */
-   UNLOCK_HARDWARE( rmesa );
+   rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
+   radeonWaitForIdleLocked( &rmesa->radeon ); /* required by GL */
+   UNLOCK_HARDWARE( &rmesa->radeon );
+#endif
 }
 
 
@@ -381,10 +385,10 @@ r200TryDrawPixels( GLcontext *ctx,
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    GLint pitch = unpack->RowLength ? unpack->RowLength : width;
    GLuint planemask;
-   GLuint cpp = rmesa->r200Screen->cpp;
+   GLuint cpp = rmesa->radeon.radeonScreen->cpp;
    GLint size = height * pitch * cpp;
 
-   if (R200_DEBUG & DEBUG_PIXEL)
+   if (R200_DEBUG & RADEON_PIXEL)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
    /* check that we're drawing to exactly one color buffer */
@@ -395,7 +399,7 @@ r200TryDrawPixels( GLcontext *ctx,
    case GL_RGB:
    case GL_RGBA:
    case GL_BGRA:
-      planemask = r200PackColor(cpp,
+      planemask = radeonPackColor(cpp,
 				ctx->Color.ColorMask[RCOMP],
 				ctx->Color.ColorMask[GCOMP],
 				ctx->Color.ColorMask[BCOMP],
@@ -407,10 +411,10 @@ r200TryDrawPixels( GLcontext *ctx,
       if (planemask != ~0)
 	 return GL_FALSE;	/* fix me -- should be possible */
 
-      /* Can't do conversions on GART reads/draws. 
+      /* Can't do conversions on GART reads/draws.
        */
       if ( !r200IsGartMemory( rmesa, pixels, size ) ) {
-	 if (R200_DEBUG & DEBUG_PIXEL)
+	 if (R200_DEBUG & RADEON_PIXEL)
 	    fprintf(stderr, "%s: not GART memory\n", __FUNCTION__);
 	 return GL_FALSE;
       }
@@ -431,7 +435,7 @@ r200TryDrawPixels( GLcontext *ctx,
       return GL_FALSE;
    }
 
-   if ( r200IsGartMemory(rmesa, pixels, size) )
+   if (0)// r200IsGartMemory(rmesa, pixels, size) )
    {
       do_draw_pix( ctx, x, y, width, height, pitch, pixels, planemask );
       return GL_TRUE;
@@ -453,7 +457,7 @@ r200DrawPixels( GLcontext *ctx,
 		 const struct gl_pixelstore_attrib *unpack,
 		 const GLvoid *pixels )
 {
-   if (R200_DEBUG & DEBUG_PIXEL)
+   if (R200_DEBUG & RADEON_PIXEL)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
    if (!r200TryDrawPixels( ctx, x, y, width, height, format, type,
@@ -471,7 +475,7 @@ r200Bitmap( GLcontext *ctx, GLint px, GLint py,
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
-   if (rmesa->Fallback)
+   if (rmesa->radeon.Fallback)
       _swrast_Bitmap( ctx, px, py, width, height, unpack, bitmap );
    else
       r200PointsBitmap( ctx, px, py, width, height, unpack, bitmap );
@@ -482,9 +486,9 @@ r200Bitmap( GLcontext *ctx, GLint px, GLint py,
 void r200InitPixelFuncs( GLcontext *ctx )
 {
    if (!getenv("R200_NO_BLITS")) {
-      ctx->Driver.ReadPixels = r200ReadPixels;  
-      ctx->Driver.DrawPixels = r200DrawPixels; 
-      if (getenv("R200_HW_BITMAP")) 
+      ctx->Driver.ReadPixels = r200ReadPixels;
+      ctx->Driver.DrawPixels = r200DrawPixels;
+      if (getenv("R200_HW_BITMAP"))
 	 ctx->Driver.Bitmap = r200Bitmap;
    }
 }
diff --git a/src/mesa/drivers/dri/r200/r200_reg.h b/src/mesa/drivers/dri/r200/r200_reg.h
index 5ce287f7a5..526a624b69 100644
--- a/src/mesa/drivers/dri/r200/r200_reg.h
+++ b/src/mesa/drivers/dri/r200/r200_reg.h
@@ -463,8 +463,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define     R200_VSC_UPDATE_USER_COLOR_1_ENABLE    0x00020000
 /* gap */
 #define R200_SE_TCL_VECTOR_INDX_REG                0x2200
+#       define RADEON_VEC_INDX_OCTWORD_STRIDE_SHIFT  16
+#       define RADEON_VEC_INDX_DWORD_COUNT_SHIFT     28
 #define R200_SE_TCL_VECTOR_DATA_REG                0x2204
 #define R200_SE_TCL_SCALAR_INDX_REG                0x2208
+#       define RADEON_SCAL_INDX_DWORD_STRIDE_SHIFT  16
 #define R200_SE_TCL_SCALAR_DATA_REG                0x220c
 /* gap */
 #define R200_SE_TCL_MATRIX_SEL_0                   0x2230
@@ -949,6 +952,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define     R200_LOD_BIAS_MASK                        (0xfff80000)
 #define     R200_LOD_BIAS_SHIFT                       19
 #define R200_PP_TXSIZE_0                  0x2c0c /* NPOT only */
+#define R200_PP_TX_WIDTHMASK_SHIFT 0
+#define R200_PP_TX_HEIGHTMASK_SHIFT 16
+
 #define R200_PP_TXPITCH_0                 0x2c10 /* NPOT only */
 #define R200_PP_BORDER_COLOR_0            0x2c14
 #define R200_PP_CUBIC_FACES_0             0x2c18
diff --git a/src/mesa/drivers/dri/r200/r200_sanity.c b/src/mesa/drivers/dri/r200/r200_sanity.c
index 36530c224e..1241a926ba 100644
--- a/src/mesa/drivers/dri/r200/r200_sanity.c
+++ b/src/mesa/drivers/dri/r200/r200_sanity.c
@@ -48,11 +48,11 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define MORE_VERBOSE 1
 
 #if MORE_VERBOSE
-#define VERBOSE (R200_DEBUG & DEBUG_VERBOSE)
+#define VERBOSE (R200_DEBUG & RADEON_VERBOSE)
 #define NORMAL  (1)
 #else
 #define VERBOSE 0
-#define NORMAL  (R200_DEBUG & DEBUG_VERBOSE)
+#define NORMAL  (R200_DEBUG & RADEON_VERBOSE)
 #endif
 
 
diff --git a/src/mesa/drivers/dri/r200/r200_span.c b/src/mesa/drivers/dri/r200/r200_span.c
deleted file mode 100644
index 9783678028..0000000000
--- a/src/mesa/drivers/dri/r200/r200_span.c
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell <keith@tungstengraphics.com>
- */
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/colormac.h"
-#include "swrast/swrast.h"
-
-#include "r200_context.h"
-#include "r200_ioctl.h"
-#include "r200_state.h"
-#include "r200_span.h"
-#include "r200_tex.h"
-
-#define DBG 0
-
-/*
- * Note that all information needed to access pixels in a renderbuffer
- * should be obtained through the gl_renderbuffer parameter, not per-context
- * information.
- */
-#define LOCAL_VARS						\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;		\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;		\
-   const GLuint bottom = dPriv->h - 1;				\
-   GLubyte *buf = (GLubyte *) drb->flippedData			\
-      + (dPriv->y * drb->flippedPitch + dPriv->x) * drb->cpp;	\
-   GLuint p;							\
-   (void) p;
-
-#define LOCAL_DEPTH_VARS				\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;	\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
-   const GLuint bottom = dPriv->h - 1;			\
-   GLuint xo = dPriv->x;				\
-   GLuint yo = dPriv->y;				\
-   GLubyte *buf = (GLubyte *) drb->Base.Data;
-
-#define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
-
-#define Y_FLIP(Y) (bottom - (Y))
-
-#define HW_LOCK() 
-
-#define HW_UNLOCK()							
-
-
-
-/* ================================================================
- * Color buffer
- */
-
-/* 16 bit, RGB565 color spanline and pixel functions
- */
-#define SPANTMP_PIXEL_FMT GL_RGB
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
-
-#define TAG(x)    r200##x##_RGB565
-#define TAG2(x,y) r200##x##_RGB565##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 2)
-#include "spantmp2.h"
-
-/* 32 bit, ARGB8888 color spanline and pixel functions
- */
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    r200##x##_ARGB8888
-#define TAG2(x,y) r200##x##_ARGB8888##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 4)
-#include "spantmp2.h"
-
-
-/* ================================================================
- * Depth buffer
- */
-
-/* The Radeon family has depth tiling on all the time, so we have to convert
- * the x,y coordinates into the memory bus address (mba) in the same
- * manner as the engine.  In each case, the linear block address (ba)
- * is calculated, and then wired with x and y to produce the final
- * memory address.
- * The chip will do address translation on its own if the surface registers
- * are set up correctly. It is not quite enough to get it working with hyperz too...
- */
-
-/* extract bit 'b' of x, result is zero or one */
-#define BIT(x,b) ((x & (1<<b))>>b)
-
-static GLuint
-r200_mba_z32( driRenderbuffer *drb, GLint x, GLint y )
-{
-   GLuint pitch = drb->pitch;
-   if (drb->depthHasSurface) {
-      return 4 * (x + y * pitch);
-   }
-   else {
-      GLuint b = ((y & 0x7FF) >> 4) * ((pitch & 0xFFF) >> 5) + ((x & 0x7FF) >> 5);
-      GLuint a = 
-         (BIT(x,0) << 2) |
-         (BIT(y,0) << 3) |
-         (BIT(x,1) << 4) |
-         (BIT(y,1) << 5) |
-         (BIT(x,3) << 6) |
-         (BIT(x,4) << 7) |
-         (BIT(x,2) << 8) |
-         (BIT(y,2) << 9) |
-         (BIT(y,3) << 10) |
-         (((pitch & 0x20) ? (b & 0x01) : ((b & 0x01) ^ (BIT(y,4)))) << 11) |
-         ((b >> 1) << 12);
-      return a;
-   }
-}
-
-static GLuint
-r200_mba_z16( driRenderbuffer *drb, GLint x, GLint y )
-{
-   GLuint pitch = drb->pitch;
-   if (drb->depthHasSurface) {
-      return 2 * (x + y * pitch);
-   }
-   else {
-      GLuint b = ((y & 0x7FF) >> 4) * ((pitch & 0xFFF) >> 6) + ((x & 0x7FF) >> 6);
-      GLuint a = 
-         (BIT(x,0) << 1) |
-         (BIT(y,0) << 2) |
-         (BIT(x,1) << 3) |
-         (BIT(y,1) << 4) |
-         (BIT(x,2) << 5) |
-         (BIT(x,4) << 6) |
-         (BIT(x,5) << 7) |
-         (BIT(x,3) << 8) |
-         (BIT(y,2) << 9) |
-         (BIT(y,3) << 10) |
-         (((pitch & 0x40) ? (b & 0x01) : ((b & 0x01) ^ (BIT(y,4)))) << 11) |
-         ((b >> 1) << 12);
-      return a;
-   }
-}
-
-
-/* 16-bit depth buffer functions
- */
-#define VALUE_TYPE GLushort
-
-#define WRITE_DEPTH( _x, _y, d )					\
-   *(GLushort *)(buf + r200_mba_z16( drb, _x + xo, _y + yo )) = d;
-
-#define READ_DEPTH( d, _x, _y )						\
-   d = *(GLushort *)(buf + r200_mba_z16( drb, _x + xo, _y + yo ));
-
-#define TAG(x) r200##x##_z16
-#include "depthtmp.h"
-
-
-/* 24 bit depth, 8 bit stencil depthbuffer functions
- */
-#define VALUE_TYPE GLuint
-
-#define WRITE_DEPTH( _x, _y, d )					\
-do {									\
-   GLuint offset = r200_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0xff000000;							\
-   tmp |= ((d) & 0x00ffffff);						\
-   *(GLuint *)(buf + offset) = tmp;					\
-} while (0)
-
-#define READ_DEPTH( d, _x, _y )						\
-   d = *(GLuint *)(buf + r200_mba_z32( drb, _x + xo,			\
-					 _y + yo )) & 0x00ffffff;
-
-#define TAG(x) r200##x##_z24_s8
-#include "depthtmp.h"
-
-
-/* ================================================================
- * Stencil buffer
- */
-
-/* 24 bit depth, 8 bit stencil depthbuffer functions
- */
-#define WRITE_STENCIL( _x, _y, d )					\
-do {									\
-   GLuint offset = r200_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0x00ffffff;							\
-   tmp |= (((d) & 0xff) << 24);						\
-   *(GLuint *)(buf + offset) = tmp;					\
-} while (0)
-
-#define READ_STENCIL( d, _x, _y )					\
-do {									\
-   GLuint offset = r200_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0xff000000;							\
-   d = tmp >> 24;							\
-} while (0)
-
-#define TAG(x) r200##x##_z24_s8
-#include "stenciltmp.h"
-
-
-/* Move locking out to get reasonable span performance (10x better
- * than doing this in HW_LOCK above).  WaitForIdle() is the main
- * culprit.
- */
-
-static void r200SpanRenderStart( GLcontext *ctx )
-{
-   r200ContextPtr rmesa = R200_CONTEXT( ctx );
-
-   R200_FIREVERTICES( rmesa );
-   LOCK_HARDWARE( rmesa );
-   r200WaitForIdleLocked( rmesa );
-
-   /* Read & rewrite the first pixel in the frame buffer.  This should
-    * be a noop, right?  In fact without this conform fails as reading
-    * from the framebuffer sometimes produces old results -- the
-    * on-card read cache gets mixed up and doesn't notice that the
-    * framebuffer has been updated.
-    *
-    * In the worst case this is buggy too as p might get the wrong
-    * value first time, so really need a hidden pixel somewhere for this.
-    */
-   {
-      int p;
-      driRenderbuffer *drb =
-	 (driRenderbuffer *) ctx->WinSysDrawBuffer->_ColorDrawBuffers[0];
-      volatile int *buf =
-	 (volatile int *)(rmesa->dri.screen->pFB + drb->offset);
-      p = *buf;
-      *buf = p;
-   }
-}
-
-static void r200SpanRenderFinish( GLcontext *ctx )
-{
-   r200ContextPtr rmesa = R200_CONTEXT( ctx );
-   _swrast_flush( ctx );
-   UNLOCK_HARDWARE( rmesa );
-}
-
-void r200InitSpanFuncs( GLcontext *ctx )
-{
-   struct swrast_device_driver *swdd = _swrast_GetDeviceDriverReference(ctx);
-   swdd->SpanRenderStart          = r200SpanRenderStart;
-   swdd->SpanRenderFinish         = r200SpanRenderFinish; 
-}
-
-
-
-/**
- * Plug in the Get/Put routines for the given driRenderbuffer.
- */
-void
-radeonSetSpanFunctions(driRenderbuffer *drb, const GLvisual *vis)
-{
-   if (drb->Base.InternalFormat == GL_RGBA) {
-      if (vis->redBits == 5 && vis->greenBits == 6 && vis->blueBits == 5) {
-         r200InitPointers_RGB565(&drb->Base);
-      }
-      else {
-         r200InitPointers_ARGB8888(&drb->Base);
-      }
-   }
-   else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT16) {
-      r200InitDepthPointers_z16(&drb->Base);
-   }
-   else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT24) {
-      r200InitDepthPointers_z24_s8(&drb->Base);
-   }
-   else if (drb->Base.InternalFormat == GL_STENCIL_INDEX8_EXT) {
-      r200InitStencilPointers_z24_s8(&drb->Base);
-   }
-}
diff --git a/src/mesa/drivers/dri/r200/r200_state.c b/src/mesa/drivers/dri/r200/r200_state.c
index 0eaaaf69ac..76852e315c 100644
--- a/src/mesa/drivers/dri/r200/r200_state.c
+++ b/src/mesa/drivers/dri/r200/r200_state.c
@@ -47,6 +47,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/t_pipeline.h"
 #include "swrast_setup/swrast_setup.h"
 
+#include "radeon_common.h"
+#include "radeon_mipmap_tree.h"
 #include "r200_context.h"
 #include "r200_ioctl.h"
 #include "r200_state.h"
@@ -77,7 +79,7 @@ static void r200AlphaFunc( GLcontext *ctx, GLenum func, GLfloat ref )
 
    switch ( func ) {
    case GL_NEVER:
-      pp_misc |= R200_ALPHA_TEST_FAIL; 
+      pp_misc |= R200_ALPHA_TEST_FAIL;
       break;
    case GL_LESS:
       pp_misc |= R200_ALPHA_TEST_LESS;
@@ -114,8 +116,8 @@ static void r200BlendColor( GLcontext *ctx, const GLfloat cf[4] )
    CLAMPED_FLOAT_TO_UBYTE(color[1], cf[1]);
    CLAMPED_FLOAT_TO_UBYTE(color[2], cf[2]);
    CLAMPED_FLOAT_TO_UBYTE(color[3], cf[3]);
-   if (rmesa->r200Screen->drmSupportsBlendColor)
-      rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCOLOR] = r200PackColor( 4, color[0], color[1], color[2], color[3] );
+   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor)
+      rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCOLOR] = radeonPackColor( 4, color[0], color[1], color[2], color[3] );
 }
 
 /**
@@ -213,7 +215,7 @@ static void r200_set_blend_state( GLcontext * ctx )
 
    R200_STATECHANGE( rmesa, ctx );
 
-   if (rmesa->r200Screen->drmSupportsBlendColor) {
+   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor) {
       if (ctx->Color.ColorLogicOpEnabled) {
          rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] =  cntl | R200_ROP_ENABLE;
          rmesa->hw.ctx.cmd[CTX_RB3D_ABLENDCNTL] = eqn | func;
@@ -278,7 +280,7 @@ static void r200_set_blend_state( GLcontext * ctx )
       return;
    }
 
-   if (!rmesa->r200Screen->drmSupportsBlendColor) {
+   if (!rmesa->radeon.radeonScreen->drmSupportsBlendColor) {
       rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] = eqn | func;
       return;
    }
@@ -383,10 +385,10 @@ static void r200ClearDepth( GLcontext *ctx, GLclampd d )
 
    switch ( format ) {
    case R200_DEPTH_FORMAT_16BIT_INT_Z:
-      rmesa->state.depth.clear = d * 0x0000ffff;
+      rmesa->radeon.state.depth.clear = d * 0x0000ffff;
       break;
    case R200_DEPTH_FORMAT_24BIT_INT_Z:
-      rmesa->state.depth.clear = d * 0x00ffffff;
+      rmesa->radeon.state.depth.clear = d * 0x00ffffff;
       break;
    }
 }
@@ -477,10 +479,10 @@ static void r200Fogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
 	 }
       }
       break;
-   case GL_FOG_COLOR: 
+   case GL_FOG_COLOR:
       R200_STATECHANGE( rmesa, ctx );
       UNCLAMPED_FLOAT_TO_RGB_CHAN( col, ctx->Fog.Color );
-      i = r200PackColor( 4, col[0], col[1], col[2], 0 );
+      i = radeonPackColor( 4, col[0], col[1], col[2], 0 );
       rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] &= ~R200_FOG_COLOR_MASK;
       rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] |= i;
       break;
@@ -505,7 +507,7 @@ static void r200Fogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
 
       if (out_0 != rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0]) {
 	 R200_STATECHANGE( rmesa, vtx );
-	 rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] = out_0;	 
+	 rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] = out_0;
       }
 
       break;
@@ -521,102 +523,6 @@ static void r200Fogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
    }
 }
 
-
-/* =============================================================
- * Scissoring
- */
-
-
-static GLboolean intersect_rect( drm_clip_rect_t *out,
-				 drm_clip_rect_t *a,
-				 drm_clip_rect_t *b )
-{
-   *out = *a;
-   if ( b->x1 > out->x1 ) out->x1 = b->x1;
-   if ( b->y1 > out->y1 ) out->y1 = b->y1;
-   if ( b->x2 < out->x2 ) out->x2 = b->x2;
-   if ( b->y2 < out->y2 ) out->y2 = b->y2;
-   if ( out->x1 >= out->x2 ) return GL_FALSE;
-   if ( out->y1 >= out->y2 ) return GL_FALSE;
-   return GL_TRUE;
-}
-
-
-void r200RecalcScissorRects( r200ContextPtr rmesa )
-{
-   drm_clip_rect_t *out;
-   int i;
-
-   /* Grow cliprect store?
-    */
-   if (rmesa->state.scissor.numAllocedClipRects < rmesa->numClipRects) {
-      while (rmesa->state.scissor.numAllocedClipRects < rmesa->numClipRects) {
-	 rmesa->state.scissor.numAllocedClipRects += 1;	/* zero case */
-	 rmesa->state.scissor.numAllocedClipRects *= 2;
-      }
-
-      if (rmesa->state.scissor.pClipRects)
-	 FREE(rmesa->state.scissor.pClipRects);
-
-      rmesa->state.scissor.pClipRects = 
-	 MALLOC( rmesa->state.scissor.numAllocedClipRects * 
-		 sizeof(drm_clip_rect_t) );
-
-      if ( rmesa->state.scissor.pClipRects == NULL ) {
-	 rmesa->state.scissor.numAllocedClipRects = 0;
-	 return;
-      }
-   }
-   
-   out = rmesa->state.scissor.pClipRects;
-   rmesa->state.scissor.numClipRects = 0;
-
-   for ( i = 0 ; i < rmesa->numClipRects ;  i++ ) {
-      if ( intersect_rect( out, 
-			   &rmesa->pClipRects[i], 
-			   &rmesa->state.scissor.rect ) ) {
-	 rmesa->state.scissor.numClipRects++;
-	 out++;
-      }
-   }
-}
-
-
-static void r200UpdateScissor( GLcontext *ctx )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if ( rmesa->dri.drawable ) {
-      __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
-
-      int x = ctx->Scissor.X;
-      int y = dPriv->h - ctx->Scissor.Y - ctx->Scissor.Height;
-      int w = ctx->Scissor.X + ctx->Scissor.Width - 1;
-      int h = dPriv->h - ctx->Scissor.Y - 1;
-
-      rmesa->state.scissor.rect.x1 = x + dPriv->x;
-      rmesa->state.scissor.rect.y1 = y + dPriv->y;
-      rmesa->state.scissor.rect.x2 = w + dPriv->x + 1;
-      rmesa->state.scissor.rect.y2 = h + dPriv->y + 1;
-
-      r200RecalcScissorRects( rmesa );
-   }
-}
-
-
-static void r200Scissor( GLcontext *ctx,
-			   GLint x, GLint y, GLsizei w, GLsizei h )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if ( ctx->Scissor.Enabled ) {
-      R200_FIREVERTICES( rmesa );	/* don't pipeline cliprect changes */
-      r200UpdateScissor( ctx );
-   }
-
-}
-
-
 /* =============================================================
  * Culling
  */
@@ -668,6 +574,10 @@ static void r200FrontFace( GLcontext *ctx, GLenum mode )
    R200_STATECHANGE( rmesa, tcl );
    rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~R200_CULL_FRONT_IS_CCW;
 
+   /* Winding is inverted when rendering to FBO */
+   if (ctx->DrawBuffer && ctx->DrawBuffer->Name)
+      mode = (mode == GL_CW) ? GL_CCW : GL_CW;
+
    switch ( mode ) {
    case GL_CW:
       rmesa->hw.set.cmd[SET_SE_CNTL] |= R200_FFACE_CULL_CW;
@@ -790,7 +700,7 @@ static void r200LineStipple( GLcontext *ctx, GLint factor, GLushort pattern )
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
    R200_STATECHANGE( rmesa, lin );
-   rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] = 
+   rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] =
       ((((GLuint)factor & 0xff) << 16) | ((GLuint)pattern));
 }
 
@@ -803,21 +713,27 @@ static void r200ColorMask( GLcontext *ctx,
 			   GLboolean b, GLboolean a )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   GLuint mask = r200PackColor( rmesa->r200Screen->cpp,
-				ctx->Color.ColorMask[RCOMP],
-				ctx->Color.ColorMask[GCOMP],
-				ctx->Color.ColorMask[BCOMP],
-				ctx->Color.ColorMask[ACOMP] );
-
+   GLuint mask;
+   struct radeon_renderbuffer *rrb;
    GLuint flag = rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] & ~R200_PLANE_MASK_ENABLE;
 
+   rrb = radeon_get_colorbuffer(&rmesa->radeon);
+   if (!rrb)
+     return;
+   mask = radeonPackColor( rrb->cpp,
+			   ctx->Color.ColorMask[RCOMP],
+			   ctx->Color.ColorMask[GCOMP],
+			   ctx->Color.ColorMask[BCOMP],
+			   ctx->Color.ColorMask[ACOMP] );
+
+
    if (!(r && g && b && a))
       flag |= R200_PLANE_MASK_ENABLE;
 
-   if ( rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] != flag ) { 
-      R200_STATECHANGE( rmesa, ctx ); 
-      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = flag; 
-   } 
+   if ( rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] != flag ) {
+      R200_STATECHANGE( rmesa, ctx );
+      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = flag;
+   }
 
    if ( rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK] != mask ) {
       R200_STATECHANGE( rmesa, msk );
@@ -834,7 +750,8 @@ static void r200PolygonOffset( GLcontext *ctx,
 			       GLfloat factor, GLfloat units )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   float_ui32_type constant =  { units * rmesa->state.depth.scale };
+   const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+   float_ui32_type constant =  { units * depthScale };
    float_ui32_type factoru = { factor };
 
 /*    factor *= 2; */
@@ -847,41 +764,16 @@ static void r200PolygonOffset( GLcontext *ctx,
    rmesa->hw.zbs.cmd[ZBS_SE_ZBIAS_CONSTANT] = constant.ui32;
 }
 
-static void r200PolygonStipple( GLcontext *ctx, const GLubyte *mask )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   GLuint i;
-   drm_radeon_stipple_t stipple;
-
-   /* Must flip pattern upside down.
-    */
-   for ( i = 0 ; i < 32 ; i++ ) {
-      rmesa->state.stipple.mask[31 - i] = ((GLuint *) mask)[i];
-   }
-
-   /* TODO: push this into cmd mechanism
-    */
-   R200_FIREVERTICES( rmesa );
-   LOCK_HARDWARE( rmesa );
-
-   /* FIXME: Use window x,y offsets into stipple RAM.
-    */
-   stipple.mask = rmesa->state.stipple.mask;
-   drmCommandWrite( rmesa->dri.fd, DRM_RADEON_STIPPLE, 
-                    &stipple, sizeof(stipple) );
-   UNLOCK_HARDWARE( rmesa );
-}
-
 static void r200PolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    GLboolean flag = (ctx->_TriangleCaps & DD_TRI_UNFILLED) != 0;
 
    /* Can't generally do unfilled via tcl, but some good special
-    * cases work. 
+    * cases work.
     */
    TCL_FALLBACK( ctx, R200_TCL_FALLBACK_UNFILLED, flag);
-   if (rmesa->TclFallback) {
+   if (rmesa->radeon.TclFallback) {
       r200ChooseRenderState( ctx );
       r200ChooseVertexState( ctx );
    }
@@ -920,34 +812,34 @@ static void r200UpdateSpecular( GLcontext *ctx )
 
    if (ctx->Light.Enabled &&
        ctx->Light.Model.ColorControl == GL_SEPARATE_SPECULAR_COLOR) {
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= 
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |=
 	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT) |
-	  (R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));	
+	  (R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));
       rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] |= R200_OUTPUT_COLOR_0;
       rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] |= R200_OUTPUT_COLOR_1;
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] |= R200_LIGHTING_ENABLE;
       p |=  R200_SPECULAR_ENABLE;
-      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &= 
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &=
 	 ~R200_DIFFUSE_SPECULAR_COMBINE;
    }
    else if (ctx->Light.Enabled) {
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= 
-	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT));	
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |=
+	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT));
       rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] |= R200_OUTPUT_COLOR_0;
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] |= R200_LIGHTING_ENABLE;
    } else if (ctx->Fog.ColorSumEnabled ) {
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= 
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |=
 	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT) |
-	  (R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));	
+	  (R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));
       p |=  R200_SPECULAR_ENABLE;
    } else {
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= 
-	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT));	
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |=
+	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT));
    }
 
    if (ctx->Fog.Enabled) {
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= 
-	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));	
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |=
+	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));
       rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] |= R200_OUTPUT_COLOR_1;
    }
 
@@ -958,7 +850,7 @@ static void r200UpdateSpecular( GLcontext *ctx )
 
    /* Update vertex/render formats
     */
-   if (rmesa->TclFallback) { 
+   if (rmesa->radeon.TclFallback) {
       r200ChooseRenderState( ctx );
       r200ChooseVertexState( ctx );
    }
@@ -970,7 +862,7 @@ static void r200UpdateSpecular( GLcontext *ctx )
  */
 
 
-/* Update on colormaterial, material emmissive/ambient, 
+/* Update on colormaterial, material emmissive/ambient,
  * lightmodel.globalambient
  */
 static void update_global_ambient( GLcontext *ctx )
@@ -984,23 +876,23 @@ static void update_global_ambient( GLcontext *ctx )
     */
    if ((rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_1] &
        ((3 << R200_FRONT_EMISSIVE_SOURCE_SHIFT) |
-	(3 << R200_FRONT_AMBIENT_SOURCE_SHIFT))) == 0) 
+	(3 << R200_FRONT_AMBIENT_SOURCE_SHIFT))) == 0)
    {
-      COPY_3V( &fcmd[GLT_RED], 
+      COPY_3V( &fcmd[GLT_RED],
 	       ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_EMISSION]);
       ACC_SCALE_3V( &fcmd[GLT_RED],
 		   ctx->Light.Model.Ambient,
 		   ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_AMBIENT]);
-   } 
+   }
    else
    {
       COPY_3V( &fcmd[GLT_RED], ctx->Light.Model.Ambient );
    }
-   
+
    R200_DB_STATECHANGE(rmesa, &rmesa->hw.glt);
 }
 
-/* Update on change to 
+/* Update on change to
  *    - light[p].colors
  *    - light[p].enabled
  */
@@ -1014,10 +906,10 @@ static void update_light_colors( GLcontext *ctx, GLuint p )
       r200ContextPtr rmesa = R200_CONTEXT(ctx);
       float *fcmd = (float *)R200_DB_STATE( lit[p] );
 
-      COPY_4V( &fcmd[LIT_AMBIENT_RED], l->Ambient );	 
+      COPY_4V( &fcmd[LIT_AMBIENT_RED], l->Ambient );
       COPY_4V( &fcmd[LIT_DIFFUSE_RED], l->Diffuse );
       COPY_4V( &fcmd[LIT_SPECULAR_RED], l->Specular );
-      
+
       R200_DB_STATECHANGE( rmesa, &rmesa->hw.lit[p] );
    }
 }
@@ -1037,7 +929,7 @@ static void r200ColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
 
    if (ctx->Light.ColorMaterialEnabled) {
       GLuint mask = ctx->Light.ColorMaterialBitmask;
-   
+
       if (mask & MAT_BIT_FRONT_EMISSION) {
 	 light_model_ctl1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
 			     R200_FRONT_EMISSIVE_SOURCE_SHIFT);
@@ -1053,7 +945,7 @@ static void r200ColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
       else
          light_model_ctl1 |= (R200_LM1_SOURCE_MATERIAL_0 <<
 			     R200_FRONT_AMBIENT_SOURCE_SHIFT);
-	 
+
       if (mask & MAT_BIT_FRONT_DIFFUSE) {
 	 light_model_ctl1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
 			     R200_FRONT_DIFFUSE_SOURCE_SHIFT);
@@ -1061,7 +953,7 @@ static void r200ColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
       else
          light_model_ctl1 |= (R200_LM1_SOURCE_MATERIAL_0 <<
 			     R200_FRONT_DIFFUSE_SOURCE_SHIFT);
-   
+
       if (mask & MAT_BIT_FRONT_SPECULAR) {
 	 light_model_ctl1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
 			     R200_FRONT_SPECULAR_SOURCE_SHIFT);
@@ -1070,7 +962,7 @@ static void r200ColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
          light_model_ctl1 |= (R200_LM1_SOURCE_MATERIAL_0 <<
 			     R200_FRONT_SPECULAR_SOURCE_SHIFT);
       }
-   
+
       if (mask & MAT_BIT_BACK_EMISSION) {
 	 light_model_ctl1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
 			     R200_BACK_EMISSIVE_SOURCE_SHIFT);
@@ -1120,8 +1012,8 @@ static void r200ColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
       R200_STATECHANGE( rmesa, tcl );
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_1] = light_model_ctl1;
    }
-   
-   
+
+
 }
 
 void r200UpdateMaterial( GLcontext *ctx )
@@ -1131,12 +1023,12 @@ void r200UpdateMaterial( GLcontext *ctx )
    GLfloat *fcmd = (GLfloat *)R200_DB_STATE( mtl[0] );
    GLfloat *fcmd2 = (GLfloat *)R200_DB_STATE( mtl[1] );
    GLuint mask = ~0;
-   
+
    /* Might be possible and faster to update everything unconditionally? */
    if (ctx->Light.ColorMaterialEnabled)
       mask &= ~ctx->Light.ColorMaterialBitmask;
 
-   if (R200_DEBUG & DEBUG_STATE)
+   if (R200_DEBUG & RADEON_STATE)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
    if (mask & MAT_BIT_FRONT_EMISSION) {
@@ -1210,14 +1102,14 @@ void r200UpdateMaterial( GLcontext *ctx )
  *       _VP_inf_norm
  *       _h_inf_norm
  *       _Position
- *       _NormDirection
+ *       _NormSpotDirection
  *       _ModelViewInvScale
  *       _NeedEyeCoords
  *       _EyeZDir
  *
  * which are calculated in light.c and are correct for the current
  * lighting space (model or eye), hence dependencies on _NEW_MODELVIEW
- * and _MESA_NEW_NEED_EYE_COORDS.  
+ * and _MESA_NEW_NEED_EYE_COORDS.
  */
 static void update_light( GLcontext *ctx )
 {
@@ -1234,8 +1126,8 @@ static void update_light( GLcontext *ctx )
 	 tmp &= ~R200_LIGHT_IN_MODELSPACE;
       else
 	 tmp |= R200_LIGHT_IN_MODELSPACE;
-      
-      if (tmp != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0]) 
+
+      if (tmp != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0])
       {
 	 R200_STATECHANGE( rmesa, tcl );
 	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] = tmp;
@@ -1259,17 +1151,17 @@ static void update_light( GLcontext *ctx )
 	 if (ctx->Light.Light[p].Enabled) {
 	    struct gl_light *l = &ctx->Light.Light[p];
 	    GLfloat *fcmd = (GLfloat *)R200_DB_STATE( lit[p] );
-	    
+
 	    if (l->EyePosition[3] == 0.0) {
-	       COPY_3FV( &fcmd[LIT_POSITION_X], l->_VP_inf_norm ); 
-	       COPY_3FV( &fcmd[LIT_DIRECTION_X], l->_h_inf_norm ); 
+	       COPY_3FV( &fcmd[LIT_POSITION_X], l->_VP_inf_norm );
+	       COPY_3FV( &fcmd[LIT_DIRECTION_X], l->_h_inf_norm );
 	       fcmd[LIT_POSITION_W] = 0;
 	       fcmd[LIT_DIRECTION_W] = 0;
 	    } else {
 	       COPY_4V( &fcmd[LIT_POSITION_X], l->_Position );
-	       fcmd[LIT_DIRECTION_X] = -l->_NormDirection[0];
-	       fcmd[LIT_DIRECTION_Y] = -l->_NormDirection[1];
-	       fcmd[LIT_DIRECTION_Z] = -l->_NormDirection[2];
+	       fcmd[LIT_DIRECTION_X] = -l->_NormSpotDirection[0];
+	       fcmd[LIT_DIRECTION_Y] = -l->_NormSpotDirection[1];
+	       fcmd[LIT_DIRECTION_Z] = -l->_NormSpotDirection[2];
 	       fcmd[LIT_DIRECTION_W] = 0;
 	    }
 
@@ -1286,21 +1178,21 @@ static void r200Lightfv( GLcontext *ctx, GLenum light,
    GLint p = light - GL_LIGHT0;
    struct gl_light *l = &ctx->Light.Light[p];
    GLfloat *fcmd = (GLfloat *)rmesa->hw.lit[p].cmd;
-   
+
 
    switch (pname) {
-   case GL_AMBIENT:		
+   case GL_AMBIENT:
    case GL_DIFFUSE:
    case GL_SPECULAR:
       update_light_colors( ctx, p );
       break;
 
-   case GL_SPOT_DIRECTION: 
-      /* picked up in update_light */	
+   case GL_SPOT_DIRECTION:
+      /* picked up in update_light */
       break;
 
    case GL_POSITION: {
-      /* positions picked up in update_light, but can do flag here */	
+      /* positions picked up in update_light, but can do flag here */
       GLuint flag = (p&1)? R200_LIGHT_1_IS_LOCAL : R200_LIGHT_0_IS_LOCAL;
       GLuint idx = TCL_PER_LIGHT_CTL_0 + p/2;
 
@@ -1416,7 +1308,7 @@ static void r200LightModelfv( GLcontext *ctx, GLenum pname,
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
    switch (pname) {
-      case GL_LIGHT_MODEL_AMBIENT: 
+      case GL_LIGHT_MODEL_AMBIENT:
 	 update_global_ambient( ctx );
 	 break;
 
@@ -1430,7 +1322,7 @@ static void r200LightModelfv( GLcontext *ctx, GLenum pname,
 	    rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] |= R200_LIGHT_TWOSIDE;
 	 else
 	    rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &= ~(R200_LIGHT_TWOSIDE);
-	 if (rmesa->TclFallback) {
+	 if (rmesa->radeon.TclFallback) {
 	    r200ChooseRenderState( ctx );
 	    r200ChooseVertexState( ctx );
 	 }
@@ -1675,7 +1567,7 @@ static void r200ClearStencil( GLcontext *ctx, GLint s )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
-   rmesa->state.stencil.clear = 
+   rmesa->radeon.state.stencil.clear =
       ((GLuint) (ctx->Stencil.Clear & 0xff) |
        (0xff << R200_STENCIL_MASK_SHIFT) |
        ((ctx->Stencil.WriteMask[0] & 0xff) << R200_STENCIL_WRITEMASK_SHIFT));
@@ -1700,19 +1592,29 @@ static void r200ClearStencil( GLcontext *ctx, GLint s )
 void r200UpdateWindow( GLcontext *ctx )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
-   GLfloat xoffset = (GLfloat)dPriv->x;
-   GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
+   GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
+   const GLboolean render_to_fbo = (ctx->DrawBuffer ? (ctx->DrawBuffer->Name != 0) : 0);
+   const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+   GLfloat y_scale, y_bias;
+
+   if (render_to_fbo) {
+      y_scale = 1.0;
+      y_bias = 0;
+   } else {
+      y_scale = -1.0;
+      y_bias = yoffset;
+   }
 
    float_ui32_type sx = { v[MAT_SX] };
    float_ui32_type tx = { v[MAT_TX] + xoffset + SUBPIXEL_X };
-   float_ui32_type sy = { - v[MAT_SY] };
-   float_ui32_type ty = { (- v[MAT_TY]) + yoffset + SUBPIXEL_Y };
-   float_ui32_type sz = { v[MAT_SZ] * rmesa->state.depth.scale };
-   float_ui32_type tz = { v[MAT_TZ] * rmesa->state.depth.scale };
+   float_ui32_type sy = { v[MAT_SY] * y_scale };
+   float_ui32_type ty = { (v[MAT_TY] * y_scale) + y_bias + SUBPIXEL_Y };
+   float_ui32_type sz = { v[MAT_SZ] * depthScale };
+   float_ui32_type tz = { v[MAT_TZ] * depthScale };
 
-   R200_FIREVERTICES( rmesa );
    R200_STATECHANGE( rmesa, vpt );
 
    rmesa->hw.vpt.cmd[VPT_SE_VPORT_XSCALE]  = sx.ui32;
@@ -1723,6 +1625,30 @@ void r200UpdateWindow( GLcontext *ctx )
    rmesa->hw.vpt.cmd[VPT_SE_VPORT_ZOFFSET] = tz.ui32;
 }
 
+void r200_vtbl_update_scissor( GLcontext *ctx )
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   unsigned x1, y1, x2, y2;
+   struct radeon_renderbuffer *rrb;
+
+   R200_SET_STATE(r200, set, SET_RE_CNTL, R200_SCISSOR_ENABLE | r200->hw.set.cmd[SET_RE_CNTL]);
+
+   if (r200->radeon.state.scissor.enabled) {
+      x1 = r200->radeon.state.scissor.rect.x1;
+      y1 = r200->radeon.state.scissor.rect.y1;
+      x2 = r200->radeon.state.scissor.rect.x2;
+      y2 = r200->radeon.state.scissor.rect.y2;
+   } else {
+      rrb = radeon_get_colorbuffer(&r200->radeon);
+      x1 = 0;
+      y1 = 0;
+      x2 = rrb->base.Width - 1;
+      y2 = rrb->base.Height - 1;
+   }
+
+   R200_SET_STATE(r200, sci, SCI_XY_1, x1 | (y1 << 16));
+   R200_SET_STATE(r200, sci, SCI_XY_2, x2 | (y2 << 16));
+}
 
 
 static void r200Viewport( GLcontext *ctx, GLint x, GLint y,
@@ -1733,6 +1659,8 @@ static void r200Viewport( GLcontext *ctx, GLint x, GLint y,
     * values, or keep the originals hanging around.
     */
    r200UpdateWindow( ctx );
+
+   radeon_viewport(ctx, x, y, width, height);
 }
 
 static void r200DepthRange( GLcontext *ctx, GLclampd nearval,
@@ -1744,7 +1672,7 @@ static void r200DepthRange( GLcontext *ctx, GLclampd nearval,
 void r200UpdateViewportOffset( GLcontext *ctx )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLfloat xoffset = (GLfloat)dPriv->x;
    GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -1774,8 +1702,8 @@ void r200UpdateViewportOffset( GLcontext *ctx )
                 R200_STIPPLE_Y_OFFSET_MASK);
 
          /* add magic offsets, then invert */
-         stx = 31 - ((rmesa->dri.drawable->x - 1) & R200_STIPPLE_COORD_MASK);
-         sty = 31 - ((rmesa->dri.drawable->y + rmesa->dri.drawable->h - 1)
+         stx = 31 - ((dPriv->x - 1) & R200_STIPPLE_COORD_MASK);
+         sty = 31 - ((dPriv->y + dPriv->h - 1)
                      & R200_STIPPLE_COORD_MASK);
 
          m |= ((stx << R200_STIPPLE_X_OFFSET_SHIFT) |
@@ -1788,7 +1716,7 @@ void r200UpdateViewportOffset( GLcontext *ctx )
       }
    }
 
-   r200UpdateScissor( ctx );
+   radeonUpdateScissor( ctx );
 }
 
 
@@ -1801,11 +1729,16 @@ static void r200ClearColor( GLcontext *ctx, const GLfloat c[4] )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    GLubyte color[4];
+   struct radeon_renderbuffer *rrb;
+
+   rrb = radeon_get_colorbuffer(&rmesa->radeon);
+   if (!rrb)
+     return;
    CLAMPED_FLOAT_TO_UBYTE(color[0], c[0]);
    CLAMPED_FLOAT_TO_UBYTE(color[1], c[1]);
    CLAMPED_FLOAT_TO_UBYTE(color[2], c[2]);
    CLAMPED_FLOAT_TO_UBYTE(color[3], c[3]);
-   rmesa->state.color.clear = r200PackColor( rmesa->r200Screen->cpp,
+   rmesa->radeon.state.color.clear = radeonPackColor( rrb->cpp,
                                              color[0], color[1],
                                              color[2], color[3] );
 }
@@ -1848,96 +1781,6 @@ static void r200LogicOpCode( GLcontext *ctx, GLenum opcode )
    rmesa->hw.msk.cmd[MSK_RB3D_ROPCNTL] = r200_rop_tab[rop];
 }
 
-
-/*
- * Set up the cliprects for either front or back-buffer drawing.
- */
-void r200SetCliprects( r200ContextPtr rmesa )
-{
-   __DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
-   __DRIdrawablePrivate *const readable = rmesa->dri.readable;
-   GLframebuffer *const draw_fb = (GLframebuffer*) drawable->driverPrivate;
-   GLframebuffer *const read_fb = (GLframebuffer*) readable->driverPrivate;
-
-   if (draw_fb->_ColorDrawBufferIndexes[0] == BUFFER_BIT_BACK_LEFT) {
-      /* Can't ignore 2d windows if we are page flipping.
-       */
-      if ( drawable->numBackClipRects == 0 || rmesa->doPageFlip ) {
-         rmesa->numClipRects = drawable->numClipRects;
-         rmesa->pClipRects = drawable->pClipRects;
-      }
-      else {
-         rmesa->numClipRects = drawable->numBackClipRects;
-         rmesa->pClipRects = drawable->pBackClipRects;
-      }
-   }
-   else {
-     /* front buffer (or none, or multiple buffers) */
-     rmesa->numClipRects = drawable->numClipRects;
-     rmesa->pClipRects = drawable->pClipRects;
-  }
-
-   if ((draw_fb->Width != drawable->w) || (draw_fb->Height != drawable->h)) {
-      _mesa_resize_framebuffer(rmesa->glCtx, draw_fb,
-			       drawable->w, drawable->h);
-      draw_fb->Initialized = GL_TRUE;
-   }
-
-   if (drawable != readable) {
-      if ((read_fb->Width != readable->w) ||
-	  (read_fb->Height != readable->h)) {
-	 _mesa_resize_framebuffer(rmesa->glCtx, read_fb,
-				  readable->w, readable->h);
-	 read_fb->Initialized = GL_TRUE;
-      }
-   }
-
-   if (rmesa->state.scissor.enabled)
-      r200RecalcScissorRects( rmesa );
-
-   rmesa->lastStamp = drawable->lastStamp;
-}
-
-
-static void r200DrawBuffer( GLcontext *ctx, GLenum mode )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if (R200_DEBUG & DEBUG_DRI)
-      fprintf(stderr, "%s %s\n", __FUNCTION__,
-	      _mesa_lookup_enum_by_nr( mode ));
-
-   R200_FIREVERTICES(rmesa);	/* don't pipeline cliprect changes */
-
-   if (ctx->DrawBuffer->_NumColorDrawBuffers != 1) {
-      /* 0 (GL_NONE) buffers or multiple color drawing buffers */
-      FALLBACK( rmesa, R200_FALLBACK_DRAW_BUFFER, GL_TRUE );
-      return;
-   }
-
-   switch ( ctx->DrawBuffer->_ColorDrawBufferIndexes[0] ) {
-   case BUFFER_FRONT_LEFT:
-   case BUFFER_BACK_LEFT:
-      FALLBACK( rmesa, R200_FALLBACK_DRAW_BUFFER, GL_FALSE );
-      break;
-   default:
-      FALLBACK( rmesa, R200_FALLBACK_DRAW_BUFFER, GL_TRUE );
-      return;
-   }
-
-   r200SetCliprects( rmesa );
-
-   /* We'll set the drawing engine's offset/pitch parameters later
-    * when we update other state.
-    */
-}
-
-
-static void r200ReadBuffer( GLcontext *ctx, GLenum mode )
-{
-   /* nothing, until we implement h/w glRead/CopyPixels or CopyTexImage */
-}
-
 /* =============================================================
  * State enable/disable
  */
@@ -1947,7 +1790,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    GLuint p, flag;
 
-   if ( R200_DEBUG & DEBUG_STATE )
+   if ( R200_DEBUG & RADEON_STATE )
       fprintf( stderr, "%s( %s = %s )\n", __FUNCTION__,
 	       _mesa_lookup_enum_by_nr( cap ),
 	       state ? "GL_TRUE" : "GL_FALSE" );
@@ -1979,7 +1822,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_CLIP_PLANE2:
    case GL_CLIP_PLANE3:
    case GL_CLIP_PLANE4:
-   case GL_CLIP_PLANE5: 
+   case GL_CLIP_PLANE5:
       p = cap-GL_CLIP_PLANE0;
       R200_STATECHANGE( rmesa, tcl );
       if (state) {
@@ -2013,10 +1856,10 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
       R200_STATECHANGE(rmesa, ctx );
       if ( state ) {
 	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  R200_DITHER_ENABLE;
-	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~rmesa->state.color.roundEnable;
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~rmesa->radeon.state.color.roundEnable;
       } else {
 	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~R200_DITHER_ENABLE;
-	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  rmesa->state.color.roundEnable;
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  rmesa->radeon.state.color.roundEnable;
       }
       break;
 
@@ -2031,7 +1874,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
 	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~R200_TCL_FOG_MASK;
       }
       r200UpdateSpecular( ctx ); /* for PK_SPEC */
-      if (rmesa->TclFallback) 
+      if (rmesa->radeon.TclFallback)
 	 r200ChooseVertexState( ctx );
       _mesa_allow_light_in_model( ctx, !state );
       break;
@@ -2046,13 +1889,13 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_LIGHT7:
       R200_STATECHANGE(rmesa, tcl);
       p = cap - GL_LIGHT0;
-      if (p&1) 
+      if (p&1)
 	 flag = (R200_LIGHT_1_ENABLE |
-		 R200_LIGHT_1_ENABLE_AMBIENT | 
+		 R200_LIGHT_1_ENABLE_AMBIENT |
 		 R200_LIGHT_1_ENABLE_SPECULAR);
       else
 	 flag = (R200_LIGHT_0_ENABLE |
-		 R200_LIGHT_0_ENABLE_AMBIENT | 
+		 R200_LIGHT_0_ENABLE_AMBIENT |
 		 R200_LIGHT_0_ENABLE_SPECULAR);
 
       if (state)
@@ -2060,7 +1903,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
       else
 	 rmesa->hw.tcl.cmd[p/2 + TCL_PER_LIGHT_CTL_0] &= ~flag;
 
-      /* 
+      /*
        */
       update_light_colors( ctx, p );
       break;
@@ -2068,7 +1911,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_LIGHTING:
       r200UpdateSpecular(ctx);
       /* for reflection map fixup - might set recheck_texgen for all units too */
-      rmesa->NewGLState |= _NEW_TEXTURE;
+      rmesa->radeon.NewGLState |= _NEW_TEXTURE;
       break;
 
    case GL_LINE_SMOOTH:
@@ -2181,21 +2024,30 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
    }
 
    case GL_SCISSOR_TEST:
-      R200_FIREVERTICES( rmesa );
-      rmesa->state.scissor.enabled = state;
-      r200UpdateScissor( ctx );
+      radeon_firevertices(&rmesa->radeon);
+      rmesa->radeon.state.scissor.enabled = state;
+      radeonUpdateScissor( ctx );
       break;
 
    case GL_STENCIL_TEST:
-      if ( rmesa->state.stencil.hwBuffer ) {
-	 R200_STATECHANGE( rmesa, ctx );
-	 if ( state ) {
-	    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  R200_STENCIL_ENABLE;
+      {
+	 GLboolean hw_stencil = GL_FALSE;
+	 if (ctx->DrawBuffer) {
+	    struct radeon_renderbuffer *rrbStencil
+	       = radeon_get_renderbuffer(ctx->DrawBuffer, BUFFER_STENCIL);
+	    hw_stencil = (rrbStencil && rrbStencil->bo);
+	 }
+
+	 if (hw_stencil) {
+	    R200_STATECHANGE( rmesa, ctx );
+	    if ( state ) {
+	       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  R200_STENCIL_ENABLE;
+	    } else {
+	       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~R200_STENCIL_ENABLE;
+	    }
 	 } else {
-	    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~R200_STENCIL_ENABLE;
+	    FALLBACK( rmesa, R200_FALLBACK_STENCIL, state );
 	 }
-      } else {
-	 FALLBACK( rmesa, R200_FALLBACK_STENCIL, state );
       }
       break;
 
@@ -2205,7 +2057,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_TEXTURE_GEN_T:
       /* Picked up in r200UpdateTextureState.
        */
-      rmesa->recheck_texgen[ctx->Texture.CurrentUnit] = GL_TRUE; 
+      rmesa->recheck_texgen[ctx->Texture.CurrentUnit] = GL_TRUE;
       break;
 
    case GL_COLOR_SUM_EXT:
@@ -2322,7 +2174,7 @@ void r200LightingSpaceChange( GLcontext *ctx )
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    GLboolean tmp;
 
-   if (R200_DEBUG & DEBUG_STATE) 
+   if (R200_DEBUG & RADEON_STATE)
       fprintf(stderr, "%s %d BEFORE %x\n", __FUNCTION__, ctx->_NeedEyeCoords,
 	      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0]);
 
@@ -2338,7 +2190,7 @@ void r200LightingSpaceChange( GLcontext *ctx )
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &= ~R200_RESCALE_NORMALS;
    }
 
-   if (R200_DEBUG & DEBUG_STATE) 
+   if (R200_DEBUG & RADEON_STATE)
       fprintf(stderr, "%s %d AFTER %x\n", __FUNCTION__, ctx->_NeedEyeCoords,
 	      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0]);
 }
@@ -2381,7 +2233,7 @@ static void update_texturematrix( GLcontext *ctx )
    GLuint compsel = rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL];
    int unit;
 
-   if (R200_DEBUG & DEBUG_STATE) 
+   if (R200_DEBUG & RADEON_STATE)
       fprintf(stderr, "%s before COMPSEL: %x\n", __FUNCTION__,
 	      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL]);
 
@@ -2389,7 +2241,7 @@ static void update_texturematrix( GLcontext *ctx )
    rmesa->TexMatCompSel = 0;
 
    for (unit = 0 ; unit < ctx->Const.MaxTextureUnits; unit++) {
-      if (!ctx->Texture.Unit[unit]._ReallyEnabled) 
+      if (!ctx->Texture.Unit[unit]._ReallyEnabled)
 	 continue;
 
       if (ctx->TextureMatrixStack[unit].Top->type != MATRIX_IDENTITY) {
@@ -2399,21 +2251,21 @@ static void update_texturematrix( GLcontext *ctx )
 	 rmesa->TexMatCompSel |= R200_OUTPUT_TEX_0 << unit;
 
 	 if (rmesa->TexGenEnabled & (R200_TEXMAT_0_ENABLE << unit)) {
-	    /* Need to preconcatenate any active texgen 
+	    /* Need to preconcatenate any active texgen
 	     * obj/eyeplane matrices:
 	     */
 	    _math_matrix_mul_matrix( &rmesa->tmpmat,
-				     ctx->TextureMatrixStack[unit].Top, 
+				     ctx->TextureMatrixStack[unit].Top,
 				     &rmesa->TexGenMatrix[unit] );
 	    upload_matrix( rmesa, rmesa->tmpmat.m, R200_MTX_TEX0+unit );
-	 } 
+	 }
 	 else {
-	    upload_matrix( rmesa, ctx->TextureMatrixStack[unit].Top->m, 
+	    upload_matrix( rmesa, ctx->TextureMatrixStack[unit].Top->m,
 			   R200_MTX_TEX0+unit );
 	 }
       }
       else if (rmesa->TexGenEnabled & (R200_TEXMAT_0_ENABLE << unit)) {
-	 upload_matrix( rmesa, rmesa->TexGenMatrix[unit].m, 
+	 upload_matrix( rmesa, rmesa->TexGenMatrix[unit].m,
 			R200_MTX_TEX0+unit );
       }
    }
@@ -2432,69 +2284,84 @@ static void update_texturematrix( GLcontext *ctx )
    }
 }
 
-
-
-/**
- * Tell the card where to render (offset, pitch).
- * Effected by glDrawBuffer, etc
- */
-void
-r200UpdateDrawBuffer(GLcontext *ctx)
+static GLboolean r200ValidateBuffers(GLcontext *ctx)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   driRenderbuffer *drb;
+   struct radeon_renderbuffer *rrb;
+   struct radeon_dma_bo *dma_bo;
+   int i, ret;
 
-   if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
-      /* draw to front */
-      drb = (driRenderbuffer *) fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
-   }
-   else if (fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
-      /* draw to back */
-      drb = (driRenderbuffer *) fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+	if (RADEON_DEBUG & RADEON_IOCTL)
+		fprintf(stderr, "%s\n", __FUNCTION__);
+   radeon_cs_space_reset_bos(rmesa->radeon.cmdbuf.cs);
+
+   rrb = radeon_get_colorbuffer(&rmesa->radeon);
+   /* color buffer */
+   if (rrb && rrb->bo) {
+     radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, rrb->bo,
+				       0, RADEON_GEM_DOMAIN_VRAM);
    }
-   else {
-      /* drawing to multiple buffers, or none */
-      return;
+
+   /* depth buffer */
+   rrb = radeon_get_depthbuffer(&rmesa->radeon);
+   /* only added when a depth renderbuffer with a bo exists */
+   if (rrb && rrb->bo) {
+     radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, rrb->bo,
+				       0, RADEON_GEM_DOMAIN_VRAM);
    }
 
-   assert(drb);
-   assert(drb->flippedPitch);
+   for (i = 0; i < ctx->Const.MaxTextureImageUnits; ++i) {
+      radeonTexObj *t;
 
-   R200_STATECHANGE( rmesa, ctx );
+      if (!ctx->Texture.Unit[i]._ReallyEnabled)
+	 continue;
 
-   /* Note: we used the (possibly) page-flipped values */
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET]
-     = ((drb->flippedOffset + rmesa->r200Screen->fbLocation)
-	& R200_COLOROFFSET_MASK);
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = drb->flippedPitch;
-   if (rmesa->sarea->tiling_enabled) {
-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
+      t = radeon_tex_obj(ctx->Texture.Unit[i]._Current);
+      if (t->image_override && t->bo)
+	radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, t->bo,
+			   RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
+      else if (t->mt->bo)
+	radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, t->mt->bo,
+			   RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
    }
-}
-
 
+   dma_bo = first_elem(&rmesa->radeon.dma.reserved);
+   {
+       ret = radeon_cs_space_check_with_bo(rmesa->radeon.cmdbuf.cs, dma_bo->bo, RADEON_GEM_DOMAIN_GTT, 0);
+       if (ret)
+	   return GL_FALSE;
+   }
+   return GL_TRUE;
+}
 
-void r200ValidateState( GLcontext *ctx )
+GLboolean r200ValidateState( GLcontext *ctx )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   GLuint new_state = rmesa->NewGLState;
+   GLuint new_state = rmesa->radeon.NewGLState;
 
-   if (new_state & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
-     r200UpdateDrawBuffer(ctx);
+   if (new_state & _NEW_BUFFERS) {
+      _mesa_update_framebuffer(ctx);
+      /* this updates the DrawBuffer's Width/Height if it's a FBO */
+      _mesa_update_draw_buffer_bounds(ctx);
+
+      R200_STATECHANGE(rmesa, ctx);
    }
 
-   if (new_state & (_NEW_TEXTURE | _NEW_PROGRAM)) {
+   if (new_state & (_NEW_TEXTURE | _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS)) {
       r200UpdateTextureState( ctx );
-      new_state |= rmesa->NewGLState; /* may add TEXTURE_MATRIX */
+      new_state |= rmesa->radeon.NewGLState; /* may add TEXTURE_MATRIX */
       r200UpdateLocalViewer( ctx );
    }
 
+   /* we need to do a space check here */
+   if (!r200ValidateBuffers(ctx))
+     return GL_FALSE;
+
 /* FIXME: don't really need most of these when vertex progs are enabled */
 
    /* Need an event driven matrix update?
     */
-   if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION)) 
+   if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION))
       upload_matrix( rmesa, ctx->_ModelProjectMatrix.m, R200_MTX_MVP );
 
    /* Need these for lighting (shouldn't upload otherwise)
@@ -2518,11 +2385,12 @@ void r200ValidateState( GLcontext *ctx )
    /* emit all active clip planes if projection matrix changes.
     */
    if (new_state & (_NEW_PROJECTION)) {
-      if (ctx->Transform.ClipPlanesEnabled) 
+      if (ctx->Transform.ClipPlanesEnabled)
 	 r200UpdateClipPlanes( ctx );
    }
 
    if (new_state & (_NEW_PROGRAM|
+                    _NEW_PROGRAM_CONSTANTS |
    /* need to test for pretty much anything due to possible parameter bindings */
 	_NEW_MODELVIEW|_NEW_PROJECTION|_NEW_TRANSFORM|
 	_NEW_LIGHT|_NEW_TEXTURE|_NEW_TEXTURE_MATRIX|
@@ -2533,7 +2401,8 @@ void r200ValidateState( GLcontext *ctx )
       else TCL_FALLBACK(ctx, R200_TCL_FALLBACK_VERTEX_PROGRAM, 0);
    }
 
-   rmesa->NewGLState = 0;
+   rmesa->radeon.NewGLState = 0;
+   return GL_TRUE;
 }
 
 
@@ -2544,7 +2413,7 @@ static void r200InvalidateState( GLcontext *ctx, GLuint new_state )
    _vbo_InvalidateState( ctx, new_state );
    _tnl_InvalidateState( ctx, new_state );
    _ae_invalidate_state( ctx, new_state );
-   R200_CONTEXT(ctx)->NewGLState |= new_state;
+   R200_CONTEXT(ctx)->radeon.NewGLState |= new_state;
 }
 
 /* A hack.  The r200 can actually cope just fine with materials
@@ -2573,12 +2442,13 @@ static void r200WrapRunPipeline( GLcontext *ctx )
    GLboolean has_material;
 
    if (0)
-      fprintf(stderr, "%s, newstate: %x\n", __FUNCTION__, rmesa->NewGLState);
+      fprintf(stderr, "%s, newstate: %x\n", __FUNCTION__, rmesa->radeon.NewGLState);
 
    /* Validate state:
     */
-   if (rmesa->NewGLState)
-      r200ValidateState( ctx );
+   if (rmesa->radeon.NewGLState)
+      if (!r200ValidateState( ctx ))
+	 FALLBACK(rmesa, RADEON_FALLBACK_TEXTURE, GL_TRUE);
 
    has_material = !ctx->VertexProgram._Enabled && ctx->Light.Enabled && check_material( ctx );
 
@@ -2587,7 +2457,7 @@ static void r200WrapRunPipeline( GLcontext *ctx )
    }
 
    /* Run the pipeline.
-    */ 
+    */
    _tnl_run_pipeline( ctx );
 
    if (has_material) {
@@ -2596,15 +2466,30 @@ static void r200WrapRunPipeline( GLcontext *ctx )
 }
 
 
+static void r200PolygonStipple( GLcontext *ctx, const GLubyte *mask )
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   GLint i;
+
+   radeon_firevertices(&r200->radeon);
+
+   R200_STATECHANGE(r200, stp);
+
+   /* Must flip pattern upside down.
+    */
+   for ( i = 31 ; i >= 0; i--) {
+     r200->hw.stp.cmd[3 + i] = ((GLuint *) mask)[i];
+   }
+}
 /* Initialize the driver's state functions.
  */
-void r200InitStateFuncs( struct dd_function_table *functions )
+void r200InitStateFuncs( struct dd_function_table *functions, GLboolean dri2 )
 {
    functions->UpdateState		= r200InvalidateState;
    functions->LightingSpaceChange	= r200LightingSpaceChange;
 
-   functions->DrawBuffer		= r200DrawBuffer;
-   functions->ReadBuffer		= r200ReadBuffer;
+   functions->DrawBuffer		= radeonDrawBuffer;
+   functions->ReadBuffer		= radeonReadBuffer;
 
    functions->AlphaFunc			= r200AlphaFunc;
    functions->BlendColor		= r200BlendColor;
@@ -2632,11 +2517,14 @@ void r200InitStateFuncs( struct dd_function_table *functions )
    functions->LogicOpcode		= r200LogicOpCode;
    functions->PolygonMode		= r200PolygonMode;
    functions->PolygonOffset		= r200PolygonOffset;
-   functions->PolygonStipple		= r200PolygonStipple;
+   if (dri2)
+      functions->PolygonStipple		= r200PolygonStipple;
+   else
+      functions->PolygonStipple		= radeonPolygonStipplePreKMS;
    functions->PointParameterfv		= r200PointParameter;
    functions->PointSize			= r200PointSize;
    functions->RenderMode		= r200RenderMode;
-   functions->Scissor			= r200Scissor;
+   functions->Scissor			= radeonScissor;
    functions->ShadeModel		= r200ShadeModel;
    functions->StencilFuncSeparate	= r200StencilFuncSeparate;
    functions->StencilMaskSeparate	= r200StencilMaskSeparate;
diff --git a/src/mesa/drivers/dri/r200/r200_state.h b/src/mesa/drivers/dri/r200/r200_state.h
index a917163a00..9c62f0a644 100644
--- a/src/mesa/drivers/dri/r200/r200_state.h
+++ b/src/mesa/drivers/dri/r200/r200_state.h
@@ -38,28 +38,24 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_context.h"
 
 extern void r200InitState( r200ContextPtr rmesa );
-extern void r200InitStateFuncs( struct dd_function_table *functions );
+extern void r200InitStateFuncs( struct dd_function_table *functions, GLboolean dri2 );
 extern void r200InitTnlFuncs( GLcontext *ctx );
 
 extern void r200UpdateMaterial( GLcontext *ctx );
 
-extern void r200SetCliprects( r200ContextPtr rmesa );
-extern void r200RecalcScissorRects( r200ContextPtr rmesa );
 extern void r200UpdateViewportOffset( GLcontext *ctx );
 extern void r200UpdateWindow( GLcontext *ctx );
 extern void r200UpdateDrawBuffer(GLcontext *ctx);
 
-extern void r200ValidateState( GLcontext *ctx );
-
-extern void r200PrintDirty( r200ContextPtr rmesa,
-			      const char *msg );
+extern GLboolean r200ValidateState( GLcontext *ctx );
 
+extern void r200_vtbl_update_scissor( GLcontext *ctx );
 
 extern void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode );
 #define FALLBACK( rmesa, bit, mode ) do {				\
    if ( 0 ) fprintf( stderr, "FALLBACK in %s: #%d=%d\n",		\
 		     __FUNCTION__, bit, mode );				\
-   r200Fallback( rmesa->glCtx, bit, mode );				\
+   r200Fallback( rmesa->radeon.glCtx, bit, mode );				\
 } while (0)
 
 extern void r200LightingSpaceChange( GLcontext *ctx );
diff --git a/src/mesa/drivers/dri/r200/r200_state_init.c b/src/mesa/drivers/dri/r200/r200_state_init.c
index 9e4677eda4..7697306d88 100644
--- a/src/mesa/drivers/dri/r200/r200_state_init.c
+++ b/src/mesa/drivers/dri/r200/r200_state_init.c
@@ -43,40 +43,141 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/t_pipeline.h"
 #include "swrast_setup/swrast_setup.h"
 
+#include "radeon_common.h"
+#include "radeon_mipmap_tree.h"
 #include "r200_context.h"
 #include "r200_ioctl.h"
 #include "r200_state.h"
 #include "r200_tcl.h"
 #include "r200_tex.h"
 #include "r200_swtcl.h"
+#include "radeon_queryobj.h"
 
 #include "xmlpool.h"
 
+/* New (1.3) state mechanism.  3 commands (packet, scalar, vector) in
+ * 1.3 cmdbuffers allow all previous state to be updated as well as
+ * the tcl scalar and vector areas.
+ */
+static struct {
+	int start;
+	int len;
+	const char *name;
+} packet[RADEON_MAX_STATE_PACKETS] = {
+	{RADEON_PP_MISC, 7, "RADEON_PP_MISC"},
+	{RADEON_PP_CNTL, 3, "RADEON_PP_CNTL"},
+	{RADEON_RB3D_COLORPITCH, 1, "RADEON_RB3D_COLORPITCH"},
+	{RADEON_RE_LINE_PATTERN, 2, "RADEON_RE_LINE_PATTERN"},
+	{RADEON_SE_LINE_WIDTH, 1, "RADEON_SE_LINE_WIDTH"},
+	{RADEON_PP_LUM_MATRIX, 1, "RADEON_PP_LUM_MATRIX"},
+	{RADEON_PP_ROT_MATRIX_0, 2, "RADEON_PP_ROT_MATRIX_0"},
+	{RADEON_RB3D_STENCILREFMASK, 3, "RADEON_RB3D_STENCILREFMASK"},
+	{RADEON_SE_VPORT_XSCALE, 6, "RADEON_SE_VPORT_XSCALE"},
+	{RADEON_SE_CNTL, 2, "RADEON_SE_CNTL"},
+	{RADEON_SE_CNTL_STATUS, 1, "RADEON_SE_CNTL_STATUS"},
+	{RADEON_RE_MISC, 1, "RADEON_RE_MISC"},
+	{RADEON_PP_TXFILTER_0, 6, "RADEON_PP_TXFILTER_0"},
+	{RADEON_PP_BORDER_COLOR_0, 1, "RADEON_PP_BORDER_COLOR_0"},
+	{RADEON_PP_TXFILTER_1, 6, "RADEON_PP_TXFILTER_1"},
+	{RADEON_PP_BORDER_COLOR_1, 1, "RADEON_PP_BORDER_COLOR_1"},
+	{RADEON_PP_TXFILTER_2, 6, "RADEON_PP_TXFILTER_2"},
+	{RADEON_PP_BORDER_COLOR_2, 1, "RADEON_PP_BORDER_COLOR_2"},
+	{RADEON_SE_ZBIAS_FACTOR, 2, "RADEON_SE_ZBIAS_FACTOR"},
+	{RADEON_SE_TCL_OUTPUT_VTX_FMT, 11, "RADEON_SE_TCL_OUTPUT_VTX_FMT"},
+	{RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED, 17,
+		    "RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED"},
+	{R200_PP_TXCBLEND_0, 4, "R200_PP_TXCBLEND_0"},
+	{R200_PP_TXCBLEND_1, 4, "R200_PP_TXCBLEND_1"},
+	{R200_PP_TXCBLEND_2, 4, "R200_PP_TXCBLEND_2"},
+	{R200_PP_TXCBLEND_3, 4, "R200_PP_TXCBLEND_3"},
+	{R200_PP_TXCBLEND_4, 4, "R200_PP_TXCBLEND_4"},
+	{R200_PP_TXCBLEND_5, 4, "R200_PP_TXCBLEND_5"},
+	{R200_PP_TXCBLEND_6, 4, "R200_PP_TXCBLEND_6"},
+	{R200_PP_TXCBLEND_7, 4, "R200_PP_TXCBLEND_7"},
+	{R200_SE_TCL_LIGHT_MODEL_CTL_0, 6, "R200_SE_TCL_LIGHT_MODEL_CTL_0"},
+	{R200_PP_TFACTOR_0, 6, "R200_PP_TFACTOR_0"},
+	{R200_SE_VTX_FMT_0, 4, "R200_SE_VTX_FMT_0"},
+	{R200_SE_VAP_CNTL, 1, "R200_SE_VAP_CNTL"},
+	{R200_SE_TCL_MATRIX_SEL_0, 5, "R200_SE_TCL_MATRIX_SEL_0"},
+	{R200_SE_TCL_TEX_PROC_CTL_2, 5, "R200_SE_TCL_TEX_PROC_CTL_2"},
+	{R200_SE_TCL_UCP_VERT_BLEND_CTL, 1, "R200_SE_TCL_UCP_VERT_BLEND_CTL"},
+	{R200_PP_TXFILTER_0, 6, "R200_PP_TXFILTER_0"},
+	{R200_PP_TXFILTER_1, 6, "R200_PP_TXFILTER_1"},
+	{R200_PP_TXFILTER_2, 6, "R200_PP_TXFILTER_2"},
+	{R200_PP_TXFILTER_3, 6, "R200_PP_TXFILTER_3"},
+	{R200_PP_TXFILTER_4, 6, "R200_PP_TXFILTER_4"},
+	{R200_PP_TXFILTER_5, 6, "R200_PP_TXFILTER_5"},
+	{R200_PP_TXOFFSET_0, 1, "R200_PP_TXOFFSET_0"},
+	{R200_PP_TXOFFSET_1, 1, "R200_PP_TXOFFSET_1"},
+	{R200_PP_TXOFFSET_2, 1, "R200_PP_TXOFFSET_2"},
+	{R200_PP_TXOFFSET_3, 1, "R200_PP_TXOFFSET_3"},
+	{R200_PP_TXOFFSET_4, 1, "R200_PP_TXOFFSET_4"},
+	{R200_PP_TXOFFSET_5, 1, "R200_PP_TXOFFSET_5"},
+	{R200_SE_VTE_CNTL, 1, "R200_SE_VTE_CNTL"},
+	{R200_SE_TCL_OUTPUT_VTX_COMP_SEL, 1,
+	 "R200_SE_TCL_OUTPUT_VTX_COMP_SEL"},
+	{R200_PP_TAM_DEBUG3, 1, "R200_PP_TAM_DEBUG3"},
+	{R200_PP_CNTL_X, 1, "R200_PP_CNTL_X"},
+	{R200_RB3D_DEPTHXY_OFFSET, 1, "R200_RB3D_DEPTHXY_OFFSET"},
+	{R200_RE_AUX_SCISSOR_CNTL, 1, "R200_RE_AUX_SCISSOR_CNTL"},
+	{R200_RE_SCISSOR_TL_0, 2, "R200_RE_SCISSOR_TL_0"},
+	{R200_RE_SCISSOR_TL_1, 2, "R200_RE_SCISSOR_TL_1"},
+	{R200_RE_SCISSOR_TL_2, 2, "R200_RE_SCISSOR_TL_2"},
+	{R200_SE_VAP_CNTL_STATUS, 1, "R200_SE_VAP_CNTL_STATUS"},
+	{R200_SE_VTX_STATE_CNTL, 1, "R200_SE_VTX_STATE_CNTL"},
+	{R200_RE_POINTSIZE, 1, "R200_RE_POINTSIZE"},
+	{R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0, 4,
+		    "R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0"},
+	{R200_PP_CUBIC_FACES_0, 1, "R200_PP_CUBIC_FACES_0"},	/* 61 */
+	{R200_PP_CUBIC_OFFSET_F1_0, 5, "R200_PP_CUBIC_OFFSET_F1_0"}, /* 62 */
+	{R200_PP_CUBIC_FACES_1, 1, "R200_PP_CUBIC_FACES_1"},
+	{R200_PP_CUBIC_OFFSET_F1_1, 5, "R200_PP_CUBIC_OFFSET_F1_1"},
+	{R200_PP_CUBIC_FACES_2, 1, "R200_PP_CUBIC_FACES_2"},
+	{R200_PP_CUBIC_OFFSET_F1_2, 5, "R200_PP_CUBIC_OFFSET_F1_2"},
+	{R200_PP_CUBIC_FACES_3, 1, "R200_PP_CUBIC_FACES_3"},
+	{R200_PP_CUBIC_OFFSET_F1_3, 5, "R200_PP_CUBIC_OFFSET_F1_3"},
+	{R200_PP_CUBIC_FACES_4, 1, "R200_PP_CUBIC_FACES_4"},
+	{R200_PP_CUBIC_OFFSET_F1_4, 5, "R200_PP_CUBIC_OFFSET_F1_4"},
+	{R200_PP_CUBIC_FACES_5, 1, "R200_PP_CUBIC_FACES_5"},
+	{R200_PP_CUBIC_OFFSET_F1_5, 5, "R200_PP_CUBIC_OFFSET_F1_5"},
+	{RADEON_PP_TEX_SIZE_0, 2, "RADEON_PP_TEX_SIZE_0"},
+	{RADEON_PP_TEX_SIZE_1, 2, "RADEON_PP_TEX_SIZE_1"},
+	{RADEON_PP_TEX_SIZE_2, 2, "RADEON_PP_TEX_SIZE_2"},
+	{R200_RB3D_BLENDCOLOR, 3, "R200_RB3D_BLENDCOLOR"},
+	{R200_SE_TCL_POINT_SPRITE_CNTL, 1, "R200_SE_TCL_POINT_SPRITE_CNTL"},
+	{RADEON_PP_CUBIC_FACES_0, 1, "RADEON_PP_CUBIC_FACES_0"},
+	{RADEON_PP_CUBIC_OFFSET_T0_0, 5, "RADEON_PP_CUBIC_OFFSET_T0_0"},
+	{RADEON_PP_CUBIC_FACES_1, 1, "RADEON_PP_CUBIC_FACES_1"},
+	{RADEON_PP_CUBIC_OFFSET_T1_0, 5, "RADEON_PP_CUBIC_OFFSET_T1_0"},
+	{RADEON_PP_CUBIC_FACES_2, 1, "RADEON_PP_CUBIC_FACES_2"},
+	{RADEON_PP_CUBIC_OFFSET_T2_0, 5, "RADEON_PP_CUBIC_OFFSET_T2_0"},
+	{R200_PP_TRI_PERF, 2, "R200_PP_TRI_PERF"},
+	{R200_PP_TXCBLEND_8, 32, "R200_PP_AFS_0"},     /* 85 */
+	{R200_PP_TXCBLEND_0, 32, "R200_PP_AFS_1"},
+	{R200_PP_TFACTOR_0, 8, "R200_ATF_TFACTOR"},
+	{R200_PP_TXFILTER_0, 8, "R200_PP_TXCTLALL_0"},
+	{R200_PP_TXFILTER_1, 8, "R200_PP_TXCTLALL_1"},
+	{R200_PP_TXFILTER_2, 8, "R200_PP_TXCTLALL_2"},
+	{R200_PP_TXFILTER_3, 8, "R200_PP_TXCTLALL_3"},
+	{R200_PP_TXFILTER_4, 8, "R200_PP_TXCTLALL_4"},
+	{R200_PP_TXFILTER_5, 8, "R200_PP_TXCTLALL_5"},
+	{R200_VAP_PVS_CNTL_1, 2, "R200_VAP_PVS_CNTL"},
+};
+
 /* =============================================================
  * State initialization
  */
-
-void r200PrintDirty( r200ContextPtr rmesa, const char *msg )
+static int cmdpkt( r200ContextPtr rmesa, int id ) 
 {
-   struct r200_state_atom *l;
-
-   fprintf(stderr, msg);
-   fprintf(stderr, ": ");
+   drm_radeon_cmd_header_t h;
 
-   foreach(l, &rmesa->hw.atomlist) {
-      if (l->dirty || rmesa->hw.all_dirty)
-	 fprintf(stderr, "%s, ", l->name);
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+     return CP_PACKET0(packet[id].start, packet[id].len - 1);
+   } else {
+     h.i = 0;
+     h.packet.cmd_type = RADEON_CMD_PACKET;
+     h.packet.packet_id = id;
    }
-
-   fprintf(stderr, "\n");
-}
-
-static int cmdpkt( int id ) 
-{
-   drm_radeon_cmd_header_t h;
-   h.i = 0;
-   h.packet.cmd_type = RADEON_CMD_PACKET;
-   h.packet.packet_id = id;
    return h.i;
 }
 
@@ -126,151 +227,607 @@ static int cmdscl2( int offset, int stride, int count )
    return h.i;
 }
 
-#define CHECK( NM, FLAG )				\
-static GLboolean check_##NM( GLcontext *ctx, int idx )	\
+/**
+ * Check functions report whether a state atom is active.
+ * An active atom returns its maximum emit size; an inactive one returns 0.
+ */
+#define CHECK( NM, FLAG, ADD )				\
+static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom) \
 {							\
    r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
-   (void) idx;						\
    (void) rmesa;					\
-   return FLAG;						\
+   return (FLAG) ? atom->cmd_size + (ADD) : 0;			\
 }
 
-#define TCL_CHECK( NM, FLAG )				\
-static GLboolean check_##NM( GLcontext *ctx, int idx )	\
-{							\
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
-   (void) idx;						\
-   return !rmesa->TclFallback && !ctx->VertexProgram._Enabled && (FLAG);	\
+#define TCL_CHECK( NM, FLAG, ADD )				\
+static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom) \
+{									\
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);				\
+   return (!rmesa->radeon.TclFallback && !ctx->VertexProgram._Enabled && (FLAG)) ? atom->cmd_size + (ADD) : 0; \
 }
 
-#define TCL_OR_VP_CHECK( NM, FLAG )			\
-static GLboolean check_##NM( GLcontext *ctx, int idx )	\
+#define TCL_OR_VP_CHECK( NM, FLAG, ADD )			\
+static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom ) \
 {							\
    r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
-   (void) idx;						\
-   return !rmesa->TclFallback && (FLAG);		\
+   return (!rmesa->radeon.TclFallback && (FLAG)) ? atom->cmd_size + (ADD) : 0;	\
 }
 
-#define VP_CHECK( NM, FLAG )				\
-static GLboolean check_##NM( GLcontext *ctx, int idx )	\
-{							\
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
-   (void) idx;						\
-   return !rmesa->TclFallback && ctx->VertexProgram._Enabled && (FLAG);		\
+#define VP_CHECK( NM, FLAG, ADD )				\
+static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom ) \
+{									\
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);				\
+   (void) atom;								\
+   return (!rmesa->radeon.TclFallback && ctx->VertexProgram._Enabled && (FLAG)) ? atom->cmd_size + (ADD) : 0; \
+}
+
+CHECK( always, GL_TRUE, 0 )
+CHECK( always_add4, GL_TRUE, 4 )
+CHECK( never, GL_FALSE, 0 )
+CHECK( tex_any, ctx->Texture._EnabledUnits, 0 )
+CHECK( tf, (ctx->Texture._EnabledUnits && !ctx->ATIFragmentShader._Enabled), 0 );
+CHECK( pix_zero, !ctx->ATIFragmentShader._Enabled, 0 )
+   CHECK( texenv, (rmesa->state.envneeded & (1 << (atom->idx)) && !ctx->ATIFragmentShader._Enabled), 0 )
+CHECK( afs_pass1, (ctx->ATIFragmentShader._Enabled && (ctx->ATIFragmentShader.Current->NumPasses > 1)), 0 )
+CHECK( afs, ctx->ATIFragmentShader._Enabled, 0 )
+CHECK( tex_cube, rmesa->state.texture.unit[atom->idx].unitneeded & TEXTURE_CUBE_BIT, 3 + 3*5 - CUBE_STATE_SIZE )
+CHECK( tex_cube_cs, rmesa->state.texture.unit[atom->idx].unitneeded & TEXTURE_CUBE_BIT, 2 + 4*5 - CUBE_STATE_SIZE )
+TCL_CHECK( tcl_fog, ctx->Fog.Enabled, 0 )
+TCL_CHECK( tcl_fog_add4, ctx->Fog.Enabled, 4 )
+TCL_CHECK( tcl, GL_TRUE, 0 )
+TCL_CHECK( tcl_add8, GL_TRUE, 8 )
+TCL_CHECK( tcl_add4, GL_TRUE, 4 )
+TCL_CHECK( tcl_tex, rmesa->state.texture.unit[atom->idx].unitneeded, 0 )
+TCL_CHECK( tcl_lighting, ctx->Light.Enabled, 0 )
+TCL_CHECK( tcl_light, ctx->Light.Enabled && ctx->Light.Light[atom->idx].Enabled, 0 )
+TCL_CHECK( tcl_tex_add4, rmesa->state.texture.unit[atom->idx].unitneeded, 4 )
+TCL_CHECK( tcl_lighting_add4, ctx->Light.Enabled, 4 )
+TCL_CHECK( tcl_lighting_add6, ctx->Light.Enabled, 6 )
+TCL_CHECK( tcl_light_add8, ctx->Light.Enabled && ctx->Light.Light[atom->idx].Enabled, 8 )
+TCL_OR_VP_CHECK( tcl_ucp, (ctx->Transform.ClipPlanesEnabled & (1 << (atom->idx))), 0 )
+TCL_OR_VP_CHECK( tcl_ucp_add4, (ctx->Transform.ClipPlanesEnabled & (1 << (atom->idx))), 4 )
+TCL_OR_VP_CHECK( tcl_or_vp, GL_TRUE, 0 )
+TCL_OR_VP_CHECK( tcl_or_vp_add2, GL_TRUE, 2 )
+VP_CHECK( tcl_vp, GL_TRUE, 0 )
+VP_CHECK( tcl_vp_add4, GL_TRUE, 4 )
+VP_CHECK( tcl_vp_size, ctx->VertexProgram.Current->Base.NumNativeInstructions > 64, 0 )
+VP_CHECK( tcl_vpp_size, ctx->VertexProgram.Current->Base.NumNativeParameters > 96, 0 )
+VP_CHECK( tcl_vp_size_add4, ctx->VertexProgram.Current->Base.NumNativeInstructions > 64, 4 )
+VP_CHECK( tcl_vpp_size_add4, ctx->VertexProgram.Current->Base.NumNativeParameters > 96, 4 )
+
+#define OUT_VEC(hdr, data) do {			\
+    drm_radeon_cmd_header_t h;					\
+    h.i = hdr;								\
+    OUT_BATCH(CP_PACKET0(RADEON_SE_TCL_STATE_FLUSH, 0));		\
+    OUT_BATCH(0);							\
+    OUT_BATCH(CP_PACKET0(R200_SE_TCL_VECTOR_INDX_REG, 0));		\
+    OUT_BATCH(h.vectors.offset | (h.vectors.stride << RADEON_VEC_INDX_OCTWORD_STRIDE_SHIFT)); \
+    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_VECTOR_DATA_REG, h.vectors.count - 1));	\
+    OUT_BATCH_TABLE((data), h.vectors.count);				\
+  } while(0)
+
+#define OUT_VECLINEAR(hdr, data) do {					\
+    drm_radeon_cmd_header_t h;						\
+    uint32_t _start, _sz;						\
+    h.i = hdr;								\
+    _start = h.veclinear.addr_lo | (h.veclinear.addr_hi << 8);		\
+    _sz = h.veclinear.count * 4;					\
+    if (r200->radeon.radeonScreen->kernel_mm && _sz) { \
+    BEGIN_BATCH_NO_AUTOSTATE(dwords); \
+    OUT_BATCH(CP_PACKET0(RADEON_SE_TCL_STATE_FLUSH, 0));		\
+    OUT_BATCH(0);							\
+    OUT_BATCH(CP_PACKET0(R200_SE_TCL_VECTOR_INDX_REG, 0));		\
+    OUT_BATCH(_start | (1 << RADEON_VEC_INDX_OCTWORD_STRIDE_SHIFT));	\
+    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_VECTOR_DATA_REG, _sz - 1));	\
+    OUT_BATCH_TABLE((data), _sz);					\
+    END_BATCH(); \
+    } \
+  } while(0)
+
+#define OUT_SCL(hdr, data) do {					\
+    drm_radeon_cmd_header_t h;						\
+    h.i = hdr;								\
+    OUT_BATCH(CP_PACKET0(R200_SE_TCL_SCALAR_INDX_REG, 0));		\
+    OUT_BATCH((h.scalars.offset) | (h.scalars.stride << RADEON_SCAL_INDX_DWORD_STRIDE_SHIFT)); \
+    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_SCALAR_DATA_REG, h.scalars.count - 1));	\
+    OUT_BATCH_TABLE((data), h.scalars.count);				\
+  } while(0)
+
+#define OUT_SCL2(hdr, data) do {					\
+    drm_radeon_cmd_header_t h;						\
+    h.i = hdr;								\
+    OUT_BATCH(CP_PACKET0(R200_SE_TCL_SCALAR_INDX_REG, 0));		\
+    OUT_BATCH((h.scalars.offset + 0x100) | (h.scalars.stride << RADEON_SCAL_INDX_DWORD_STRIDE_SHIFT)); \
+    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_SCALAR_DATA_REG, h.scalars.count - 1));	\
+    OUT_BATCH_TABLE((data), h.scalars.count);				\
+  } while(0)
+static int check_rrb(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   struct radeon_renderbuffer *rrb;
+   rrb = radeon_get_colorbuffer(&r200->radeon);
+   if (!rrb || !rrb->bo)
+      return 0;
+   return atom->cmd_size;
+}
+
+static void mtl_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->check(ctx, atom);
+
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_VEC(atom->cmd[MTL_CMD_0], (atom->cmd+1));
+   OUT_SCL2(atom->cmd[MTL_CMD_1], (atom->cmd + 18));
+   END_BATCH();
 }
 
+static void lit_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->check(ctx, atom);
+
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_VEC(atom->cmd[LIT_CMD_0], atom->cmd+1);
+   OUT_VEC(atom->cmd[LIT_CMD_1], atom->cmd+LIT_CMD_1+1);
+   END_BATCH();
+}
 
-CHECK( always, GL_TRUE )
-CHECK( never, GL_FALSE )
-CHECK( tex_any, ctx->Texture._EnabledUnits )
-CHECK( tf, (ctx->Texture._EnabledUnits && !ctx->ATIFragmentShader._Enabled) );
-CHECK( tex_pair, (rmesa->state.texture.unit[idx].unitneeded | rmesa->state.texture.unit[idx & ~1].unitneeded) )
-CHECK( tex, rmesa->state.texture.unit[idx].unitneeded )
-CHECK( pix_zero, !ctx->ATIFragmentShader._Enabled )
-CHECK( texenv, (rmesa->state.envneeded & (1 << idx) && !ctx->ATIFragmentShader._Enabled) )
-CHECK( afs_pass1, (ctx->ATIFragmentShader._Enabled && (ctx->ATIFragmentShader.Current->NumPasses > 1)) )
-CHECK( afs, ctx->ATIFragmentShader._Enabled )
-CHECK( tex_cube, rmesa->state.texture.unit[idx].unitneeded & TEXTURE_CUBE_BIT )
-TCL_CHECK( tcl_fog, ctx->Fog.Enabled )
-TCL_CHECK( tcl, GL_TRUE )
-TCL_CHECK( tcl_tex, rmesa->state.texture.unit[idx].unitneeded )
-TCL_CHECK( tcl_lighting, ctx->Light.Enabled )
-TCL_CHECK( tcl_light, ctx->Light.Enabled && ctx->Light.Light[idx].Enabled )
-TCL_OR_VP_CHECK( tcl_ucp, (ctx->Transform.ClipPlanesEnabled & (1 << idx)) )
-TCL_OR_VP_CHECK( tcl_or_vp, GL_TRUE )
-VP_CHECK( tcl_vp, GL_TRUE )
-VP_CHECK( tcl_vp_size, ctx->VertexProgram.Current->Base.NumNativeInstructions > 64 )
-VP_CHECK( tcl_vpp_size, ctx->VertexProgram.Current->Base.NumNativeParameters > 96 )
+static void ptp_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->check(ctx, atom);
+
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_VEC(atom->cmd[PTP_CMD_0], atom->cmd+1);
+   OUT_VEC(atom->cmd[PTP_CMD_1], atom->cmd+PTP_CMD_1+1);
+   END_BATCH();
+}
 
+static void veclinear_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->check(ctx, atom);
 
-/* Initialize the context's hardware state.
- */
-void r200InitState( r200ContextPtr rmesa )
+   OUT_VECLINEAR(atom->cmd[0], atom->cmd+1);
+}
+
+static void scl_emit(GLcontext *ctx, struct radeon_state_atom *atom)
 {
-   GLcontext *ctx = rmesa->glCtx;
-   GLuint color_fmt, depth_fmt, i;
-   GLint drawPitch, drawOffset;
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->check(ctx, atom);
 
-   switch ( rmesa->r200Screen->cpp ) {
-   case 2:
-      color_fmt = R200_COLOR_FORMAT_RGB565;
-      break;
-   case 4:
-      color_fmt = R200_COLOR_FORMAT_ARGB8888;
-      break;
-   default:
-      fprintf( stderr, "Error: Unsupported pixel depth... exiting\n" );
-      exit( -1 );
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_SCL(atom->cmd[0], atom->cmd+1);
+   END_BATCH();
+}
+
+
+static void vec_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->check(ctx, atom);
+
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_VEC(atom->cmd[0], atom->cmd+1);
+   END_BATCH();
+}
+
+static void ctx_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   struct radeon_renderbuffer *rrb;
+   uint32_t cbpitch;
+   uint32_t zbpitch, depth_fmt;
+   uint32_t dwords = atom->check(ctx, atom);
+
+   /* emit the first 5 entries of atom->cmd as-is */
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_BATCH_TABLE(atom->cmd, 5);
+
+   rrb = radeon_get_depthbuffer(&r200->radeon);
+   if (!rrb) {
+     OUT_BATCH(0);
+     OUT_BATCH(0);
+   } else {
+     zbpitch = (rrb->pitch / rrb->cpp);
+     if (r200->using_hyperz)
+       zbpitch |= RADEON_DEPTH_HYPERZ;
+     OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+     OUT_BATCH(zbpitch);
+     if (rrb->cpp == 4) 
+       depth_fmt = RADEON_DEPTH_FORMAT_24BIT_INT_Z; 
+     else 
+       depth_fmt = RADEON_DEPTH_FORMAT_16BIT_INT_Z; 
+     atom->cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_DEPTH_FORMAT_MASK; 
+     atom->cmd[CTX_RB3D_ZSTENCILCNTL] |= depth_fmt; 
+   }
+     
+   OUT_BATCH(atom->cmd[CTX_RB3D_ZSTENCILCNTL]);
+   OUT_BATCH(atom->cmd[CTX_CMD_1]);
+   OUT_BATCH(atom->cmd[CTX_PP_CNTL]);
+
+   rrb = radeon_get_colorbuffer(&r200->radeon);
+   if (!rrb || !rrb->bo) {
+     OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]);
+     OUT_BATCH(atom->cmd[CTX_RB3D_COLOROFFSET]);
+   } else {
+     atom->cmd[CTX_RB3D_CNTL] &= ~(0xf << 10); 
+     if (rrb->cpp == 4) 
+       atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888; 
+     else 
+       atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565; 
+ 
+     OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]); 
+     OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
    }
 
-   rmesa->state.color.clear = 0x00000000;
+   OUT_BATCH(atom->cmd[CTX_CMD_2]);
 
-   switch ( ctx->Visual.depthBits ) {
-   case 16:
-      rmesa->state.depth.clear = 0x0000ffff;
-      rmesa->state.depth.scale = 1.0 / (GLfloat)0xffff;
-      depth_fmt = R200_DEPTH_FORMAT_16BIT_INT_Z;
-      rmesa->state.stencil.clear = 0x00000000;
-      break;
-   case 24:
-      rmesa->state.depth.clear = 0x00ffffff;
-      rmesa->state.depth.scale = 1.0 / (GLfloat)0xffffff;
-      depth_fmt = R200_DEPTH_FORMAT_24BIT_INT_Z;
-      rmesa->state.stencil.clear = 0xffff0000;
-      break;
-   default:
-      fprintf( stderr, "Error: Unsupported depth %d... exiting\n",
-	       ctx->Visual.depthBits );
-      exit( -1 );
+   if (!rrb || !rrb->bo) {
+     OUT_BATCH(atom->cmd[CTX_RB3D_COLORPITCH]);
+   } else {
+     cbpitch = (rrb->pitch / rrb->cpp);
+     if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
+       cbpitch |= R200_COLOR_TILE_ENABLE;
+     OUT_BATCH(cbpitch);
+   }
+
+   if (atom->cmd_size == CTX_STATE_SIZE_NEWDRM)
+     OUT_BATCH_TABLE((atom->cmd + 14), 4);
+
+   END_BATCH();
+}
+
+/**
+ * Size callback for the context atom; r200InitState installs it as
+ * hw.ctx.check when radeonScreen->kernel_mm is set.
+ *
+ * Returns 0 when there is no color renderbuffer or it has no buffer
+ * object.  Otherwise returns the number of dwords to reserve: a base of
+ * 10, plus 6 when a depth renderbuffer exists, plus 8 for the color
+ * buffer, plus 4 when atom->cmd_size equals CTX_STATE_SIZE_NEWDRM.
+ */
+static int check_always_ctx( GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   struct radeon_renderbuffer *rrb, *drb;
+   uint32_t dwords;
+
+   rrb = radeon_get_colorbuffer(&r200->radeon);
+   if (!rrb || !rrb->bo) {
+      return 0;
    }
 
-   /* Only have hw stencil when depth buffer is 24 bits deep */
-   rmesa->state.stencil.hwBuffer = ( ctx->Visual.stencilBits > 0 &&
-				     ctx->Visual.depthBits == 24 );
+   drb = radeon_get_depthbuffer(&r200->radeon);
 
-   rmesa->Fallback = 0;
+   /* these counts mirror what ctx_emit_cs writes for each section */
+   dwords = 10;
+   if (drb)
+     dwords += 6;
+   /* rrb cannot be NULL here because of the early return above */
+   if (rrb)
+     dwords += 8;
+   if (atom->cmd_size == CTX_STATE_SIZE_NEWDRM)
+     dwords += 4;
 
-   if ( ctx->Visual.doubleBufferMode && rmesa->sarea->pfCurrentPage == 0 ) {
-      drawOffset = rmesa->r200Screen->backOffset;
-      drawPitch  = rmesa->r200Screen->backPitch;
-   } else {
-      drawOffset = rmesa->r200Screen->frontOffset;
-      drawPitch  = rmesa->r200Screen->frontPitch;
+
+   return dwords;
+}
+
+/**
+ * Emit the context atom; r200InitState installs this as hw.ctx.emit
+ * when radeonScreen->kernel_mm is set.
+ *
+ * Nothing is emitted when there is no color renderbuffer or it has no
+ * buffer object.  Otherwise the color format field of CTX_RB3D_CNTL
+ * and, if a depth renderbuffer exists, the depth format field of
+ * CTX_RB3D_ZSTENCILCNTL are rewritten in atom->cmd, and the registers
+ * are written out under separate PACKET0 headers with the buffer
+ * offsets emitted as relocations.  The batch is sized by atom->check().
+ */
+static void ctx_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   struct radeon_renderbuffer *rrb, *drb;
+   uint32_t cbpitch = 0;
+   uint32_t zbpitch = 0;
+   uint32_t dwords = atom->check(ctx, atom);
+   uint32_t depth_fmt;
+
+   rrb = radeon_get_colorbuffer(&r200->radeon);
+   if (!rrb || !rrb->bo) {
+      return;
    }
-#if 000
-   if ( ctx->Visual.doubleBufferMode && rmesa->sarea->pfCurrentPage == 0 ) {
-      rmesa->state.color.drawOffset = rmesa->r200Screen->backOffset;
-      rmesa->state.color.drawPitch  = rmesa->r200Screen->backPitch;
+
+   /* clear the 4-bit field at bit 10, then OR in the color format */
+   atom->cmd[CTX_RB3D_CNTL] &= ~(0xf << 10);
+   if (rrb->cpp == 4)
+	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888;
+   /* no default case: any other _ActualFormat leaves the field cleared */
+   else switch (rrb->base._ActualFormat) {
+   case GL_RGB5:
+	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565;
+	break;
+   case GL_RGBA4:
+	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB4444;
+	break;
+   case GL_RGB5_A1:
+	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB1555;
+	break;
+   }
+
+   /* color pitch is rrb->pitch / rrb->cpp, with the tile bit OR-ed in */
+   cbpitch = (rrb->pitch / rrb->cpp);
+   if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
+       cbpitch |= R200_COLOR_TILE_ENABLE;
+
+   drb = radeon_get_depthbuffer(&r200->radeon);
+   if (drb) {
+     zbpitch = (drb->pitch / drb->cpp);
+     /* cpp == 4 selects the 24-bit depth format, anything else 16-bit */
+     if (drb->cpp == 4)
+        depth_fmt = RADEON_DEPTH_FORMAT_24BIT_INT_Z;
+     else
+        depth_fmt = RADEON_DEPTH_FORMAT_16BIT_INT_Z;
+     atom->cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_DEPTH_FORMAT_MASK;
+     atom->cmd[CTX_RB3D_ZSTENCILCNTL] |= depth_fmt;
+   }
+
+   /* output the first part of the context: one header plus 4 dwords */
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+
+   /* In the CS case we need to split this up */
+   OUT_BATCH(CP_PACKET0(packet[0].start, 3));
+   OUT_BATCH_TABLE((atom->cmd + 1), 4);
+
+   /* depth offset (relocation) and pitch are only written with a depth buffer */
+   if (drb) {
+     OUT_BATCH(CP_PACKET0(RADEON_RB3D_DEPTHOFFSET, 0));
+     OUT_BATCH_RELOC(0, drb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+
+     OUT_BATCH(CP_PACKET0(RADEON_RB3D_DEPTHPITCH, 0));
+     OUT_BATCH(zbpitch);
+   }
+
+   OUT_BATCH(CP_PACKET0(RADEON_RB3D_ZSTENCILCNTL, 0));
+   OUT_BATCH(atom->cmd[CTX_RB3D_ZSTENCILCNTL]);
+   OUT_BATCH(CP_PACKET0(RADEON_PP_CNTL, 1));
+   OUT_BATCH(atom->cmd[CTX_PP_CNTL]);
+   OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]);
+
+
+   /* always true here: rrb was checked right after it was fetched */
+   if (rrb) {
+     OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLOROFFSET, 0));
+     OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+
+     OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLORPITCH, 0));
+     OUT_BATCH_RELOC(cbpitch, rrb->bo, cbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+   }
+
+   /* the NEWDRM layout carries 4 more dwords starting at atom->cmd[14] */
+   if (atom->cmd_size == CTX_STATE_SIZE_NEWDRM) {
+     OUT_BATCH_TABLE((atom->cmd + 14), 4);
+   }
+
+   END_BATCH();
+}
+
+/**
+ * Number of dwords needed to emit a texture atom with tex_emit.
+ *
+ * The result is atom->cmd_size, plus two more dwords when the unit's
+ * texture object has a miptree and no image override.
+ */
+static int get_tex_size(GLcontext* ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   radeonTexObj *texobj = r200->state.texture.unit[atom->idx].texobj;
+   uint32_t size = atom->cmd_size;
+
+   /* same condition under which tex_emit uses OUT_BATCH_RELOC */
+   if (texobj && texobj->mt && !texobj->image_override)
+      size += 2;
+
+   return size;
+}
+
+/**
+ * Size callback for a texture unit that is emitted together with its
+ * neighbour: returns 0 only when neither this unit nor the unit with
+ * the lowest index bit flipped is needed, else get_tex_size().
+ */
+static int check_tex_pair(GLcontext* ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   /* flipping bit 0 of the index with XOR gives the paired unit */
+   const int self = atom->idx;
+   const int partner = atom->idx ^ 1;
+
+   if ((r200->state.texture.unit[self].unitneeded |
+        r200->state.texture.unit[partner].unitneeded) == 0) {
+      return 0;
+   }
+
+   return get_tex_size(ctx, atom);
+}
+
+/**
+ * Size callback for a single texture unit: get_tex_size() when the
+ * unit is needed, 0 otherwise.
+ */
+static int check_tex(GLcontext* ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+
+   if (r200->state.texture.unit[atom->idx].unitneeded) {
+      return get_tex_size(ctx, atom);
+   }
+
+   return 0;
+}
+
+
+/**
+ * Emit a texture unit atom; r200InitState installs this as
+ * hw.tex[i].emit when radeonScreen->kernel_mm is not set.
+ *
+ * Writes the first 10 dwords of atom->cmd followed by the texture
+ * offset, which is one of: a relocation against the miptree buffer
+ * object, the local texture heap offset when the unit has no texture
+ * object, or the texture object's override_offset.
+ * The batch is sized by atom->check().
+ */
+static void tex_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->check(ctx, atom);
+   int i = atom->idx;
+   radeonTexObj *t = r200->state.texture.unit[i].texobj;
+
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   /* is this ok even with drm older than 1.18? */
+   OUT_BATCH_TABLE(atom->cmd, 10);
+
+   /* same condition get_tex_size uses to reserve two extra dwords */
+   if (t && t->mt && !t->image_override) {
+     OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+		  RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+   } else if (!t) {
+     /* workaround for old CS mechanism */
+     OUT_BATCH(r200->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP]);
    } else {
-      rmesa->state.color.drawOffset = rmesa->r200Screen->frontOffset;
-      rmesa->state.color.drawPitch  = rmesa->r200Screen->frontPitch;
+     /* texture object without a miptree, or with an image override */
+     OUT_BATCH(t->override_offset);
    }
 
-   rmesa->state.pixel.readOffset = rmesa->state.color.drawOffset;
-   rmesa->state.pixel.readPitch  = rmesa->state.color.drawPitch;
-#endif
+   END_BATCH();
+}
+
+/**
+ * Number of dwords needed to emit a texture atom with tex_emit_mm.
+ *
+ * Starts from atom->cmd_size + 2 and drops 4 dwords when the unit has
+ * no texture object, or the object has neither a miptree nor a buffer
+ * object.
+ */
+static int get_tex_mm_size(GLcontext* ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   radeonTexObj *texobj = r200->state.texture.unit[atom->idx].texobj;
+   uint32_t size = atom->cmd_size + 2;
+
+   /* nothing to point the offset at: leave the offset packet out */
+   if (!texobj || (!texobj->mt && !texobj->bo))
+      size -= 4;
+
+   return size;
+}
+
+/**
+ * Size callback for a texture unit that is emitted together with its
+ * neighbour: returns 0 only when neither this unit nor the unit with
+ * the lowest index bit flipped is needed, else get_tex_mm_size().
+ */
+static int check_tex_pair_mm(GLcontext* ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   /* flipping bit 0 of the index with XOR gives the paired unit */
+   const int self = atom->idx;
+   const int partner = atom->idx ^ 1;
+
+   if ((r200->state.texture.unit[self].unitneeded |
+        r200->state.texture.unit[partner].unitneeded) == 0) {
+      return 0;
+   }
+
+   return get_tex_mm_size(ctx, atom);
+}
 
-   rmesa->hw.max_state_size = 0;
+/**
+ * Size callback for a single texture unit: get_tex_mm_size() when the
+ * unit is needed, 0 otherwise.
+ */
+static int check_tex_mm(GLcontext* ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+
+   if (r200->state.texture.unit[atom->idx].unitneeded) {
+      return get_tex_mm_size(ctx, atom);
+   }
+
+   return 0;
+}
+
+
+/**
+ * Emit a texture unit atom; r200InitState installs this as
+ * hw.tex[i].emit when radeonScreen->kernel_mm is set.
+ *
+ * Always writes a PACKET0 header for R200_PP_TXFILTER_0 + 32 * unit
+ * followed by 8 dwords from atom->cmd + 1.  When the batch size still
+ * exceeds atom->cmd_size, a second PACKET0 for R200_PP_TXOFFSET_0 +
+ * 24 * unit follows, carrying a relocation against the miptree buffer
+ * object or, failing that, the texture object's own buffer object.
+ *
+ * The batch is sized by atom->check().  A unit that is not itself
+ * needed (it is only emitted because of its pair) does not get the
+ * offset packet, so its 4 dwords are removed from the size -- but only
+ * if the check callback counted them.  Subtracting unconditionally
+ * would shrink the batch below the 9 dwords that are always written
+ * when the callback had already left the offset packet out.
+ */
+static void tex_emit_mm(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->check(ctx, atom);
+   int i = atom->idx;
+   radeonTexObj *t = r200->state.texture.unit[i].texobj;
+
+   /* drop the offset packet only when it is still part of the count */
+   if (!r200->state.texture.unit[i].unitneeded && dwords > atom->cmd_size)
+      dwords -= 4;
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+
+   OUT_BATCH(CP_PACKET0(R200_PP_TXFILTER_0 + (32 * i), 7));
+   OUT_BATCH_TABLE((atom->cmd + 1), 8);
+
+   /* dwords > cmd_size implies get_tex_mm_size saw a texture object
+    * with a miptree or a buffer object, so t is non-NULL here */
+   if (dwords > atom->cmd_size) {
+      OUT_BATCH(CP_PACKET0(R200_PP_TXOFFSET_0 + (24 * i), 0));
+      if (t->mt && !t->image_override) {
+         OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+                         RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+      } else {
+         /* NOTE(review): if t->bo is NULL here the header above has no
+          * payload; presumably image_override always comes with a bo --
+          * confirm against the code that sets image_override */
+         if (t->bo)
+            OUT_BATCH_RELOC(t->tile_bits, t->bo, 0,
+                            RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+      }
+   }
+   END_BATCH();
+}
+
+
+/**
+ * Emit a cube map atom; r200InitState installs this as
+ * hw.cube[i].emit when radeonScreen->kernel_mm is not set.
+ *
+ * The first two dwords of atom->cmd are always written.  When the unit
+ * has a texture object without an image override, atom->cmd[2] and one
+ * relocation per face 1..5 of miptree level 0 follow; otherwise the
+ * batch is limited to the two leading dwords.
+ */
+static void cube_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->check(ctx, atom);
+   radeonTexObj *texobj = r200->state.texture.unit[atom->idx].texobj;
+   const int emit_faces = texobj && !texobj->image_override;
+   radeon_mipmap_level *level0;
+   int face;
+
+   if (!emit_faces) {
+      dwords = 2;
+   }
+
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   /* XXX that size won't really match with image_override... */
+   OUT_BATCH_TABLE(atom->cmd, 2);
+
+   if (emit_faces) {
+      level0 = &texobj->mt->levels[0];
+      OUT_BATCH_TABLE((atom->cmd + 2), 1);
+      /* face 0 is not emitted here, only faces 1 through 5 */
+      for (face = 1; face < 6; face++) {
+         OUT_BATCH_RELOC(level0->faces[face].offset, texobj->mt->bo,
+                         level0->faces[face].offset,
+                         RADEON_GEM_DOMAIN_VRAM, 0, 0);
+      }
+   }
+   END_BATCH();
+}
+
+/**
+ * Emit a cube map atom; r200InitState installs this as
+ * hw.cube[i].emit when radeonScreen->kernel_mm is set.
+ *
+ * The first two dwords of atom->cmd are always written.  When the unit
+ * has a texture object without an image override, each of faces 1..5
+ * of miptree level 0 gets its own PACKET0 header for the matching
+ * R200_PP_CUBIC_OFFSET register followed by a relocation; otherwise
+ * the batch is limited to the two leading dwords.
+ */
+static void cube_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->check(ctx, atom);
+   const int unit = atom->idx;
+   radeonTexObj *texobj = r200->state.texture.unit[unit].texobj;
+   const int emit_faces = texobj && !texobj->image_override;
+   radeon_mipmap_level *level0;
+   int face;
+
+   if (!emit_faces) {
+      dwords = 2;
+   }
+
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_BATCH_TABLE(atom->cmd, 2);
+
+   if (emit_faces) {
+      level0 = &texobj->mt->levels[0];
+      /* face 0 is not emitted here, only faces 1 through 5 */
+      for (face = 1; face < 6; face++) {
+         OUT_BATCH(CP_PACKET0(R200_PP_CUBIC_OFFSET_F1_0 + (24*unit) + (4 * (face-1)), 0));
+         OUT_BATCH_RELOC(level0->faces[face].offset, texobj->mt->bo,
+                         level0->faces[face].offset,
+                         RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+      }
+   }
+   END_BATCH();
+}
+
+/* Initialize the context's hardware state.
+ */
+void r200InitState( r200ContextPtr rmesa )
+{
+   GLcontext *ctx = rmesa->radeon.glCtx;
+   GLuint i;
+
+   rmesa->radeon.state.color.clear = 0x00000000;
+
+   switch ( ctx->Visual.depthBits ) {
+   case 16:
+      rmesa->radeon.state.depth.clear = 0x0000ffff;
+      rmesa->radeon.state.stencil.clear = 0x00000000;
+      break;
+   case 24:
+   default:
+      rmesa->radeon.state.depth.clear = 0x00ffffff;
+      rmesa->radeon.state.stencil.clear = 0xffff0000;
+      break;
+   }
+
+   rmesa->radeon.Fallback = 0;
+
+   rmesa->radeon.hw.max_state_size = 0;
 
 #define ALLOC_STATE( ATOM, CHK, SZ, NM, IDX )				\
    do {								\
       rmesa->hw.ATOM.cmd_size = SZ;				\
-      rmesa->hw.ATOM.cmd = (int *)CALLOC(SZ * sizeof(int));	\
-      rmesa->hw.ATOM.lastcmd = (int *)CALLOC(SZ * sizeof(int));	\
+      rmesa->hw.ATOM.cmd = (GLuint *)CALLOC(SZ * sizeof(int));	\
+      rmesa->hw.ATOM.lastcmd = (GLuint *)CALLOC(SZ * sizeof(int));	\
       rmesa->hw.ATOM.name = NM;					\
       rmesa->hw.ATOM.idx = IDX;					\
-      rmesa->hw.ATOM.check = check_##CHK;			\
+      if (check_##CHK != check_never) {				\
+         rmesa->hw.ATOM.check = check_##CHK;			\
+         rmesa->radeon.hw.max_state_size += SZ * sizeof(int);	\
+      } else {							\
+         rmesa->hw.ATOM.check = NULL;				\
+      }								\
       rmesa->hw.ATOM.dirty = GL_FALSE;				\
-      rmesa->hw.max_state_size += SZ * sizeof(int);		\
    } while (0)
 
 
    /* Allocate state buffers:
     */
-   if (rmesa->r200Screen->drmSupportsBlendColor)
-      ALLOC_STATE( ctx, always, CTX_STATE_SIZE_NEWDRM, "CTX/context", 0 );
+   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor)
+      ALLOC_STATE( ctx, always_add4, CTX_STATE_SIZE_NEWDRM, "CTX/context", 0 );
+   else
+      ALLOC_STATE( ctx, always_add4, CTX_STATE_SIZE_OLDDRM, "CTX/context", 0 );
+
+   if (rmesa->radeon.radeonScreen->kernel_mm)
+   {
+     rmesa->hw.ctx.emit = ctx_emit_cs;
+     rmesa->hw.ctx.check = check_always_ctx;
+   }
    else
-      ALLOC_STATE( ctx, always, CTX_STATE_SIZE_OLDDRM, "CTX/context", 0 );
+   {
+     rmesa->hw.ctx.emit = ctx_emit;
+   }
    ALLOC_STATE( set, always, SET_STATE_SIZE, "SET/setup", 0 );
    ALLOC_STATE( lin, always, LIN_STATE_SIZE, "LIN/line", 0 );
    ALLOC_STATE( msk, always, MSK_STATE_SIZE, "MSK/mask", 0 );
@@ -282,52 +839,75 @@ void r200InitState( r200ContextPtr rmesa )
    ALLOC_STATE( cst, always, CST_STATE_SIZE, "CST/constant", 0 );
    ALLOC_STATE( zbs, always, ZBS_STATE_SIZE, "ZBS/zbias", 0 );
    ALLOC_STATE( tf, tf, TF_STATE_SIZE, "TF/tfactor", 0 );
-   if (rmesa->r200Screen->drmSupportsFragShader) {
-      if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200) {
-      /* make sure texture units 0/1 are emitted pair-wise for r200 t0 hang workaround */
-	 ALLOC_STATE( tex[0], tex_pair, TEX_STATE_SIZE_NEWDRM, "TEX/tex-0", 0 );
-	 ALLOC_STATE( tex[1], tex_pair, TEX_STATE_SIZE_NEWDRM, "TEX/tex-1", 1 );
-	 ALLOC_STATE( tam, tex_any, TAM_STATE_SIZE, "TAM/tam", 0 );
+   {
+      int state_size = TEX_STATE_SIZE_NEWDRM;
+      if (!rmesa->radeon.radeonScreen->drmSupportsFragShader) {
+         state_size = TEX_STATE_SIZE_OLDDRM;
       }
-      else {
-	 ALLOC_STATE( tex[0], tex, TEX_STATE_SIZE_NEWDRM, "TEX/tex-0", 0 );
-	 ALLOC_STATE( tex[1], tex, TEX_STATE_SIZE_NEWDRM, "TEX/tex-1", 1 );
-	 ALLOC_STATE( tam, never, TAM_STATE_SIZE, "TAM/tam", 0 );
+      if (rmesa->radeon.radeonScreen->drmSupportsFragShader) {
+         if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R200) {
+            /* make sure texture units 0/1 are emitted pair-wise for r200 t0 hang workaround */
+            ALLOC_STATE( tex[0], tex_pair_mm, state_size, "TEX/tex-0", 0 );
+            ALLOC_STATE( tex[1], tex_pair_mm, state_size, "TEX/tex-1", 1 );
+            ALLOC_STATE( tam, tex_any, TAM_STATE_SIZE, "TAM/tam", 0 );
+         }
+         else {
+            ALLOC_STATE( tex[0], tex_mm, state_size, "TEX/tex-0", 0 );
+            ALLOC_STATE( tex[1], tex_mm, state_size, "TEX/tex-1", 1 );
+            ALLOC_STATE( tam, never, TAM_STATE_SIZE, "TAM/tam", 0 );
+         }
+         ALLOC_STATE( tex[2], tex_mm, state_size, "TEX/tex-2", 2 );
+         ALLOC_STATE( tex[3], tex_mm, state_size, "TEX/tex-3", 3 );
+         ALLOC_STATE( tex[4], tex_mm, state_size, "TEX/tex-4", 4 );
+         ALLOC_STATE( tex[5], tex_mm, state_size, "TEX/tex-5", 5 );
+         if (!rmesa->radeon.radeonScreen->kernel_mm)
+         {
+            if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R200) {
+               rmesa->hw.tex[0].check = check_tex_pair;
+               rmesa->hw.tex[1].check = check_tex_pair;
+            } else {
+               rmesa->hw.tex[0].check = check_tex;
+               rmesa->hw.tex[1].check = check_tex;
+            }
+            rmesa->hw.tex[2].check = check_tex;
+            rmesa->hw.tex[3].check = check_tex;
+            rmesa->hw.tex[4].check = check_tex;
+            rmesa->hw.tex[5].check = check_tex;
+         }
+         if (rmesa->radeon.radeonScreen->drmSupportsFragShader) {
+            ALLOC_STATE( atf, afs, ATF_STATE_SIZE, "ATF/tfactor", 0 );
+            ALLOC_STATE( afs[0], afs_pass1, AFS_STATE_SIZE, "AFS/afsinst-0", 0 );
+            ALLOC_STATE( afs[1], afs, AFS_STATE_SIZE, "AFS/afsinst-1", 1 );
+         } else {
+            ALLOC_STATE( atf, never, ATF_STATE_SIZE, "ATF/tfactor", 0 );
+            ALLOC_STATE( afs[0], never, AFS_STATE_SIZE, "AFS/afsinst-0", 0 );
+            ALLOC_STATE( afs[1], never, AFS_STATE_SIZE, "AFS/afsinst-1", 1 );
+         }
       }
-      ALLOC_STATE( tex[2], tex, TEX_STATE_SIZE_NEWDRM, "TEX/tex-2", 2 );
-      ALLOC_STATE( tex[3], tex, TEX_STATE_SIZE_NEWDRM, "TEX/tex-3", 3 );
-      ALLOC_STATE( tex[4], tex, TEX_STATE_SIZE_NEWDRM, "TEX/tex-4", 4 );
-      ALLOC_STATE( tex[5], tex, TEX_STATE_SIZE_NEWDRM, "TEX/tex-5", 5 );
-      ALLOC_STATE( atf, afs, ATF_STATE_SIZE, "ATF/tfactor", 0 );
-      ALLOC_STATE( afs[0], afs_pass1, AFS_STATE_SIZE, "AFS/afsinst-0", 0 );
-      ALLOC_STATE( afs[1], afs, AFS_STATE_SIZE, "AFS/afsinst-1", 1 );
    }
-   else {
-      if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200) {
-	 ALLOC_STATE( tex[0], tex_pair, TEX_STATE_SIZE_OLDDRM, "TEX/tex-0", 0 );
-	 ALLOC_STATE( tex[1], tex_pair, TEX_STATE_SIZE_OLDDRM, "TEX/tex-1", 1 );
-	 ALLOC_STATE( tam, tex_any, TAM_STATE_SIZE, "TAM/tam", 0 );
-      }
-      else {
-	 ALLOC_STATE( tex[0], tex, TEX_STATE_SIZE_OLDDRM, "TEX/tex-0", 0 );
-	 ALLOC_STATE( tex[1], tex, TEX_STATE_SIZE_OLDDRM, "TEX/tex-1", 1 );
-	 ALLOC_STATE( tam, never, TAM_STATE_SIZE, "TAM/tam", 0 );
-      }
-      ALLOC_STATE( tex[2], tex, TEX_STATE_SIZE_OLDDRM, "TEX/tex-2", 2 );
-      ALLOC_STATE( tex[3], tex, TEX_STATE_SIZE_OLDDRM, "TEX/tex-3", 3 );
-      ALLOC_STATE( tex[4], tex, TEX_STATE_SIZE_OLDDRM, "TEX/tex-4", 4 );
-      ALLOC_STATE( tex[5], tex, TEX_STATE_SIZE_OLDDRM, "TEX/tex-5", 5 );
-      ALLOC_STATE( atf, never, ATF_STATE_SIZE, "TF/tfactor", 0 );
-      ALLOC_STATE( afs[0], never, AFS_STATE_SIZE, "AFS/afsinst-0", 0 );
-      ALLOC_STATE( afs[1], never, AFS_STATE_SIZE, "AFS/afsinst-1", 1 );
-   }
-   if (rmesa->r200Screen->drmSupportsCubeMapsR200) {
+   /* polygon stipple is done with irq for non-kms */
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+       ALLOC_STATE( stp, always, STP_STATE_SIZE, "STP/stp", 0 );
+   }
+
+   for (i = 0; i < 6; i++)
+      if (rmesa->radeon.radeonScreen->kernel_mm)
+          rmesa->hw.tex[i].emit = tex_emit_mm;
+      else
+          rmesa->hw.tex[i].emit = tex_emit;
+   if (rmesa->radeon.radeonScreen->drmSupportsCubeMapsR200) {
       ALLOC_STATE( cube[0], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-0", 0 );
       ALLOC_STATE( cube[1], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-1", 1 );
       ALLOC_STATE( cube[2], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-2", 2 );
       ALLOC_STATE( cube[3], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-3", 3 );
       ALLOC_STATE( cube[4], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-4", 4 );
       ALLOC_STATE( cube[5], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-5", 5 );
+      for (i = 0; i < 6; i++)
+          if (rmesa->radeon.radeonScreen->kernel_mm) {
+              rmesa->hw.cube[i].emit = cube_emit_cs;
+              rmesa->hw.cube[i].check = check_tex_cube_cs;
+          } else
+              rmesa->hw.cube[i].emit = cube_emit;
    }
    else {
       ALLOC_STATE( cube[0], never, CUBE_STATE_SIZE, "CUBE/tex-0", 0 );
@@ -337,12 +917,20 @@ void r200InitState( r200ContextPtr rmesa )
       ALLOC_STATE( cube[4], never, CUBE_STATE_SIZE, "CUBE/tex-4", 4 );
       ALLOC_STATE( cube[5], never, CUBE_STATE_SIZE, "CUBE/tex-5", 5 );
    }
-   if (rmesa->r200Screen->drmSupportsVertexProgram) {
+
+   if (rmesa->radeon.radeonScreen->drmSupportsVertexProgram) {
       ALLOC_STATE( pvs, tcl_vp, PVS_STATE_SIZE, "PVS/pvscntl", 0 );
-      ALLOC_STATE( vpi[0], tcl_vp, VPI_STATE_SIZE, "VP/vertexprog-0", 0 );
-      ALLOC_STATE( vpi[1], tcl_vp_size, VPI_STATE_SIZE, "VP/vertexprog-1", 1 );
-      ALLOC_STATE( vpp[0], tcl_vp, VPP_STATE_SIZE, "VPP/vertexparam-0", 0 );
-      ALLOC_STATE( vpp[1], tcl_vpp_size, VPP_STATE_SIZE, "VPP/vertexparam-1", 1 );
+      if (rmesa->radeon.radeonScreen->kernel_mm) {
+         ALLOC_STATE( vpi[0], tcl_vp_add4, VPI_STATE_SIZE, "VP/vertexprog-0", 0 );
+         ALLOC_STATE( vpi[1], tcl_vp_size_add4, VPI_STATE_SIZE, "VP/vertexprog-1", 1 );
+         ALLOC_STATE( vpp[0], tcl_vp_add4, VPP_STATE_SIZE, "VPP/vertexparam-0", 0 );
+         ALLOC_STATE( vpp[1], tcl_vpp_size_add4, VPP_STATE_SIZE, "VPP/vertexparam-1", 1 );
+      } else {
+         ALLOC_STATE( vpi[0], tcl_vp, VPI_STATE_SIZE, "VP/vertexprog-0", 0 );
+         ALLOC_STATE( vpi[1], tcl_vp_size, VPI_STATE_SIZE, "VP/vertexprog-1", 1 );
+         ALLOC_STATE( vpp[0], tcl_vp, VPP_STATE_SIZE, "VPP/vertexparam-0", 0 );
+         ALLOC_STATE( vpp[1], tcl_vpp_size, VPP_STATE_SIZE, "VPP/vertexparam-1", 1 );
+      }
    }
    else {
       ALLOC_STATE( pvs, never, PVS_STATE_SIZE, "PVS/pvscntl", 0 );
@@ -355,50 +943,87 @@ void r200InitState( r200ContextPtr rmesa )
    ALLOC_STATE( tcl, tcl_or_vp, TCL_STATE_SIZE, "TCL/tcl", 0 );
    ALLOC_STATE( msl, tcl, MSL_STATE_SIZE, "MSL/matrix-select", 0 );
    ALLOC_STATE( tcg, tcl, TCG_STATE_SIZE, "TCG/texcoordgen", 0 );
-   ALLOC_STATE( mtl[0], tcl_lighting, MTL_STATE_SIZE, "MTL0/material0", 0 );
-   ALLOC_STATE( mtl[1], tcl_lighting, MTL_STATE_SIZE, "MTL1/material1", 1 );
-   ALLOC_STATE( grd, tcl_or_vp, GRD_STATE_SIZE, "GRD/guard-band", 0 );
-   ALLOC_STATE( fog, tcl_fog, FOG_STATE_SIZE, "FOG/fog", 0 );
-   ALLOC_STATE( glt, tcl_lighting, GLT_STATE_SIZE, "GLT/light-global", 0 );
-   ALLOC_STATE( eye, tcl_lighting, EYE_STATE_SIZE, "EYE/eye-vector", 0 );
-   ALLOC_STATE( mat[R200_MTX_MV], tcl, MAT_STATE_SIZE, "MAT/modelview", 0 );
-   ALLOC_STATE( mat[R200_MTX_IMV], tcl, MAT_STATE_SIZE, "MAT/it-modelview", 0 );
-   ALLOC_STATE( mat[R200_MTX_MVP], tcl, MAT_STATE_SIZE, "MAT/modelproject", 0 );
-   ALLOC_STATE( mat[R200_MTX_TEX0], tcl_tex, MAT_STATE_SIZE, "MAT/texmat0", 0 );
-   ALLOC_STATE( mat[R200_MTX_TEX1], tcl_tex, MAT_STATE_SIZE, "MAT/texmat1", 1 );
-   ALLOC_STATE( mat[R200_MTX_TEX2], tcl_tex, MAT_STATE_SIZE, "MAT/texmat2", 2 );
-   ALLOC_STATE( mat[R200_MTX_TEX3], tcl_tex, MAT_STATE_SIZE, "MAT/texmat3", 3 );
-   ALLOC_STATE( mat[R200_MTX_TEX4], tcl_tex, MAT_STATE_SIZE, "MAT/texmat4", 4 );
-   ALLOC_STATE( mat[R200_MTX_TEX5], tcl_tex, MAT_STATE_SIZE, "MAT/texmat5", 5 );
-   ALLOC_STATE( ucp[0], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-0", 0 );
-   ALLOC_STATE( ucp[1], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-1", 1 );
-   ALLOC_STATE( ucp[2], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-2", 2 );
-   ALLOC_STATE( ucp[3], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-3", 3 );
-   ALLOC_STATE( ucp[4], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-4", 4 );
-   ALLOC_STATE( ucp[5], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-5", 5 );
-   ALLOC_STATE( lit[0], tcl_light, LIT_STATE_SIZE, "LIT/light-0", 0 );
-   ALLOC_STATE( lit[1], tcl_light, LIT_STATE_SIZE, "LIT/light-1", 1 );
-   ALLOC_STATE( lit[2], tcl_light, LIT_STATE_SIZE, "LIT/light-2", 2 );
-   ALLOC_STATE( lit[3], tcl_light, LIT_STATE_SIZE, "LIT/light-3", 3 );
-   ALLOC_STATE( lit[4], tcl_light, LIT_STATE_SIZE, "LIT/light-4", 4 );
-   ALLOC_STATE( lit[5], tcl_light, LIT_STATE_SIZE, "LIT/light-5", 5 );
-   ALLOC_STATE( lit[6], tcl_light, LIT_STATE_SIZE, "LIT/light-6", 6 );
-   ALLOC_STATE( lit[7], tcl_light, LIT_STATE_SIZE, "LIT/light-7", 7 );
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+      ALLOC_STATE( mtl[0], tcl_lighting_add6, MTL_STATE_SIZE, "MTL0/material0", 0 );
+      ALLOC_STATE( mtl[1], tcl_lighting_add6, MTL_STATE_SIZE, "MTL1/material1", 1 );
+      ALLOC_STATE( grd, tcl_or_vp_add2, GRD_STATE_SIZE, "GRD/guard-band", 0 );
+      ALLOC_STATE( fog, tcl_fog_add4, FOG_STATE_SIZE, "FOG/fog", 0 );
+      ALLOC_STATE( glt, tcl_lighting_add4, GLT_STATE_SIZE, "GLT/light-global", 0 );
+      ALLOC_STATE( eye, tcl_lighting_add4, EYE_STATE_SIZE, "EYE/eye-vector", 0 );
+      ALLOC_STATE( mat[R200_MTX_MV], tcl_add4, MAT_STATE_SIZE, "MAT/modelview", 0 );
+      ALLOC_STATE( mat[R200_MTX_IMV], tcl_add4, MAT_STATE_SIZE, "MAT/it-modelview", 0 );
+      ALLOC_STATE( mat[R200_MTX_MVP], tcl_add4, MAT_STATE_SIZE, "MAT/modelproject", 0 );
+      ALLOC_STATE( mat[R200_MTX_TEX0], tcl_tex_add4, MAT_STATE_SIZE, "MAT/texmat0", 0 );
+      ALLOC_STATE( mat[R200_MTX_TEX1], tcl_tex_add4, MAT_STATE_SIZE, "MAT/texmat1", 1 );
+      ALLOC_STATE( mat[R200_MTX_TEX2], tcl_tex_add4, MAT_STATE_SIZE, "MAT/texmat2", 2 );
+      ALLOC_STATE( mat[R200_MTX_TEX3], tcl_tex_add4, MAT_STATE_SIZE, "MAT/texmat3", 3 );
+      ALLOC_STATE( mat[R200_MTX_TEX4], tcl_tex_add4, MAT_STATE_SIZE, "MAT/texmat4", 4 );
+      ALLOC_STATE( mat[R200_MTX_TEX5], tcl_tex_add4, MAT_STATE_SIZE, "MAT/texmat5", 5 );
+      ALLOC_STATE( ucp[0], tcl_ucp_add4, UCP_STATE_SIZE, "UCP/userclip-0", 0 );
+      ALLOC_STATE( ucp[1], tcl_ucp_add4, UCP_STATE_SIZE, "UCP/userclip-1", 1 );
+      ALLOC_STATE( ucp[2], tcl_ucp_add4, UCP_STATE_SIZE, "UCP/userclip-2", 2 );
+      ALLOC_STATE( ucp[3], tcl_ucp_add4, UCP_STATE_SIZE, "UCP/userclip-3", 3 );
+      ALLOC_STATE( ucp[4], tcl_ucp_add4, UCP_STATE_SIZE, "UCP/userclip-4", 4 );
+      ALLOC_STATE( ucp[5], tcl_ucp_add4, UCP_STATE_SIZE, "UCP/userclip-5", 5 );
+      ALLOC_STATE( lit[0], tcl_light_add8, LIT_STATE_SIZE, "LIT/light-0", 0 );
+      ALLOC_STATE( lit[1], tcl_light_add8, LIT_STATE_SIZE, "LIT/light-1", 1 );
+      ALLOC_STATE( lit[2], tcl_light_add8, LIT_STATE_SIZE, "LIT/light-2", 2 );
+      ALLOC_STATE( lit[3], tcl_light_add8, LIT_STATE_SIZE, "LIT/light-3", 3 );
+      ALLOC_STATE( lit[4], tcl_light_add8, LIT_STATE_SIZE, "LIT/light-4", 4 );
+      ALLOC_STATE( lit[5], tcl_light_add8, LIT_STATE_SIZE, "LIT/light-5", 5 );
+      ALLOC_STATE( lit[6], tcl_light_add8, LIT_STATE_SIZE, "LIT/light-6", 6 );
+      ALLOC_STATE( lit[7], tcl_light_add8, LIT_STATE_SIZE, "LIT/light-7", 7 );
+      ALLOC_STATE( sci, rrb, SCI_STATE_SIZE, "SCI/scissor", 0 );
+   } else {
+      ALLOC_STATE( mtl[0], tcl_lighting, MTL_STATE_SIZE, "MTL0/material0", 0 );
+      ALLOC_STATE( mtl[1], tcl_lighting, MTL_STATE_SIZE, "MTL1/material1", 1 );
+      ALLOC_STATE( grd, tcl_or_vp, GRD_STATE_SIZE, "GRD/guard-band", 0 );
+      ALLOC_STATE( fog, tcl_fog, FOG_STATE_SIZE, "FOG/fog", 0 );
+      ALLOC_STATE( glt, tcl_lighting, GLT_STATE_SIZE, "GLT/light-global", 0 );
+      ALLOC_STATE( eye, tcl_lighting, EYE_STATE_SIZE, "EYE/eye-vector", 0 );
+      ALLOC_STATE( mat[R200_MTX_MV], tcl, MAT_STATE_SIZE, "MAT/modelview", 0 );
+      ALLOC_STATE( mat[R200_MTX_IMV], tcl, MAT_STATE_SIZE, "MAT/it-modelview", 0 );
+      ALLOC_STATE( mat[R200_MTX_MVP], tcl, MAT_STATE_SIZE, "MAT/modelproject", 0 );
+      ALLOC_STATE( mat[R200_MTX_TEX0], tcl_tex, MAT_STATE_SIZE, "MAT/texmat0", 0 );
+      ALLOC_STATE( mat[R200_MTX_TEX1], tcl_tex, MAT_STATE_SIZE, "MAT/texmat1", 1 );
+      ALLOC_STATE( mat[R200_MTX_TEX2], tcl_tex, MAT_STATE_SIZE, "MAT/texmat2", 2 );
+      ALLOC_STATE( mat[R200_MTX_TEX3], tcl_tex, MAT_STATE_SIZE, "MAT/texmat3", 3 );
+      ALLOC_STATE( mat[R200_MTX_TEX4], tcl_tex, MAT_STATE_SIZE, "MAT/texmat4", 4 );
+      ALLOC_STATE( mat[R200_MTX_TEX5], tcl_tex, MAT_STATE_SIZE, "MAT/texmat5", 5 );
+      ALLOC_STATE( ucp[0], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-0", 0 );
+      ALLOC_STATE( ucp[1], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-1", 1 );
+      ALLOC_STATE( ucp[2], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-2", 2 );
+      ALLOC_STATE( ucp[3], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-3", 3 );
+      ALLOC_STATE( ucp[4], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-4", 4 );
+      ALLOC_STATE( ucp[5], tcl_ucp, UCP_STATE_SIZE, "UCP/userclip-5", 5 );
+      ALLOC_STATE( lit[0], tcl_light, LIT_STATE_SIZE, "LIT/light-0", 0 );
+      ALLOC_STATE( lit[1], tcl_light, LIT_STATE_SIZE, "LIT/light-1", 1 );
+      ALLOC_STATE( lit[2], tcl_light, LIT_STATE_SIZE, "LIT/light-2", 2 );
+      ALLOC_STATE( lit[3], tcl_light, LIT_STATE_SIZE, "LIT/light-3", 3 );
+      ALLOC_STATE( lit[4], tcl_light, LIT_STATE_SIZE, "LIT/light-4", 4 );
+      ALLOC_STATE( lit[5], tcl_light, LIT_STATE_SIZE, "LIT/light-5", 5 );
+      ALLOC_STATE( lit[6], tcl_light, LIT_STATE_SIZE, "LIT/light-6", 6 );
+      ALLOC_STATE( lit[7], tcl_light, LIT_STATE_SIZE, "LIT/light-7", 7 );
+      ALLOC_STATE( sci, never, SCI_STATE_SIZE, "SCI/scissor", 0 );
+   }
    ALLOC_STATE( pix[0], pix_zero, PIX_STATE_SIZE, "PIX/pixstage-0", 0 );
    ALLOC_STATE( pix[1], texenv, PIX_STATE_SIZE, "PIX/pixstage-1", 1 );
    ALLOC_STATE( pix[2], texenv, PIX_STATE_SIZE, "PIX/pixstage-2", 2 );
    ALLOC_STATE( pix[3], texenv, PIX_STATE_SIZE, "PIX/pixstage-3", 3 );
    ALLOC_STATE( pix[4], texenv, PIX_STATE_SIZE, "PIX/pixstage-4", 4 );
    ALLOC_STATE( pix[5], texenv, PIX_STATE_SIZE, "PIX/pixstage-5", 5 );
-   if (rmesa->r200Screen->drmSupportsTriPerf) {
+   if (rmesa->radeon.radeonScreen->drmSupportsTriPerf) {
       ALLOC_STATE( prf, always, PRF_STATE_SIZE, "PRF/performance-tri", 0 );
    }
    else {
       ALLOC_STATE( prf, never, PRF_STATE_SIZE, "PRF/performance-tri", 0 );
    }
-   if (rmesa->r200Screen->drmSupportsPointSprites) {
+   if (rmesa->radeon.radeonScreen->drmSupportsPointSprites) {
       ALLOC_STATE( spr, always, SPR_STATE_SIZE, "SPR/pointsprite", 0 );
-      ALLOC_STATE( ptp, tcl, PTP_STATE_SIZE, "PTP/pointparams", 0 );
+      if (rmesa->radeon.radeonScreen->kernel_mm)
+         ALLOC_STATE( ptp, tcl_add8, PTP_STATE_SIZE, "PTP/pointparams", 0 );
+      else
+         ALLOC_STATE( ptp, tcl, PTP_STATE_SIZE, "PTP/pointparams", 0 );
    }
    else {
       ALLOC_STATE (spr, never, SPR_STATE_SIZE, "SPR/pointsprite", 0 );
@@ -409,87 +1034,125 @@ void r200InitState( r200ContextPtr rmesa )
 
    /* Fill in the packet headers:
     */
-   rmesa->hw.ctx.cmd[CTX_CMD_0] = cmdpkt(RADEON_EMIT_PP_MISC);
-   rmesa->hw.ctx.cmd[CTX_CMD_1] = cmdpkt(RADEON_EMIT_PP_CNTL);
-   rmesa->hw.ctx.cmd[CTX_CMD_2] = cmdpkt(RADEON_EMIT_RB3D_COLORPITCH);
-   if (rmesa->r200Screen->drmSupportsBlendColor)
-      rmesa->hw.ctx.cmd[CTX_CMD_3] = cmdpkt(R200_EMIT_RB3D_BLENDCOLOR);
-   rmesa->hw.lin.cmd[LIN_CMD_0] = cmdpkt(RADEON_EMIT_RE_LINE_PATTERN);
-   rmesa->hw.lin.cmd[LIN_CMD_1] = cmdpkt(RADEON_EMIT_SE_LINE_WIDTH);
-   rmesa->hw.msk.cmd[MSK_CMD_0] = cmdpkt(RADEON_EMIT_RB3D_STENCILREFMASK);
-   rmesa->hw.vpt.cmd[VPT_CMD_0] = cmdpkt(RADEON_EMIT_SE_VPORT_XSCALE);
-   rmesa->hw.set.cmd[SET_CMD_0] = cmdpkt(RADEON_EMIT_SE_CNTL);
-   rmesa->hw.msc.cmd[MSC_CMD_0] = cmdpkt(RADEON_EMIT_RE_MISC);
-   rmesa->hw.cst.cmd[CST_CMD_0] = cmdpkt(R200_EMIT_PP_CNTL_X);
-   rmesa->hw.cst.cmd[CST_CMD_1] = cmdpkt(R200_EMIT_RB3D_DEPTHXY_OFFSET);
-   rmesa->hw.cst.cmd[CST_CMD_2] = cmdpkt(R200_EMIT_RE_AUX_SCISSOR_CNTL);
-   rmesa->hw.cst.cmd[CST_CMD_3] = cmdpkt(R200_EMIT_RE_SCISSOR_TL_0);
-   rmesa->hw.cst.cmd[CST_CMD_4] = cmdpkt(R200_EMIT_SE_VAP_CNTL_STATUS);
-   rmesa->hw.cst.cmd[CST_CMD_5] = cmdpkt(R200_EMIT_RE_POINTSIZE);
-   rmesa->hw.cst.cmd[CST_CMD_6] = cmdpkt(R200_EMIT_TCL_INPUT_VTX_VECTOR_ADDR_0);
-   rmesa->hw.tam.cmd[TAM_CMD_0] = cmdpkt(R200_EMIT_PP_TAM_DEBUG3);
-   rmesa->hw.tf.cmd[TF_CMD_0] = cmdpkt(R200_EMIT_TFACTOR_0);
-   if (rmesa->r200Screen->drmSupportsFragShader) {
-      rmesa->hw.atf.cmd[ATF_CMD_0] = cmdpkt(R200_EMIT_ATF_TFACTOR);
-      rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_0);
-      rmesa->hw.tex[0].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_0);
-      rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_1);
-      rmesa->hw.tex[1].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_1);
-      rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_2);
-      rmesa->hw.tex[2].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_2);
-      rmesa->hw.tex[3].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_3);
-      rmesa->hw.tex[3].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_3);
-      rmesa->hw.tex[4].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_4);
-      rmesa->hw.tex[4].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_4);
-      rmesa->hw.tex[5].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_5);
-      rmesa->hw.tex[5].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_5);
+   rmesa->hw.ctx.cmd[CTX_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_MISC);
+   rmesa->hw.ctx.cmd[CTX_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_CNTL);
+   rmesa->hw.ctx.cmd[CTX_CMD_2] = cmdpkt(rmesa, RADEON_EMIT_RB3D_COLORPITCH);
+   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor)
+      rmesa->hw.ctx.cmd[CTX_CMD_3] = cmdpkt(rmesa, R200_EMIT_RB3D_BLENDCOLOR);
+   rmesa->hw.lin.cmd[LIN_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RE_LINE_PATTERN);
+   rmesa->hw.lin.cmd[LIN_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_SE_LINE_WIDTH);
+   rmesa->hw.msk.cmd[MSK_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RB3D_STENCILREFMASK);
+   rmesa->hw.vpt.cmd[VPT_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_VPORT_XSCALE);
+   rmesa->hw.set.cmd[SET_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_CNTL);
+   rmesa->hw.msc.cmd[MSC_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RE_MISC);
+   rmesa->hw.cst.cmd[CST_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CNTL_X);
+   rmesa->hw.cst.cmd[CST_CMD_1] = cmdpkt(rmesa, R200_EMIT_RB3D_DEPTHXY_OFFSET);
+   rmesa->hw.cst.cmd[CST_CMD_2] = cmdpkt(rmesa, R200_EMIT_RE_AUX_SCISSOR_CNTL);
+   rmesa->hw.cst.cmd[CST_CMD_3] = cmdpkt(rmesa, R200_EMIT_RE_SCISSOR_TL_0);
+   rmesa->hw.cst.cmd[CST_CMD_4] = cmdpkt(rmesa, R200_EMIT_SE_VAP_CNTL_STATUS);
+   rmesa->hw.cst.cmd[CST_CMD_5] = cmdpkt(rmesa, R200_EMIT_RE_POINTSIZE);
+   rmesa->hw.cst.cmd[CST_CMD_6] = cmdpkt(rmesa, R200_EMIT_TCL_INPUT_VTX_VECTOR_ADDR_0);
+   rmesa->hw.tam.cmd[TAM_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TAM_DEBUG3);
+   rmesa->hw.tf.cmd[TF_CMD_0] = cmdpkt(rmesa, R200_EMIT_TFACTOR_0);
+   if (rmesa->radeon.radeonScreen->drmSupportsFragShader) {
+      rmesa->hw.atf.cmd[ATF_CMD_0] = cmdpkt(rmesa, R200_EMIT_ATF_TFACTOR);
+      rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_0);
+      rmesa->hw.tex[0].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_0);
+      rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_1);
+      rmesa->hw.tex[1].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_1);
+      rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_2);
+      rmesa->hw.tex[2].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_2);
+      rmesa->hw.tex[3].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_3);
+      rmesa->hw.tex[3].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_3);
+      rmesa->hw.tex[4].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_4);
+      rmesa->hw.tex[4].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_4);
+      rmesa->hw.tex[5].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_5);
+      rmesa->hw.tex[5].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_5);
    } else {
-      rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_0);
-      rmesa->hw.tex[0].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_0);
-      rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_1);
-      rmesa->hw.tex[1].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_1);
-      rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_2);
-      rmesa->hw.tex[2].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_2);
-      rmesa->hw.tex[3].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_3);
-      rmesa->hw.tex[3].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_3);
-      rmesa->hw.tex[4].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_4);
-      rmesa->hw.tex[4].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_4);
-      rmesa->hw.tex[5].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_5);
-      rmesa->hw.tex[5].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_5);
-   }
-   rmesa->hw.afs[0].cmd[AFS_CMD_0] = cmdpkt(R200_EMIT_PP_AFS_0);
-   rmesa->hw.afs[1].cmd[AFS_CMD_0] = cmdpkt(R200_EMIT_PP_AFS_1);
-   rmesa->hw.pvs.cmd[PVS_CMD_0] = cmdpkt(R200_EMIT_VAP_PVS_CNTL);
-   rmesa->hw.cube[0].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_0);
-   rmesa->hw.cube[0].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_0);
-   rmesa->hw.cube[1].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_1);
-   rmesa->hw.cube[1].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_1);
-   rmesa->hw.cube[2].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_2);
-   rmesa->hw.cube[2].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_2);
-   rmesa->hw.cube[3].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_3);
-   rmesa->hw.cube[3].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_3);
-   rmesa->hw.cube[4].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_4);
-   rmesa->hw.cube[4].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_4);
-   rmesa->hw.cube[5].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_5);
-   rmesa->hw.cube[5].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_5);
-   rmesa->hw.pix[0].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_0);
-   rmesa->hw.pix[1].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_1);
-   rmesa->hw.pix[2].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_2);
-   rmesa->hw.pix[3].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_3);
-   rmesa->hw.pix[4].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_4);
-   rmesa->hw.pix[5].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_5);
-   rmesa->hw.zbs.cmd[ZBS_CMD_0] = cmdpkt(RADEON_EMIT_SE_ZBIAS_FACTOR);
-   rmesa->hw.tcl.cmd[TCL_CMD_0] = cmdpkt(R200_EMIT_TCL_LIGHT_MODEL_CTL_0);
-   rmesa->hw.tcl.cmd[TCL_CMD_1] = cmdpkt(R200_EMIT_TCL_UCP_VERT_BLEND_CTL);
-   rmesa->hw.tcg.cmd[TCG_CMD_0] = cmdpkt(R200_EMIT_TEX_PROC_CTL_2);
-   rmesa->hw.msl.cmd[MSL_CMD_0] = cmdpkt(R200_EMIT_MATRIX_SELECT_0);
-   rmesa->hw.vap.cmd[VAP_CMD_0] = cmdpkt(R200_EMIT_VAP_CTL);
-   rmesa->hw.vtx.cmd[VTX_CMD_0] = cmdpkt(R200_EMIT_VTX_FMT_0);
-   rmesa->hw.vtx.cmd[VTX_CMD_1] = cmdpkt(R200_EMIT_OUTPUT_VTX_COMP_SEL);
-   rmesa->hw.vtx.cmd[VTX_CMD_2] = cmdpkt(R200_EMIT_SE_VTX_STATE_CNTL);
-   rmesa->hw.vte.cmd[VTE_CMD_0] = cmdpkt(R200_EMIT_VTE_CNTL);
-   rmesa->hw.prf.cmd[PRF_CMD_0] = cmdpkt(R200_EMIT_PP_TRI_PERF_CNTL);
-   rmesa->hw.spr.cmd[SPR_CMD_0] = cmdpkt(R200_EMIT_TCL_POINT_SPRITE_CNTL);
+      rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_0);
+      rmesa->hw.tex[0].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_0);
+      rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_1);
+      rmesa->hw.tex[1].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_1);
+      rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_2);
+      rmesa->hw.tex[2].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_2);
+      rmesa->hw.tex[3].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_3);
+      rmesa->hw.tex[3].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_3);
+      rmesa->hw.tex[4].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_4);
+      rmesa->hw.tex[4].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_4);
+      rmesa->hw.tex[5].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_5);
+      rmesa->hw.tex[5].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_5);
+   }
+   rmesa->hw.afs[0].cmd[AFS_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_AFS_0);
+   rmesa->hw.afs[1].cmd[AFS_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_AFS_1);
+   rmesa->hw.pvs.cmd[PVS_CMD_0] = cmdpkt(rmesa, R200_EMIT_VAP_PVS_CNTL);
+   rmesa->hw.cube[0].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_0);
+   rmesa->hw.cube[0].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_0);
+   rmesa->hw.cube[1].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_1);
+   rmesa->hw.cube[1].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_1);
+   rmesa->hw.cube[2].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_2);
+   rmesa->hw.cube[2].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_2);
+   rmesa->hw.cube[3].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_3);
+   rmesa->hw.cube[3].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_3);
+   rmesa->hw.cube[4].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_4);
+   rmesa->hw.cube[4].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_4);
+   rmesa->hw.cube[5].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_5);
+   rmesa->hw.cube[5].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_5);
+   rmesa->hw.pix[0].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_0);
+   rmesa->hw.pix[1].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_1);
+   rmesa->hw.pix[2].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_2);
+   rmesa->hw.pix[3].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_3);
+   rmesa->hw.pix[4].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_4);
+   rmesa->hw.pix[5].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_5);
+   rmesa->hw.zbs.cmd[ZBS_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_ZBIAS_FACTOR);
+   rmesa->hw.tcl.cmd[TCL_CMD_0] = cmdpkt(rmesa, R200_EMIT_TCL_LIGHT_MODEL_CTL_0);
+   rmesa->hw.tcl.cmd[TCL_CMD_1] = cmdpkt(rmesa, R200_EMIT_TCL_UCP_VERT_BLEND_CTL);
+   rmesa->hw.tcg.cmd[TCG_CMD_0] = cmdpkt(rmesa, R200_EMIT_TEX_PROC_CTL_2);
+   rmesa->hw.msl.cmd[MSL_CMD_0] = cmdpkt(rmesa, R200_EMIT_MATRIX_SELECT_0);
+   rmesa->hw.vap.cmd[VAP_CMD_0] = cmdpkt(rmesa, R200_EMIT_VAP_CTL);
+   rmesa->hw.vtx.cmd[VTX_CMD_0] = cmdpkt(rmesa, R200_EMIT_VTX_FMT_0);
+   rmesa->hw.vtx.cmd[VTX_CMD_1] = cmdpkt(rmesa, R200_EMIT_OUTPUT_VTX_COMP_SEL);
+   rmesa->hw.vtx.cmd[VTX_CMD_2] = cmdpkt(rmesa, R200_EMIT_SE_VTX_STATE_CNTL);
+   rmesa->hw.vte.cmd[VTE_CMD_0] = cmdpkt(rmesa, R200_EMIT_VTE_CNTL);
+   rmesa->hw.prf.cmd[PRF_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TRI_PERF_CNTL);
+   rmesa->hw.spr.cmd[SPR_CMD_0] = cmdpkt(rmesa, R200_EMIT_TCL_POINT_SPRITE_CNTL);
+
+   rmesa->hw.sci.cmd[SCI_CMD_0] = CP_PACKET0(R200_RE_AUX_SCISSOR_CNTL, 0);
+   rmesa->hw.sci.cmd[SCI_CMD_1] = CP_PACKET0(R200_RE_TOP_LEFT, 0);
+   rmesa->hw.sci.cmd[SCI_CMD_2] = CP_PACKET0(R200_RE_WIDTH_HEIGHT, 0);
+
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+
+	rmesa->hw.stp.cmd[STP_CMD_0] = CP_PACKET0(RADEON_RE_STIPPLE_ADDR, 0);
+	rmesa->hw.stp.cmd[STP_DATA_0] = 0;
+	rmesa->hw.stp.cmd[STP_CMD_1] = CP_PACKET0_ONE(RADEON_RE_STIPPLE_DATA, 31);
+
+        rmesa->hw.mtl[0].emit = mtl_emit;
+        rmesa->hw.mtl[1].emit = mtl_emit;
+
+        rmesa->hw.vpi[0].emit = veclinear_emit;
+        rmesa->hw.vpi[1].emit = veclinear_emit;
+        rmesa->hw.vpp[0].emit = veclinear_emit;
+        rmesa->hw.vpp[1].emit = veclinear_emit;
+
+        rmesa->hw.grd.emit = scl_emit;
+        rmesa->hw.fog.emit = vec_emit;
+        rmesa->hw.glt.emit = vec_emit;
+        rmesa->hw.eye.emit = vec_emit;
+
+	for (i = R200_MTX_MV; i <= R200_MTX_TEX5; i++)
+	  rmesa->hw.mat[i].emit = vec_emit;
+
+	for (i = 0; i < 8; i++)
+	  rmesa->hw.lit[i].emit = lit_emit;
+
+	for (i = 0; i < 6; i++)
+	  rmesa->hw.ucp[i].emit = vec_emit;
+
+	rmesa->hw.ptp.emit = ptp_emit;
+   }
+
+
+   
    rmesa->hw.mtl[0].cmd[MTL_CMD_0] = 
       cmdvec( R200_VS_MAT_0_EMISS, 1, 16 );
    rmesa->hw.mtl[0].cmd[MTL_CMD_1] = 
@@ -567,7 +1230,7 @@ void r200InitState( r200ContextPtr rmesa )
 				(R200_BLEND_GL_ONE << R200_SRC_BLEND_SHIFT) |
 				(R200_BLEND_GL_ZERO << R200_DST_BLEND_SHIFT));
 
-   if (rmesa->r200Screen->drmSupportsBlendColor) {
+   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor) {
       rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCOLOR] = 0x00000000;
       rmesa->hw.ctx.cmd[CTX_RB3D_ABLENDCNTL] = (R200_COMB_FCN_ADD_CLAMP |
 				(R200_BLEND_GL_ONE << R200_SRC_BLEND_SHIFT) |
@@ -578,18 +1241,17 @@ void r200InitState( r200ContextPtr rmesa )
    }
 
    rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHOFFSET] =
-      rmesa->r200Screen->depthOffset + rmesa->r200Screen->fbLocation;
+      rmesa->radeon.radeonScreen->depthOffset + rmesa->radeon.radeonScreen->fbLocation;
 
    rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHPITCH] = 
-      ((rmesa->r200Screen->depthPitch &
+      ((rmesa->radeon.radeonScreen->depthPitch &
 	R200_DEPTHPITCH_MASK) |
        R200_DEPTH_ENDIAN_NO_SWAP);
    
    if (rmesa->using_hyperz)
       rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHPITCH] |= R200_DEPTH_HYPERZ;
 
-   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] = (depth_fmt |
-					       R200_Z_TEST_LESS |
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] = (R200_Z_TEST_LESS |
 					       R200_STENCIL_TEST_ALWAYS |
 					       R200_STENCIL_FAIL_KEEP |
 					       R200_STENCIL_ZPASS_KEEP |
@@ -599,15 +1261,14 @@ void r200InitState( r200ContextPtr rmesa )
    if (rmesa->using_hyperz) {
       rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= R200_Z_COMPRESSION_ENABLE |
 						  R200_Z_DECOMPRESSION_ENABLE;
-/*      if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200)
+/*      if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R200)
 	 rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_HIERARCHY_ENABLE;*/
    }
 
    rmesa->hw.ctx.cmd[CTX_PP_CNTL] = (R200_ANTI_ALIAS_NONE 
  				     | R200_TEX_BLEND_0_ENABLE);
 
-   rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = color_fmt;
-   switch ( driQueryOptioni( &rmesa->optionCache, "dither_mode" ) ) {
+   switch ( driQueryOptioni( &rmesa->radeon.optionCache, "dither_mode" ) ) {
    case DRI_CONF_DITHER_XERRORDIFFRESET:
       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= R200_DITHER_INIT;
       break;
@@ -615,41 +1276,19 @@ void r200InitState( r200ContextPtr rmesa )
       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= R200_SCALE_DITHER_ENABLE;
       break;
    }
-   if ( driQueryOptioni( &rmesa->optionCache, "round_mode" ) ==
+   if ( driQueryOptioni( &rmesa->radeon.optionCache, "round_mode" ) ==
 	DRI_CONF_ROUND_ROUND )
-      rmesa->state.color.roundEnable = R200_ROUND_ENABLE;
+      rmesa->radeon.state.color.roundEnable = R200_ROUND_ENABLE;
    else
-      rmesa->state.color.roundEnable = 0;
-   if ( driQueryOptioni (&rmesa->optionCache, "color_reduction" ) ==
+      rmesa->radeon.state.color.roundEnable = 0;
+   if ( driQueryOptioni (&rmesa->radeon.optionCache, "color_reduction" ) ==
 	DRI_CONF_COLOR_REDUCTION_DITHER )
       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= R200_DITHER_ENABLE;
    else
-      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= rmesa->state.color.roundEnable;
-
-#if 000
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] = ((rmesa->state.color.drawOffset +
-					       rmesa->r200Screen->fbLocation)
-					      & R200_COLOROFFSET_MASK);
-
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = ((rmesa->state.color.drawPitch &
-					      R200_COLORPITCH_MASK) |
-					     R200_COLOR_ENDIAN_NO_SWAP);
-#else
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] = ((drawOffset +
-					       rmesa->r200Screen->fbLocation)
-					      & R200_COLOROFFSET_MASK);
-
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = ((drawPitch &
-					      R200_COLORPITCH_MASK) |
-					     R200_COLOR_ENDIAN_NO_SWAP);
-#endif
-   /* (fixed size) sarea is initialized to zero afaics so can omit version check. Phew! */
-   if (rmesa->sarea->tiling_enabled) {
-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
-   }
+      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= rmesa->radeon.state.color.roundEnable;
 
    rmesa->hw.prf.cmd[PRF_PP_TRI_PERF] = R200_TRI_CUTOFF_MASK - R200_TRI_CUTOFF_MASK * 
-			driQueryOptionf (&rmesa->optionCache,"texture_blend_quality");
+			driQueryOptionf (&rmesa->radeon.optionCache,"texture_blend_quality");
    rmesa->hw.prf.cmd[PRF_PP_PERF_CNTL] = 0;
 
    rmesa->hw.set.cmd[SET_SE_CNTL] = (R200_FFACE_CULL_CCW |
@@ -704,7 +1343,7 @@ void r200InitState( r200ContextPtr rmesa )
 						R200_VC_NO_SWAP;
 #endif
 
-   if (!(rmesa->r200Screen->chip_flags & RADEON_CHIPSET_TCL)) {
+   if (!(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
       /* Bypass TCL */
       rmesa->hw.cst.cmd[CST_SE_VAP_CNTL_STATUS] |= (1<<8);
    }
@@ -743,28 +1382,28 @@ void r200InitState( r200ContextPtr rmesa )
       rmesa->hw.tex[i].cmd[TEX_PP_TXFORMAT_X] =
          (/* R200_TEXCOORD_PROJ | */
           0x100000);	/* Small default bias */
-      if (rmesa->r200Screen->drmSupportsFragShader) {
+      if (rmesa->radeon.radeonScreen->drmSupportsFragShader) {
 	 rmesa->hw.tex[i].cmd[TEX_PP_TXOFFSET_NEWDRM] =
-	     rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+	     rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
 	 rmesa->hw.tex[i].cmd[TEX_PP_CUBIC_FACES] = 0;
 	 rmesa->hw.tex[i].cmd[TEX_PP_TXMULTI_CTL] = 0;
       }
       else {
 	  rmesa->hw.tex[i].cmd[TEX_PP_TXOFFSET_OLDDRM] =
-	     rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+	     rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
      }
 
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_FACES] = 0;
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F1] =
-         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+         rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F2] =
-         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+         rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F3] =
-         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+         rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F4] =
-         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+         rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F5] =
-         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+         rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
 
       rmesa->hw.pix[i].cmd[PIX_PP_TXCBLEND] =
          (R200_TXC_ARG_A_ZERO |
@@ -967,5 +1606,13 @@ void r200InitState( r200ContextPtr rmesa )
 
    r200LightingSpaceChange( ctx );
 
-   rmesa->hw.all_dirty = GL_TRUE;
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+      radeon_init_query_stateobj(&rmesa->radeon, R200_QUERYOBJ_CMDSIZE);
+      rmesa->radeon.query.queryobj.cmd[R200_QUERYOBJ_CMD_0] = CP_PACKET0(RADEON_RB3D_ZPASS_DATA, 0);
+      rmesa->radeon.query.queryobj.cmd[R200_QUERYOBJ_DATA_0] = 0;
+   }
+
+   rmesa->radeon.hw.all_dirty = GL_TRUE;
+
+   rcommonInitCmdBuf(&rmesa->radeon);
 }
diff --git a/src/mesa/drivers/dri/r200/r200_swtcl.c b/src/mesa/drivers/dri/r200/r200_swtcl.c
index b25f028244..240fb45078 100644
--- a/src/mesa/drivers/dri/r200/r200_swtcl.c
+++ b/src/mesa/drivers/dri/r200/r200_swtcl.c
@@ -39,6 +39,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/image.h"
 #include "main/imports.h"
 #include "main/macros.h"
+#include "main/simple_list.h"
 
 #include "swrast/s_context.h"
 #include "swrast/s_fog.h"
@@ -55,27 +56,24 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_tcl.h"
 
 
-static void flush_last_swtcl_prim( r200ContextPtr rmesa  );
-
-
 /***********************************************************************
- *                         Initialization 
+ *                         Initialization
  ***********************************************************************/
 
 #define EMIT_ATTR( ATTR, STYLE, F0 )					\
 do {									\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = (ATTR);	\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = (STYLE);	\
-   rmesa->swtcl.vertex_attr_count++;					\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = (ATTR);	\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = (STYLE);	\
+   rmesa->radeon.swtcl.vertex_attr_count++;					\
    fmt_0 |= F0;								\
 } while (0)
 
 #define EMIT_PAD( N )							\
 do {									\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = 0;		\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = EMIT_PAD;	\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].offset = (N);		\
-   rmesa->swtcl.vertex_attr_count++;					\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = 0;		\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = EMIT_PAD;	\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].offset = (N);		\
+   rmesa->radeon.swtcl.vertex_attr_count++;					\
 } while (0)
 
 static void r200SetVertexFormat( GLcontext *ctx )
@@ -100,7 +98,7 @@ static void r200SetVertexFormat( GLcontext *ctx )
    }
 
    assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
-   rmesa->swtcl.vertex_attr_count = 0;
+   rmesa->radeon.swtcl.vertex_attr_count = 0;
 
    /* EMIT_ATTR's must be in order as they tell t_vertex.c how to
     * build up a hardware vertex.
@@ -121,7 +119,7 @@ static void r200SetVertexFormat( GLcontext *ctx )
    }
 
    rmesa->swtcl.coloroffset = offset;
-#if MESA_LITTLE_ENDIAN 
+#if MESA_LITTLE_ENDIAN
    EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA, (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT) );
 #else
    EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR, (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT) );
@@ -132,7 +130,7 @@ static void r200SetVertexFormat( GLcontext *ctx )
    if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 ) ||
        RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
 
-#if MESA_LITTLE_ENDIAN 
+#if MESA_LITTLE_ENDIAN
       if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
 	 rmesa->swtcl.specoffset = offset;
 	 EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_3UB_3F_RGB, (R200_VTX_PK_RGBA << R200_VTX_COLOR_1_SHIFT) );
@@ -185,7 +183,7 @@ static void r200SetVertexFormat( GLcontext *ctx )
       rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] |= R200_FOG_USE_SPEC_ALPHA;
    }
 
-   if (!RENDERINPUTS_EQUAL( rmesa->tnl_index_bitset, index_bitset ) ||
+   if (!RENDERINPUTS_EQUAL( rmesa->radeon.tnl_index_bitset, index_bitset ) ||
 	(rmesa->hw.vtx.cmd[VTX_VTXFMT_0] != fmt_0) ||
 	(rmesa->hw.vtx.cmd[VTX_VTXFMT_1] != fmt_1) ) {
       R200_NEWPRIM(rmesa);
@@ -193,26 +191,42 @@ static void r200SetVertexFormat( GLcontext *ctx )
       rmesa->hw.vtx.cmd[VTX_VTXFMT_0] = fmt_0;
       rmesa->hw.vtx.cmd[VTX_VTXFMT_1] = fmt_1;
 
-      rmesa->swtcl.vertex_size =
+      rmesa->radeon.swtcl.vertex_size =
 	  _tnl_install_attrs( ctx,
-			      rmesa->swtcl.vertex_attrs, 
-			      rmesa->swtcl.vertex_attr_count,
+			      rmesa->radeon.swtcl.vertex_attrs,
+			      rmesa->radeon.swtcl.vertex_attr_count,
 			      NULL, 0 );
-      rmesa->swtcl.vertex_size /= 4;
-      RENDERINPUTS_COPY( rmesa->tnl_index_bitset, index_bitset );
+      rmesa->radeon.swtcl.vertex_size /= 4;
+      RENDERINPUTS_COPY( rmesa->radeon.tnl_index_bitset, index_bitset );
+   }
+}
+
+static void r200_predict_emit_size( r200ContextPtr rmesa )
+{
+   if (RADEON_DEBUG & RADEON_VERTS)
+      fprintf(stderr, "%s\n", __func__);
+   const int vertex_array_size = 7;
+   const int prim_size = 3;
+   if (!rmesa->radeon.swtcl.emit_prediction) {
+      const int state_size = radeonCountStateEmitSize(&rmesa->radeon);
+      if (rcommonEnsureCmdBufSpace(&rmesa->radeon,
+	       state_size +
+	       vertex_array_size + prim_size,
+	       __FUNCTION__))
+	 rmesa->radeon.swtcl.emit_prediction = radeonCountStateEmitSize(&rmesa->radeon);
+      else
+	 rmesa->radeon.swtcl.emit_prediction = state_size;
+      rmesa->radeon.swtcl.emit_prediction += vertex_array_size + prim_size
+	 + rmesa->radeon.cmdbuf.cs->cdw;
    }
 }
 
 
 static void r200RenderStart( GLcontext *ctx )
 {
-   r200ContextPtr rmesa = R200_CONTEXT( ctx );
-
    r200SetVertexFormat( ctx );
-
-   if (rmesa->dma.flush != 0 && 
-       rmesa->dma.flush != flush_last_swtcl_prim)
-      rmesa->dma.flush( rmesa );
+   if (RADEON_DEBUG & RADEON_VERTS)
+      fprintf(stderr, "%s\n", __func__);
 }
 
 
@@ -232,7 +246,7 @@ void r200ChooseVertexState( GLcontext *ctx )
     * rasterization fallback.  As this function will be called again when we
     * leave a rasterization fallback, we can just skip it for now.
     */
-   if (rmesa->Fallback != 0)
+   if (rmesa->radeon.Fallback != 0)
       return;
 
    vte = rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL];
@@ -273,78 +287,32 @@ void r200ChooseVertexState( GLcontext *ctx )
    }
 }
 
-
-/* Flush vertices in the current dma region.
- */
-static void flush_last_swtcl_prim( r200ContextPtr rmesa  )
-{
-   if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   rmesa->dma.flush = NULL;
-
-   if (rmesa->dma.current.buf) {
-      struct r200_dma_region *current = &rmesa->dma.current;
-      GLuint current_offset = (rmesa->r200Screen->gart_buffer_offset +
-			       current->buf->buf->idx * RADEON_BUFFER_SIZE + 
-			       current->start);
-
-      assert (!(rmesa->swtcl.hw_primitive & R200_VF_PRIM_WALK_IND));
-
-      assert (current->start + 
-	      rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-	      current->ptr);
-
-      if (rmesa->dma.current.start != rmesa->dma.current.ptr) {
-	 r200EnsureCmdBufSpace( rmesa, VERT_AOS_BUFSZ +
-			        rmesa->hw.max_state_size + VBUF_BUFSZ );
-	 r200EmitVertexAOS( rmesa,
-			      rmesa->swtcl.vertex_size,
-			      current_offset);
-
-	 r200EmitVbufPrim( rmesa,
-			   rmesa->swtcl.hw_primitive,
-			   rmesa->swtcl.numverts);
-      }
-
-      rmesa->swtcl.numverts = 0;
-      current->start = current->ptr;
-   }
-}
-
-
-/* Alloc space in the current dma region.
- */
-static INLINE void *
-r200AllocDmaLowVerts( r200ContextPtr rmesa, int nverts, int vsize )
+void r200_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
 {
-   GLuint bytes = vsize * nverts;
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   if (RADEON_DEBUG & RADEON_VERTS)
+      fprintf(stderr, "%s\n", __func__);
 
-   if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
-      r200RefillCurrentDmaRegion( rmesa );
 
-   if (!rmesa->dma.flush) {
-      rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-      rmesa->dma.flush = flush_last_swtcl_prim;
-   }
+   radeonEmitState(&rmesa->radeon);
+   r200EmitVertexAOS( rmesa,
+		      rmesa->radeon.swtcl.vertex_size,
+		      first_elem(&rmesa->radeon.dma.reserved)->bo,
+		      current_offset);
 
-   ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
-   ASSERT( rmesa->dma.flush == flush_last_swtcl_prim );
-   ASSERT( rmesa->dma.current.start + 
-	   rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-	   rmesa->dma.current.ptr );
 
+   r200EmitVbufPrim( rmesa,
+		     rmesa->radeon.swtcl.hw_primitive,
+		     rmesa->radeon.swtcl.numverts);
+   if ( rmesa->radeon.swtcl.emit_prediction < rmesa->radeon.cmdbuf.cs->cdw )
+      WARN_ONCE("Rendering was %d commands larger than predicted size."
+	    " We might overflow  command buffer.\n",
+	    rmesa->radeon.cmdbuf.cs->cdw - rmesa->radeon.swtcl.emit_prediction );
 
-   {
-      GLubyte *head = (GLubyte *) (rmesa->dma.current.address + rmesa->dma.current.ptr);
-      rmesa->dma.current.ptr += bytes;
-      rmesa->swtcl.numverts += nverts;
-      return head;
-   }
+   rmesa->radeon.swtcl.emit_prediction = 0;
 
 }
 
-
 /**************************************************************************/
 
 
@@ -389,17 +357,27 @@ static void r200ResetLineStipple( GLcontext *ctx );
 #define HAVE_POLYGONS    1
 #define HAVE_ELTS        0
 
+static void* r200_alloc_verts( r200ContextPtr rmesa, GLuint n, GLuint size)
+{
+   void *rv;
+   do {
+      r200_predict_emit_size( rmesa );
+      rv = rcommonAllocDmaLowVerts( &rmesa->radeon, n, size * 4 );
+   } while(!rv);
+   return rv;
+}
+
 #undef LOCAL_VARS
 #undef ALLOC_VERTS
 #define CTX_ARG r200ContextPtr rmesa
-#define GET_VERTEX_DWORDS() rmesa->swtcl.vertex_size
-#define ALLOC_VERTS( n, size ) r200AllocDmaLowVerts( rmesa, n, size * 4 )
+#define GET_VERTEX_DWORDS() rmesa->radeon.swtcl.vertex_size
+#define ALLOC_VERTS( n, size ) r200_alloc_verts(rmesa, n, size)
 #define LOCAL_VARS						\
    r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
-   const char *r200verts = (char *)rmesa->swtcl.verts;
-#define VERT(x) (r200Vertex *)(r200verts + ((x) * vertsize * sizeof(int)))
-#define VERTEX r200Vertex 
-#define DO_DEBUG_VERTS (1 && (R200_DEBUG & DEBUG_VERTS))
+   const char *r200verts = (char *)rmesa->radeon.swtcl.verts;
+#define VERT(x) (radeonVertex *)(r200verts + ((x) * vertsize * sizeof(int)))
+#define VERTEX radeonVertex
+#define DO_DEBUG_VERTS (1 && (R200_DEBUG & RADEON_VERTS))
 
 #undef TAG
 #define TAG(x) r200_##x
@@ -456,11 +434,11 @@ static struct {
 #define VERT_Y(_v) _v->v.y
 #define VERT_Z(_v) _v->v.z
 #define AREA_IS_CCW( a ) (a < 0)
-#define GET_VERTEX(e) (rmesa->swtcl.verts + (e*rmesa->swtcl.vertex_size*sizeof(int)))
+#define GET_VERTEX(e) (rmesa->radeon.swtcl.verts + (e*rmesa->radeon.swtcl.vertex_size*sizeof(int)))
 
 #define VERT_SET_RGBA( v, c )  					\
 do {								\
-   r200_color_t *color = (r200_color_t *)&((v)->ui[coloroffset]);	\
+   radeon_color_t *color = (radeon_color_t *)&((v)->ui[coloroffset]);	\
    UNCLAMPED_FLOAT_TO_UBYTE(color->red, (c)[0]);		\
    UNCLAMPED_FLOAT_TO_UBYTE(color->green, (c)[1]);		\
    UNCLAMPED_FLOAT_TO_UBYTE(color->blue, (c)[2]);		\
@@ -472,7 +450,7 @@ do {								\
 #define VERT_SET_SPEC( v, c )					\
 do {								\
    if (specoffset) {						\
-      r200_color_t *spec = (r200_color_t *)&((v)->ui[specoffset]);	\
+      radeon_color_t *spec = (radeon_color_t *)&((v)->ui[specoffset]);	\
       UNCLAMPED_FLOAT_TO_UBYTE(spec->red, (c)[0]);	\
       UNCLAMPED_FLOAT_TO_UBYTE(spec->green, (c)[1]);	\
       UNCLAMPED_FLOAT_TO_UBYTE(spec->blue, (c)[2]);	\
@@ -481,8 +459,8 @@ do {								\
 #define VERT_COPY_SPEC( v0, v1 )			\
 do {							\
    if (specoffset) {					\
-      r200_color_t *spec0 = (r200_color_t *)&((v0)->ui[specoffset]);	\
-      r200_color_t *spec1 = (r200_color_t *)&((v1)->ui[specoffset]);	\
+      radeon_color_t *spec0 = (radeon_color_t *)&((v0)->ui[specoffset]);	\
+      radeon_color_t *spec1 = (radeon_color_t *)&((v1)->ui[specoffset]);	\
       spec0->red   = spec1->red;	\
       spec0->green = spec1->green;	\
       spec0->blue  = spec1->blue; 	\
@@ -503,7 +481,7 @@ do {							\
 
 #define LOCAL_VARS(n)							\
    r200ContextPtr rmesa = R200_CONTEXT(ctx);			\
-   GLuint color[n], spec[n];						\
+   GLuint color[n] = {0}, spec[n] = {0};						\
    GLuint coloroffset = rmesa->swtcl.coloroffset;	\
    GLuint specoffset = rmesa->swtcl.specoffset;			\
    (void) color; (void) spec; (void) coloroffset; (void) specoffset;
@@ -513,7 +491,7 @@ do {							\
  ***********************************************************************/
 
 #define RASTERIZE(x) r200RasterPrimitive( ctx, reduced_hw_prim(ctx, x) )
-#define RENDER_PRIMITIVE rmesa->swtcl.render_primitive
+#define RENDER_PRIMITIVE rmesa->radeon.swtcl.render_primitive
 #undef TAG
 #define TAG(x) x
 #include "tnl_dd/t_dd_unfilled.h"
@@ -569,8 +547,8 @@ static void init_rast_tab( void )
 #undef LOCAL_VARS
 #define LOCAL_VARS						\
    r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
-   const GLuint vertsize = rmesa->swtcl.vertex_size;		\
-   const char *r200verts = (char *)rmesa->swtcl.verts;		\
+   const GLuint vertsize = rmesa->radeon.swtcl.vertex_size;		\
+   const char *r200verts = (char *)rmesa->radeon.swtcl.verts;		\
    const GLuint * const elt = TNL_CONTEXT(ctx)->vb.Elts;	\
    const GLboolean stipple = ctx->Line.StippleFlag;		\
    (void) elt; (void) stipple;
@@ -599,13 +577,13 @@ void r200ChooseRenderState( GLcontext *ctx )
    GLuint index = 0;
    GLuint flags = ctx->_TriangleCaps;
 
-   if (!rmesa->TclFallback || rmesa->Fallback) 
+   if (!rmesa->radeon.TclFallback || rmesa->radeon.Fallback)
       return;
 
    if (flags & DD_TRI_LIGHT_TWOSIDE) index |= R200_TWOSIDE_BIT;
    if (flags & DD_TRI_UNFILLED)      index |= R200_UNFILLED_BIT;
 
-   if (index != rmesa->swtcl.RenderIndex) {
+   if (index != rmesa->radeon.swtcl.RenderIndex) {
       tnl->Driver.Render.Points = rast_tab[index].points;
       tnl->Driver.Render.Line = rast_tab[index].line;
       tnl->Driver.Render.ClippedLine = rast_tab[index].line;
@@ -622,7 +600,7 @@ void r200ChooseRenderState( GLcontext *ctx )
 	 tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
       }
 
-      rmesa->swtcl.RenderIndex = index;
+      rmesa->radeon.swtcl.RenderIndex = index;
    }
 }
 
@@ -636,7 +614,7 @@ static void r200RasterPrimitive( GLcontext *ctx, GLuint hwprim )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
-   if (rmesa->swtcl.hw_primitive != hwprim) {
+   if (rmesa->radeon.swtcl.hw_primitive != hwprim) {
       /* need to disable perspective-correct texturing for point sprites */
       if ((hwprim & 0xf) == R200_VF_PRIM_POINT_SPRITES && ctx->Point.PointSprite) {
 	 if (rmesa->hw.set.cmd[SET_RE_CNTL] & R200_PERSPECTIVE_ENABLE) {
@@ -649,15 +627,15 @@ static void r200RasterPrimitive( GLcontext *ctx, GLuint hwprim )
 	 rmesa->hw.set.cmd[SET_RE_CNTL] |= R200_PERSPECTIVE_ENABLE;
       }
       R200_NEWPRIM( rmesa );
-      rmesa->swtcl.hw_primitive = hwprim;
+      rmesa->radeon.swtcl.hw_primitive = hwprim;
    }
 }
 
 static void r200RenderPrimitive( GLcontext *ctx, GLenum prim )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   rmesa->swtcl.render_primitive = prim;
-   if (prim < GL_TRIANGLES || !(ctx->_TriangleCaps & DD_TRI_UNFILLED)) 
+   rmesa->radeon.swtcl.render_primitive = prim;
+   if (prim < GL_TRIANGLES || !(ctx->_TriangleCaps & DD_TRI_UNFILLED))
       r200RasterPrimitive( ctx, reduced_hw_prim(ctx, prim) );
 }
 
@@ -701,23 +679,23 @@ void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    TNLcontext *tnl = TNL_CONTEXT(ctx);
-   GLuint oldfallback = rmesa->Fallback;
+   GLuint oldfallback = rmesa->radeon.Fallback;
 
    if (mode) {
-      rmesa->Fallback |= bit;
+      rmesa->radeon.Fallback |= bit;
       if (oldfallback == 0) {
-	 R200_FIREVERTICES( rmesa );
+	 radeon_firevertices(&rmesa->radeon);
 	 TCL_FALLBACK( ctx, R200_TCL_FALLBACK_RASTER, GL_TRUE );
 	 _swsetup_Wakeup( ctx );
-	 rmesa->swtcl.RenderIndex = ~0;
-         if (R200_DEBUG & DEBUG_FALLBACKS) {
+	 rmesa->radeon.swtcl.RenderIndex = ~0;
+         if (R200_DEBUG & RADEON_FALLBACKS) {
             fprintf(stderr, "R200 begin rasterization fallback: 0x%x %s\n",
                     bit, getFallbackString(bit));
          }
       }
    }
    else {
-      rmesa->Fallback &= ~bit;
+      rmesa->radeon.Fallback &= ~bit;
       if (oldfallback == bit) {
 
 	 _swrast_flush( ctx );
@@ -731,18 +709,18 @@ void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 
 	 tnl->Driver.Render.ResetLineStipple = r200ResetLineStipple;
 	 TCL_FALLBACK( ctx, R200_TCL_FALLBACK_RASTER, GL_FALSE );
-	 if (rmesa->TclFallback) {
-	    /* These are already done if rmesa->TclFallback goes to
+	 if (rmesa->radeon.TclFallback) {
+	    /* These are already done if rmesa->radeon.TclFallback goes to
 	     * zero above. But not if it doesn't (R200_NO_TCL for
 	     * example?)
 	     */
 	    _tnl_invalidate_vertex_state( ctx, ~0 );
 	    _tnl_invalidate_vertices( ctx, ~0 );
-	    RENDERINPUTS_ZERO( rmesa->tnl_index_bitset );
+	    RENDERINPUTS_ZERO( rmesa->radeon.tnl_index_bitset );
 	    r200ChooseVertexState( ctx );
 	    r200ChooseRenderState( ctx );
 	 }
-         if (R200_DEBUG & DEBUG_FALLBACKS) {
+         if (R200_DEBUG & RADEON_FALLBACKS) {
             fprintf(stderr, "R200 end rasterization fallback: 0x%x %s\n",
                     bit, getFallbackString(bit));
          }
@@ -755,7 +733,7 @@ void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 
 /**
  * Cope with depth operations by drawing individual pixels as points.
- * 
+ *
  * \todo
  * The way the vertex state is set in this routine is hokey.  It seems to
  * work, but it's very hackish.  This whole routine is pretty hackish.  If
@@ -770,14 +748,14 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
 		  const GLubyte *bitmap )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   const GLfloat *rc = ctx->Current.RasterColor; 
+   const GLfloat *rc = ctx->Current.RasterColor;
    GLint row, col;
-   r200Vertex vert;
+   radeonVertex vert;
    GLuint orig_vte;
    GLuint h;
 
 
-   /* Turn off tcl.  
+   /* Turn off tcl.
     */
    TCL_FALLBACK( ctx, R200_TCL_FALLBACK_BITMAP, 1 );
 
@@ -794,7 +772,7 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
       vte |= R200_VTX_W0_FMT;
       vap &= ~R200_VAP_FORCE_W_TO_ONE;
 
-      rmesa->swtcl.vertex_size = 5;
+      rmesa->radeon.swtcl.vertex_size = 5;
 
       if ( (rmesa->hw.vtx.cmd[VTX_VTXFMT_0] != fmt_0)
 	   || (rmesa->hw.vtx.cmd[VTX_VTXFMT_1] != fmt_1) ) {
@@ -828,7 +806,7 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
 					   R200_VPORT_Z_SCALE_ENA |
 					   R200_VPORT_X_OFFSET_ENA |
 					   R200_VPORT_Y_OFFSET_ENA |
-					   R200_VPORT_Z_OFFSET_ENA); 
+					   R200_VPORT_Z_OFFSET_ENA);
 
    /* Turn off other stuff:  Stipple?, texture?, blending?, etc.
     */
@@ -871,16 +849,16 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
 
    /* Update window height
     */
-   LOCK_HARDWARE( rmesa );
-   UNLOCK_HARDWARE( rmesa );
-   h = rmesa->dri.drawable->h + rmesa->dri.drawable->y;
-   px += rmesa->dri.drawable->x;
+   LOCK_HARDWARE( &rmesa->radeon );
+   UNLOCK_HARDWARE( &rmesa->radeon );
+   h = radeon_get_drawable(&rmesa->radeon)->h + radeon_get_drawable(&rmesa->radeon)->y;
+   px += radeon_get_drawable(&rmesa->radeon)->x;
 
    /* Clipping handled by existing mechansims in r200_ioctl.c?
     */
    for (row=0; row<height; row++) {
-      const GLubyte *src = (const GLubyte *) 
-	 _mesa_image_address2d(unpack, bitmap, width, height, 
+      const GLubyte *src = (const GLubyte *)
+	 _mesa_image_address2d(unpack, bitmap, width, height,
                                GL_COLOR_INDEX, GL_BITMAP, row, 0 );
 
       if (unpack->LsbFirst) {
@@ -929,7 +907,7 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
 
    /* Need to restore vertexformat?
     */
-   if (rmesa->TclFallback)
+   if (rmesa->radeon.TclFallback)
       r200ChooseVertexState( ctx );
 }
 
@@ -949,6 +927,7 @@ void r200InitSwtcl( GLcontext *ctx )
       init_rast_tab();
       firsttime = 0;
    }
+   rmesa->radeon.swtcl.emit_prediction = 0;
 
    tnl->Driver.Render.Start = r200RenderStart;
    tnl->Driver.Render.Finish = r200RenderFinish;
@@ -959,20 +938,12 @@ void r200InitSwtcl( GLcontext *ctx )
    tnl->Driver.Render.Interp = _tnl_interp;
 
    /* FIXME: what are these numbers? */
-   _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12, 
+   _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12,
 		       36 * sizeof(GLfloat) );
-   
-   rmesa->swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
-   rmesa->swtcl.RenderIndex = ~0;
-   rmesa->swtcl.render_primitive = GL_TRIANGLES;
-   rmesa->swtcl.hw_primitive = 0;
-}
 
-
-void r200DestroySwtcl( GLcontext *ctx )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if (rmesa->swtcl.indexed_verts.buf) 
-      r200ReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, __FUNCTION__ );
+   rmesa->radeon.swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
+   rmesa->radeon.swtcl.RenderIndex = ~0;
+   rmesa->radeon.swtcl.render_primitive = GL_TRIANGLES;
+   rmesa->radeon.swtcl.hw_primitive = 0;
 }
+
diff --git a/src/mesa/drivers/dri/r200/r200_swtcl.h b/src/mesa/drivers/dri/r200/r200_swtcl.h
index 8c29fd0c99..b0905879d7 100644
--- a/src/mesa/drivers/dri/r200/r200_swtcl.h
+++ b/src/mesa/drivers/dri/r200/r200_swtcl.h
@@ -39,7 +39,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_context.h"
 
 extern void r200InitSwtcl( GLcontext *ctx );
-extern void r200DestroySwtcl( GLcontext *ctx );
 
 extern void r200ChooseRenderState( GLcontext *ctx );
 extern void r200ChooseVertexState( GLcontext *ctx );
@@ -52,15 +51,11 @@ extern void r200BuildVertices( GLcontext *ctx, GLuint start, GLuint count,
 extern void r200PrintSetupFlags(char *msg, GLuint flags );
 
 
-extern void r200_emit_indexed_verts( GLcontext *ctx,
-				       GLuint start,
-				       GLuint count );
-
 extern void r200_translate_vertex( GLcontext *ctx, 
-				     const r200Vertex *src, 
+				     const radeonVertex *src, 
 				     SWvertex *dst );
 
-extern void r200_print_vertex( GLcontext *ctx, const r200Vertex *v );
+extern void r200_print_vertex( GLcontext *ctx, const radeonVertex *v );
 
 extern void r200_import_float_colors( GLcontext *ctx );
 extern void r200_import_float_spec_colors( GLcontext *ctx );
@@ -70,5 +65,5 @@ extern void r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
 			      const struct gl_pixelstore_attrib *unpack,
 			      const GLubyte *bitmap );
 
-
+void r200_swtcl_flush(GLcontext *ctx, uint32_t current_offset);
 #endif
diff --git a/src/mesa/drivers/dri/r200/r200_tcl.c b/src/mesa/drivers/dri/r200/r200_tcl.c
index 99aecfe1e9..c702910ef2 100644
--- a/src/mesa/drivers/dri/r200/r200_tcl.c
+++ b/src/mesa/drivers/dri/r200/r200_tcl.c
@@ -51,6 +51,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_swtcl.h"
 #include "r200_maos.h"
 
+#include "radeon_common_context.h"
+
 
 
 #define HAVE_POINTS      1
@@ -109,7 +111,7 @@ static GLboolean discrete_prim[0x10] = {
 #define ELT_INIT(prim, hw_prim) \
    r200TclPrimitive( ctx, prim, hw_prim | R200_VF_PRIM_WALK_IND )
 
-#define GET_MESA_ELTS() rmesa->tcl.Elts
+#define GET_MESA_ELTS() TNL_CONTEXT(ctx)->vb.Elts
 
 
 /* Don't really know how many elts will fit in what's left of cmdbuf,
@@ -123,7 +125,7 @@ static GLboolean discrete_prim[0x10] = {
 
 #define RESET_STIPPLE() do {			\
    R200_STATECHANGE( rmesa, lin );		\
-   r200EmitState( rmesa );			\
+   radeonEmitState(&rmesa->radeon);			\
 } while (0)
 
 #define AUTO_STIPPLE( mode )  do {		\
@@ -134,7 +136,7 @@ static GLboolean discrete_prim[0x10] = {
    else						\
       rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] &=	\
 	 ~R200_LINE_PATTERN_AUTO_RESET;	\
-   r200EmitState( rmesa );			\
+   radeonEmitState(&rmesa->radeon);			\
 } while (0)
 
 
@@ -142,27 +144,24 @@ static GLboolean discrete_prim[0x10] = {
 
 static GLushort *r200AllocElts( r200ContextPtr rmesa, GLuint nr ) 
 {
-   if (rmesa->dma.flush == r200FlushElts &&
-       rmesa->store.cmd_used + nr*2 < R200_CMD_BUF_SZ) {
+   if (rmesa->radeon.dma.flush == r200FlushElts &&
+       rmesa->tcl.elt_used + nr*2 < R200_ELT_BUF_SZ) {
 
-      GLushort *dest = (GLushort *)(rmesa->store.cmd_buf +
-				    rmesa->store.cmd_used);
+      GLushort *dest = (GLushort *)(rmesa->radeon.tcl.elt_dma_bo->ptr +
+				    rmesa->radeon.tcl.elt_dma_offset + rmesa->tcl.elt_used);
 
-      rmesa->store.cmd_used += nr*2;
+      rmesa->tcl.elt_used += nr*2;
 
       return dest;
    }
    else {
-      if (rmesa->dma.flush)
-	 rmesa->dma.flush( rmesa );
-
-      r200EnsureCmdBufSpace( rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
-			     rmesa->hw.max_state_size + ELTS_BUFSZ(nr) );
+      if (rmesa->radeon.dma.flush)
+	 rmesa->radeon.dma.flush( rmesa->radeon.glCtx );
 
       r200EmitAOS( rmesa,
-		   rmesa->tcl.aos_components,
-		   rmesa->tcl.nr_aos_components, 0 );
+		   rmesa->radeon.tcl.aos_count, 0 );
 
+      r200EmitMaxVtxIndex(rmesa, rmesa->radeon.tcl.aos[0].count);
       return r200AllocEltsOpenEnded( rmesa, rmesa->tcl.hw_primitive, nr );
    }
 }
@@ -188,13 +187,11 @@ static void r200EmitPrim( GLcontext *ctx,
    r200ContextPtr rmesa = R200_CONTEXT( ctx );
    r200TclPrimitive( ctx, prim, hwprim );
    
-   r200EnsureCmdBufSpace( rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
-			  rmesa->hw.max_state_size + VBUF_BUFSZ );
+   //   fprintf(stderr,"Emit prim %d\n", rmesa->radeon.tcl.aos_count);
 
    r200EmitAOS( rmesa,
-		  rmesa->tcl.aos_components,
-		  rmesa->tcl.nr_aos_components,
-		  start );
+		rmesa->radeon.tcl.aos_count,
+		start );
    
    /* Why couldn't this packet have taken an offset param?
     */
@@ -207,6 +204,7 @@ static void r200EmitPrim( GLcontext *ctx,
    r200EmitPrim( ctx, prim, hwprim, start, count );             \
    (void) rmesa; } while (0)
 
+#define MAX_CONVERSION_SIZE 40
 /* Try & join small primitives
  */
 #if 0
@@ -369,6 +367,66 @@ r200ComputeFogBlendFactor( GLcontext *ctx, GLfloat fogcoord )
    }
 }
 
+/**
+ * Predict the total emit size of the next rendering operation and ask for that much command buffer space up front, so that there is no flush in the middle of rendering.
+ * The prediction should be as tight as possible while never being smaller than the worst case.  \param vimap_rev map whose first 15 entries are scanned; entries other than 255 count as one aos each.  \return predicted primitive space plus state emit size.
+ */
+static GLuint r200EnsureEmitSize( GLcontext * ctx , GLubyte* vimap_rev )
+{
+  r200ContextPtr rmesa = R200_CONTEXT(ctx);
+  TNLcontext *tnl = TNL_CONTEXT(ctx);
+  struct vertex_buffer *VB = &tnl->vb;
+  GLuint space_required;
+  GLuint state_size;
+  GLuint nr_aos = 0;
+  int i;
+  /* predict number of aos to emit: every vimap_rev slot that is not 255 */
+  for (i = 0; i < 15; ++i)
+  {
+    if (vimap_rev[i] != 255)
+    {
+      ++nr_aos;
+    }
+  }
+
+  {
+    /* start the prediction from the current state emit size */
+    space_required = 0;
+    state_size = radeonCountStateEmitSize( &rmesa->radeon );
+    /* vtx may be changed in r200EmitArrays so account for it if not dirty */
+    if (!rmesa->hw.vtx.dirty)
+      state_size += rmesa->hw.vtx.check(rmesa->radeon.glCtx, &rmesa->hw.vtx);
+    /* predict size for each primitive with a non-zero count */
+    for (i = 0; i < VB->PrimitiveCount; ++i)
+    {
+      if (!VB->Primitive[i].count)
+	continue;
+      /* If primitive.count is less than MAX_CONVERSION_SIZE the
+         rendering code may decide to convert to elts.
+	 In that case we have to make a pessimistic prediction
+	 and use the larger of the 2 paths. */
+      const GLuint elts = ELTS_BUFSZ(nr_aos);
+      const GLuint index = INDEX_BUFSZ;
+      const GLuint vbuf = VBUF_BUFSZ;
+      if ( (!VB->Elts && VB->Primitive[i].count >= MAX_CONVERSION_SIZE)
+	  || vbuf > index + elts)
+	space_required += vbuf;
+      else
+	space_required += index + elts;
+      space_required += AOS_BUFSZ(nr_aos);
+    }
+  }
+
+  radeon_print(RADEON_RENDER,RADEON_VERBOSE,
+      "%s space %u, aos %d\n",
+      __func__, space_required, AOS_BUFSZ(nr_aos) );
+  /* Flush the buffer in case we need more than is left.  On a non-zero return the state size is counted again (NOTE(review): presumably because a flush changes which state must be re-emitted -- confirm). */
+  if (rcommonEnsureCmdBufSpace(&rmesa->radeon, space_required + state_size, __FUNCTION__))
+    return space_required + radeonCountStateEmitSize( &rmesa->radeon );
+  else
+    return space_required + state_size;
+}
+
 
 /**********************************************************************/
 /*                          Render pipeline stage                     */
@@ -394,19 +452,19 @@ static GLboolean r200_run_tcl_render( GLcontext *ctx,
 
    /* TODO: separate this from the swtnl pipeline 
     */
-   if (rmesa->TclFallback)
+   if (rmesa->radeon.TclFallback)
       return GL_TRUE;	/* fallback to software t&l */
 
-   if (R200_DEBUG & DEBUG_PRIMS)
-      fprintf(stderr, "%s\n", __FUNCTION__);
+   radeon_print(RADEON_RENDER, RADEON_NORMAL, "%s\n", __FUNCTION__);
 
    if (VB->Count == 0)
       return GL_FALSE;
 
    /* Validate state:
     */
-   if (rmesa->NewGLState)
-      r200ValidateState( ctx );
+   if (rmesa->radeon.NewGLState)
+      if (!r200ValidateState( ctx ))
+         return GL_TRUE; /* fallback to sw t&l */
 
    if (!ctx->VertexProgram._Enabled) {
    /* NOTE: inputs != tnl->render_inputs - these are the untransformed
@@ -481,11 +539,11 @@ static GLboolean r200_run_tcl_render( GLcontext *ctx,
 
    /* Do the actual work:
     */
-   r200ReleaseArrays( ctx, ~0 /* stage->changed_inputs */ );
+   radeonReleaseArrays( ctx, ~0 /* stage->changed_inputs */ );
+   GLuint emit_end = r200EnsureEmitSize( ctx, vimap_rev )
+     + rmesa->radeon.cmdbuf.cs->cdw;
    r200EmitArrays( ctx, vimap_rev );
 
-   rmesa->tcl.Elts = VB->Elts;
-
    for (i = 0 ; i < VB->PrimitiveCount ; i++)
    {
       GLuint prim = _tnl_translate_prim(&VB->Primitive[i]);
@@ -495,11 +553,14 @@ static GLboolean r200_run_tcl_render( GLcontext *ctx,
       if (!length)
 	 continue;
 
-      if (rmesa->tcl.Elts)
+      if (VB->Elts)
 	 r200EmitEltPrimitive( ctx, start, start+length, prim );
       else
 	 r200EmitPrimitive( ctx, start, start+length, prim );
    }
+   if ( emit_end < rmesa->radeon.cmdbuf.cs->cdw )
+     WARN_ONCE("Rendering was %d commands larger than predicted size."
+	 " We might overflow  command buffer.\n", rmesa->radeon.cmdbuf.cs->cdw - emit_end);
 
    return GL_FALSE;		/* finished the pipe */
 }
@@ -545,7 +606,7 @@ static void transition_to_swtnl( GLcontext *ctx )
    tnl->Driver.NotifyMaterialChange = 
       _mesa_validate_all_lighting_tables;
 
-   r200ReleaseArrays( ctx, ~0 );
+   radeonReleaseArrays( ctx, ~0 );
 
    /* Still using the D3D based hardware-rasterizer from the radeon;
     * need to put the card into D3D mode to make it work:
@@ -565,15 +626,11 @@ static void transition_to_hwtnl( GLcontext *ctx )
 
    tnl->Driver.NotifyMaterialChange = r200UpdateMaterial;
 
-   if ( rmesa->dma.flush )			
-      rmesa->dma.flush( rmesa );	
+   if ( rmesa->radeon.dma.flush )			
+      rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	
 
-   rmesa->dma.flush = NULL;
+   rmesa->radeon.dma.flush = NULL;
    
-   if (rmesa->swtcl.indexed_verts.buf) 
-      r200ReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, 
-			      __FUNCTION__ );
-
    R200_STATECHANGE( rmesa, vap );
    rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_TCL_ENABLE;
    rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] &= ~R200_VAP_FORCE_W_TO_ONE;
@@ -594,7 +651,7 @@ static void transition_to_hwtnl( GLcontext *ctx )
    rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL] &= ~(R200_VTX_XY_FMT|R200_VTX_Z_FMT);
    rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL] |= R200_VTX_W0_FMT;
 
-   if (R200_DEBUG & DEBUG_FALLBACKS) 
+   if (R200_DEBUG & RADEON_FALLBACKS)
       fprintf(stderr, "R200 end tcl fallback\n");
 }
 
@@ -631,21 +688,21 @@ static char *getFallbackString(GLuint bit)
 void r200TclFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   GLuint oldfallback = rmesa->TclFallback;
+   GLuint oldfallback = rmesa->radeon.TclFallback;
 
    if (mode) {
-      rmesa->TclFallback |= bit;
+      rmesa->radeon.TclFallback |= bit;
       if (oldfallback == 0) {
-	 if (R200_DEBUG & DEBUG_FALLBACKS) 
+	 if (R200_DEBUG & RADEON_FALLBACKS)
 	    fprintf(stderr, "R200 begin tcl fallback %s\n",
 		    getFallbackString( bit ));
 	 transition_to_swtnl( ctx );
       }
    }
    else {
-      rmesa->TclFallback &= ~bit;
+      rmesa->radeon.TclFallback &= ~bit;
       if (oldfallback == bit) {
-	 if (R200_DEBUG & DEBUG_FALLBACKS) 
+	 if (R200_DEBUG & RADEON_FALLBACKS)
 	    fprintf(stderr, "R200 end tcl fallback %s\n",
 		    getFallbackString( bit ));
 	 transition_to_hwtnl( ctx );
diff --git a/src/mesa/drivers/dri/r200/r200_tex.c b/src/mesa/drivers/dri/r200/r200_tex.c
index 5a4db33f44..36d9e37d87 100644
--- a/src/mesa/drivers/dri/r200/r200_tex.c
+++ b/src/mesa/drivers/dri/r200/r200_tex.c
@@ -43,8 +43,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/teximage.h"
 #include "main/texobj.h"
 
-#include "texmem.h"
-
+#include "radeon_mipmap_tree.h"
 #include "r200_context.h"
 #include "r200_state.h"
 #include "r200_ioctl.h"
@@ -63,10 +62,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * \param twrap Wrap mode for the \a t texture coordinate
  */
 
-static void r200SetTexWrap( r200TexObjPtr t, GLenum swrap, GLenum twrap, GLenum rwrap )
+static void r200SetTexWrap( radeonTexObjPtr t, GLenum swrap, GLenum twrap, GLenum rwrap )
 {
    GLboolean  is_clamp = GL_FALSE;
    GLboolean  is_clamp_to_border = GL_FALSE;
+   struct gl_texture_object *tObj = &t->base;
 
    t->pp_txfilter &= ~(R200_CLAMP_S_MASK | R200_CLAMP_T_MASK | R200_BORDER_MODE_D3D);
 
@@ -103,7 +103,7 @@ static void r200SetTexWrap( r200TexObjPtr t, GLenum swrap, GLenum twrap, GLenum
       _mesa_problem(NULL, "bad S wrap mode in %s", __FUNCTION__);
    }
 
-   if (t->base.tObj->Target != GL_TEXTURE_1D) {
+   if (tObj->Target != GL_TEXTURE_1D) {
       switch ( twrap ) {
       case GL_REPEAT:
          t->pp_txfilter |= R200_CLAMP_T_WRAP;
@@ -180,7 +180,7 @@ static void r200SetTexWrap( r200TexObjPtr t, GLenum swrap, GLenum twrap, GLenum
    t->border_fallback = (is_clamp && is_clamp_to_border);
 }
 
-static void r200SetTexMaxAnisotropy( r200TexObjPtr t, GLfloat max )
+static void r200SetTexMaxAnisotropy( radeonTexObjPtr t, GLfloat max )
 {
    t->pp_txfilter &= ~R200_MAX_ANISO_MASK;
 
@@ -205,10 +205,13 @@ static void r200SetTexMaxAnisotropy( r200TexObjPtr t, GLfloat max )
  * \param magf Texture magnification mode
  */
 
-static void r200SetTexFilter( r200TexObjPtr t, GLenum minf, GLenum magf )
+static void r200SetTexFilter( radeonTexObjPtr t, GLenum minf, GLenum magf )
 {
    GLuint anisotropy = (t->pp_txfilter & R200_MAX_ANISO_MASK);
 
+   /* Force revalidation to account for switches from/to mipmapping. */
+   t->validated = GL_FALSE;
+
    t->pp_txfilter &= ~(R200_MIN_FILTER_MASK | R200_MAG_FILTER_MASK);
    t->pp_txformat_x &= ~R200_VOLUME_FILTER_MASK;
 
@@ -267,696 +270,16 @@ static void r200SetTexFilter( r200TexObjPtr t, GLenum minf, GLenum magf )
    }
 }
 
-static void r200SetTexBorderColor( r200TexObjPtr t, GLubyte c[4] )
-{
-   t->pp_border_color = r200PackColor( 4, c[0], c[1], c[2], c[3] );
-}
-
-
-/**
- * Allocate space for and load the mesa images into the texture memory block.
- * This will happen before drawing with a new texture, or drawing with a
- * texture after it was swapped out or teximaged again.
- */
-
-static r200TexObjPtr r200AllocTexObj( struct gl_texture_object *texObj )
-{
-   r200TexObjPtr t;
-
-   t = CALLOC_STRUCT( r200_tex_obj );
-   texObj->DriverData = t;
-   if ( t != NULL ) {
-      if ( R200_DEBUG & DEBUG_TEXTURE ) {
-	 fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, (void *)texObj, 
-		  (void *)t );
-      }
-
-      /* Initialize non-image-dependent parts of the state:
-       */
-      t->base.tObj = texObj;
-      t->border_fallback = GL_FALSE;
-
-      make_empty_list( & t->base );
-
-      r200SetTexWrap( t, texObj->WrapS, texObj->WrapT, texObj->WrapR );
-      r200SetTexMaxAnisotropy( t, texObj->MaxAnisotropy );
-      r200SetTexFilter( t, texObj->MinFilter, texObj->MagFilter );
-      r200SetTexBorderColor( t, texObj->_BorderChan );
-   }
-
-   return t;
-}
-
-/* try to find a format which will only need a memcopy */
-static const struct gl_texture_format *
-r200Choose8888TexFormat( GLenum srcFormat, GLenum srcType )
-{
-   const GLuint ui = 1;
-   const GLubyte littleEndian = *((const GLubyte *) &ui);
-
-   if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
-       (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && !littleEndian) ||
-       (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
-       (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && littleEndian)) {
-      return &_mesa_texformat_rgba8888;
-   }
-   else if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
-       (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && littleEndian) ||
-       (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
-       (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && !littleEndian)) {
-      return &_mesa_texformat_rgba8888_rev;
-   }
-   else return _dri_texformat_argb8888;
-}
-
-static const struct gl_texture_format *
-r200ChooseTextureFormat( GLcontext *ctx, GLint internalFormat,
-                           GLenum format, GLenum type )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   const GLboolean do32bpt =
-       ( rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_32 );
-   const GLboolean force16bpt =
-       ( rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FORCE_16 );
-   (void) format;
-
-   switch ( internalFormat ) {
-   case 4:
-   case GL_RGBA:
-   case GL_COMPRESSED_RGBA:
-      switch ( type ) {
-      case GL_UNSIGNED_INT_10_10_10_2:
-      case GL_UNSIGNED_INT_2_10_10_10_REV:
-	 return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_argb1555;
-      case GL_UNSIGNED_SHORT_4_4_4_4:
-      case GL_UNSIGNED_SHORT_4_4_4_4_REV:
-	 return _dri_texformat_argb4444;
-      case GL_UNSIGNED_SHORT_5_5_5_1:
-      case GL_UNSIGNED_SHORT_1_5_5_5_REV:
-	 return _dri_texformat_argb1555;
-      default:
-         return do32bpt ?
-	    r200Choose8888TexFormat(format, type) : _dri_texformat_argb4444;
-      }
-
-   case 3:
-   case GL_RGB:
-   case GL_COMPRESSED_RGB:
-      switch ( type ) {
-      case GL_UNSIGNED_SHORT_4_4_4_4:
-      case GL_UNSIGNED_SHORT_4_4_4_4_REV:
-	 return _dri_texformat_argb4444;
-      case GL_UNSIGNED_SHORT_5_5_5_1:
-      case GL_UNSIGNED_SHORT_1_5_5_5_REV:
-	 return _dri_texformat_argb1555;
-      case GL_UNSIGNED_SHORT_5_6_5:
-      case GL_UNSIGNED_SHORT_5_6_5_REV:
-	 return _dri_texformat_rgb565;
-      default:
-         return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_rgb565;
-      }
-
-   case GL_RGBA8:
-   case GL_RGB10_A2:
-   case GL_RGBA12:
-   case GL_RGBA16:
-      return !force16bpt ?
-	  r200Choose8888TexFormat(format, type) : _dri_texformat_argb4444;
-
-   case GL_RGBA4:
-   case GL_RGBA2:
-      return _dri_texformat_argb4444;
-
-   case GL_RGB5_A1:
-      return _dri_texformat_argb1555;
-
-   case GL_RGB8:
-   case GL_RGB10:
-   case GL_RGB12:
-   case GL_RGB16:
-      return !force16bpt ? _dri_texformat_argb8888 : _dri_texformat_rgb565;
-
-   case GL_RGB5:
-   case GL_RGB4:
-   case GL_R3_G3_B2:
-      return _dri_texformat_rgb565;
-
-   case GL_ALPHA:
-   case GL_ALPHA4:
-   case GL_ALPHA8:
-   case GL_ALPHA12:
-   case GL_ALPHA16:
-   case GL_COMPRESSED_ALPHA:
-   /* can't use a8 format since interpreting hw I8 as a8 would result
-      in wrong rgb values (same as alpha value instead of 0). */
-      return _dri_texformat_al88;
-
-   case 1:
-   case GL_LUMINANCE:
-   case GL_LUMINANCE4:
-   case GL_LUMINANCE8:
-   case GL_LUMINANCE12:
-   case GL_LUMINANCE16:
-   case GL_COMPRESSED_LUMINANCE:
-      return _dri_texformat_l8;
-
-   case 2:
-   case GL_LUMINANCE_ALPHA:
-   case GL_LUMINANCE4_ALPHA4:
-   case GL_LUMINANCE6_ALPHA2:
-   case GL_LUMINANCE8_ALPHA8:
-   case GL_LUMINANCE12_ALPHA4:
-   case GL_LUMINANCE12_ALPHA12:
-   case GL_LUMINANCE16_ALPHA16:
-   case GL_COMPRESSED_LUMINANCE_ALPHA:
-      return _dri_texformat_al88;
-
-   case GL_INTENSITY:
-   case GL_INTENSITY4:
-   case GL_INTENSITY8:
-   case GL_INTENSITY12:
-   case GL_INTENSITY16:
-   case GL_COMPRESSED_INTENSITY:
-       return _dri_texformat_i8;
-
-   case GL_YCBCR_MESA:
-      if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
-          type == GL_UNSIGNED_BYTE)
-         return &_mesa_texformat_ycbcr;
-      else
-         return &_mesa_texformat_ycbcr_rev;
-
-   case GL_RGB_S3TC:
-   case GL_RGB4_S3TC:
-   case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
-      return &_mesa_texformat_rgb_dxt1;
-
-   case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
-      return &_mesa_texformat_rgba_dxt1;
-
-   case GL_RGBA_S3TC:
-   case GL_RGBA4_S3TC:
-   case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
-      return &_mesa_texformat_rgba_dxt3;
-
-   case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
-      return &_mesa_texformat_rgba_dxt5;
-
-   default:
-      _mesa_problem(ctx,
-         "unexpected internalFormat 0x%x in r200ChooseTextureFormat",
-         (int) internalFormat);
-      return NULL;
-   }
-
-   return NULL; /* never get here */
-}
-
-
-static GLboolean
-r200ValidateClientStorage( GLcontext *ctx, GLenum target,
-			   GLint internalFormat,
-			   GLint srcWidth, GLint srcHeight, 
-                           GLenum format, GLenum type,  const void *pixels,
-			   const struct gl_pixelstore_attrib *packing,
-			   struct gl_texture_object *texObj,
-			   struct gl_texture_image *texImage)
-
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if ( R200_DEBUG & DEBUG_TEXTURE )
-      fprintf(stderr, "intformat %s format %s type %s\n",
-	      _mesa_lookup_enum_by_nr( internalFormat ),
-	      _mesa_lookup_enum_by_nr( format ),
-	      _mesa_lookup_enum_by_nr( type ));
-
-   if (!ctx->Unpack.ClientStorage)
-      return 0;
-
-   if (ctx->_ImageTransferState ||
-       texImage->IsCompressed ||
-       texObj->GenerateMipmap)
-      return 0;
-
-
-   /* This list is incomplete, may be different on ppc???
-    */
-   switch ( internalFormat ) {
-   case GL_RGBA:
-      if ( format == GL_BGRA && type == GL_UNSIGNED_INT_8_8_8_8_REV ) {
-	 texImage->TexFormat = _dri_texformat_argb8888;
-      }
-      else
-	 return 0;
-      break;
-
-   case GL_RGB:
-      if ( format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5 ) {
-	 texImage->TexFormat = _dri_texformat_rgb565;
-      }
-      else
-	 return 0;
-      break;
-
-   case GL_YCBCR_MESA:
-      if ( format == GL_YCBCR_MESA && 
-	   type == GL_UNSIGNED_SHORT_8_8_REV_APPLE ) {
-	 texImage->TexFormat = &_mesa_texformat_ycbcr_rev;
-      }
-      else if ( format == GL_YCBCR_MESA && 
-		(type == GL_UNSIGNED_SHORT_8_8_APPLE || 
-		 type == GL_UNSIGNED_BYTE)) {
-	 texImage->TexFormat = &_mesa_texformat_ycbcr;
-      }
-      else
-	 return 0;
-      break;
-
-   default:
-      return 0;
-   }
-
-   /* Could deal with these packing issues, but currently don't:
-    */
-   if (packing->SkipPixels || 
-       packing->SkipRows || 
-       packing->SwapBytes ||
-       packing->LsbFirst) {
-      return 0;
-   }
-
-   {      
-      GLint srcRowStride = _mesa_image_row_stride(packing, srcWidth,
-						  format, type);
-
-      
-      if ( R200_DEBUG & DEBUG_TEXTURE )
-	 fprintf(stderr, "%s: srcRowStride %d/%x\n", 
-		 __FUNCTION__, srcRowStride, srcRowStride);
-
-      /* Could check this later in upload, pitch restrictions could be
-       * relaxed, but would need to store the image pitch somewhere,
-       * as packing details might change before image is uploaded:
-       */
-      if (!r200IsGartMemory( rmesa, pixels, srcHeight * srcRowStride ) ||
-	  (srcRowStride & 63))
-	 return 0;
-
-
-      /* Have validated that _mesa_transfer_teximage would be a straight
-       * memcpy at this point.  NOTE: future calls to TexSubImage will
-       * overwrite the client data.  This is explicitly mentioned in the
-       * extension spec.
-       */
-      texImage->Data = (void *)pixels;
-      texImage->IsClientData = GL_TRUE;
-      texImage->RowStride = srcRowStride / texImage->TexFormat->TexelBytes;
-
-      return 1;
-   }
-}
-
-
-static void r200TexImage1D( GLcontext *ctx, GLenum target, GLint level,
-                              GLint internalFormat,
-                              GLint width, GLint border,
-                              GLenum format, GLenum type, const GLvoid *pixels,
-                              const struct gl_pixelstore_attrib *packing,
-                              struct gl_texture_object *texObj,
-                              struct gl_texture_image *texImage )
+static void r200SetTexBorderColor( radeonTexObjPtr t, const GLfloat color[4] )
 {
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) r200AllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage1D");
-         return;
-      }
-   }
-
-   /* Note, this will call ChooseTextureFormat */
-   _mesa_store_teximage1d(ctx, target, level, internalFormat,
-                          width, border, format, type, pixels,
-                          &ctx->Unpack, texObj, texImage);
-
-   t->dirty_images[0] |= (1 << level);
-}
-
-
-static void r200TexSubImage1D( GLcontext *ctx, GLenum target, GLint level,
-                                 GLint xoffset,
-                                 GLsizei width,
-                                 GLenum format, GLenum type,
-                                 const GLvoid *pixels,
-                                 const struct gl_pixelstore_attrib *packing,
-                                 struct gl_texture_object *texObj,
-                                 struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-
-   assert( t ); /* this _should_ be true */
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) r200AllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage1D");
-         return;
-      }
-   }
-
-   _mesa_store_texsubimage1d(ctx, target, level, xoffset, width,
-			     format, type, pixels, packing, texObj,
-			     texImage);
-
-   t->dirty_images[0] |= (1 << level);
+   GLubyte c[4];
+   CLAMPED_FLOAT_TO_UBYTE(c[0], color[0]);
+   CLAMPED_FLOAT_TO_UBYTE(c[1], color[1]);
+   CLAMPED_FLOAT_TO_UBYTE(c[2], color[2]);
+   CLAMPED_FLOAT_TO_UBYTE(c[3], color[3]);
+   t->pp_border_color = radeonPackColor( 4, c[0], c[1], c[2], c[3] );
 }
 
-
-static void r200TexImage2D( GLcontext *ctx, GLenum target, GLint level,
-                              GLint internalFormat,
-                              GLint width, GLint height, GLint border,
-                              GLenum format, GLenum type, const GLvoid *pixels,
-                              const struct gl_pixelstore_attrib *packing,
-                              struct gl_texture_object *texObj,
-                              struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-   GLuint face;
-
-   /* which cube face or ordinary 2D image */
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-      ASSERT(face < 6);
-      break;
-   default:
-      face = 0;
-   }
-
-   if ( t != NULL ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) r200AllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage2D");
-         return;
-      }
-   }
-
-   texImage->IsClientData = GL_FALSE;
-
-   if (r200ValidateClientStorage( ctx, target, 
-				  internalFormat, 
-				  width, height, 
-				  format, type, pixels, 
-				  packing, texObj, texImage)) {
-      if (R200_DEBUG & DEBUG_TEXTURE)
-	 fprintf(stderr, "%s: Using client storage\n", __FUNCTION__); 
-   }
-   else {
-      if (R200_DEBUG & DEBUG_TEXTURE)
-	 fprintf(stderr, "%s: Using normal storage\n", __FUNCTION__); 
-
-      /* Normal path: copy (to cached memory) and eventually upload
-       * via another copy to GART memory and then a blit...  Could
-       * eliminate one copy by going straight to (permanent) GART.
-       *
-       * Note, this will call r200ChooseTextureFormat.
-       */
-      _mesa_store_teximage2d(ctx, target, level, internalFormat,
-			     width, height, border, format, type, pixels,
-			     &ctx->Unpack, texObj, texImage);
-      
-      t->dirty_images[face] |= (1 << level);
-   }
-}
-
-
-static void r200TexSubImage2D( GLcontext *ctx, GLenum target, GLint level,
-                                 GLint xoffset, GLint yoffset,
-                                 GLsizei width, GLsizei height,
-                                 GLenum format, GLenum type,
-                                 const GLvoid *pixels,
-                                 const struct gl_pixelstore_attrib *packing,
-                                 struct gl_texture_object *texObj,
-                                 struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-   GLuint face;
-
-   /* which cube face or ordinary 2D image */
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-      ASSERT(face < 6);
-      break;
-   default:
-      face = 0;
-   }
-
-   assert( t ); /* this _should_ be true */
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) r200AllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage2D");
-         return;
-      }
-   }
-
-   _mesa_store_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
-			     height, format, type, pixels, packing, texObj,
-			     texImage);
-
-   t->dirty_images[face] |= (1 << level);
-}
-
-
-static void r200CompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
-                              GLint internalFormat,
-                              GLint width, GLint height, GLint border,
-                              GLsizei imageSize, const GLvoid *data,
-                              struct gl_texture_object *texObj,
-                              struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-   GLuint face;
-
-   /* which cube face or ordinary 2D image */
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-      ASSERT(face < 6);
-      break;
-   default:
-      face = 0;
-   }
-
-   if ( t != NULL ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) r200AllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexImage2D");
-         return;
-      }
-   }
-
-   texImage->IsClientData = GL_FALSE;
-/* can't call this, different parameters. Would never evaluate to true anyway currently
-   if (r200ValidateClientStorage( ctx, target, 
-				  internalFormat,
-				  width, height,
-				  format, type, pixels,
-				  packing, texObj, texImage)) {
-      if (R200_DEBUG & DEBUG_TEXTURE)
-	 fprintf(stderr, "%s: Using client storage\n", __FUNCTION__);
-   }
-   else */{
-      if (R200_DEBUG & DEBUG_TEXTURE)
-	 fprintf(stderr, "%s: Using normal storage\n", __FUNCTION__);
-
-      /* Normal path: copy (to cached memory) and eventually upload
-       * via another copy to GART memory and then a blit...  Could
-       * eliminate one copy by going straight to (permanent) GART.
-       *
-       * Note, this will call r200ChooseTextureFormat.
-       */
-      _mesa_store_compressed_teximage2d(ctx, target, level, internalFormat, width,
-                                 height, border, imageSize, data, texObj, texImage);
-
-      t->dirty_images[face] |= (1 << level);
-   }
-}
-
-
-static void r200CompressedTexSubImage2D( GLcontext *ctx, GLenum target, GLint level,
-                                 GLint xoffset, GLint yoffset,
-                                 GLsizei width, GLsizei height,
-                                 GLenum format,
-                                 GLsizei imageSize, const GLvoid *data,
-                                 struct gl_texture_object *texObj,
-                                 struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-   GLuint face;
-
-
-   /* which cube face or ordinary 2D image */
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-      ASSERT(face < 6);
-      break;
-   default:
-      face = 0;
-   }
-
-   assert( t ); /* this _should_ be true */
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) r200AllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexSubImage2D");
-         return;
-      }
-   }
-
-   _mesa_store_compressed_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
-                            height, format, imageSize, data, texObj, texImage);
-
-   t->dirty_images[face] |= (1 << level);
-}
-
-
-#if ENABLE_HW_3D_TEXTURE
-static void r200TexImage3D( GLcontext *ctx, GLenum target, GLint level,
-                            GLint internalFormat,
-                            GLint width, GLint height, GLint depth,
-                            GLint border,
-                            GLenum format, GLenum type, const GLvoid *pixels,
-                            const struct gl_pixelstore_attrib *packing,
-                            struct gl_texture_object *texObj,
-                            struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) r200AllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage3D");
-         return;
-      }
-   }
-
-   texImage->IsClientData = GL_FALSE;
-
-#if 0
-   if (r200ValidateClientStorage( ctx, target, 
-				  internalFormat, 
-				  width, height, 
-				  format, type, pixels, 
-				  packing, texObj, texImage)) {
-      if (R200_DEBUG & DEBUG_TEXTURE)
-	 fprintf(stderr, "%s: Using client storage\n", __FUNCTION__); 
-   }
-   else
-#endif
-   {
-      if (R200_DEBUG & DEBUG_TEXTURE)
-	 fprintf(stderr, "%s: Using normal storage\n", __FUNCTION__); 
-
-      /* Normal path: copy (to cached memory) and eventually upload
-       * via another copy to GART memory and then a blit...  Could
-       * eliminate one copy by going straight to (permanent) GART.
-       *
-       * Note, this will call r200ChooseTextureFormat.
-       */
-      _mesa_store_teximage3d(ctx, target, level, internalFormat,
-			     width, height, depth, border,
-                             format, type, pixels,
-			     &ctx->Unpack, texObj, texImage);
-      
-      t->dirty_images[0] |= (1 << level);
-   }
-}
-#endif
-
-
-#if ENABLE_HW_3D_TEXTURE
-static void
-r200TexSubImage3D( GLcontext *ctx, GLenum target, GLint level,
-                   GLint xoffset, GLint yoffset, GLint zoffset,
-                   GLsizei width, GLsizei height, GLsizei depth,
-                   GLenum format, GLenum type,
-                   const GLvoid *pixels,
-                   const struct gl_pixelstore_attrib *packing,
-                   struct gl_texture_object *texObj,
-                   struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-
-/*     fprintf(stderr, "%s\n", __FUNCTION__); */
-
-   assert( t ); /* this _should_ be true */
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) r200AllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage3D");
-         return;
-      }
-      texObj->DriverData = t;
-   }
-
-   _mesa_store_texsubimage3d(ctx, target, level, xoffset, yoffset, zoffset,
-                             width, height, depth,
-                             format, type, pixels, packing, texObj, texImage);
-
-   t->dirty_images[0] |= (1 << level);
-}
-#endif
-
-
-
 static void r200TexEnv( GLcontext *ctx, GLenum target,
 			  GLenum pname, const GLfloat *param )
 {
@@ -964,7 +287,7 @@ static void r200TexEnv( GLcontext *ctx, GLenum target,
    GLuint unit = ctx->Texture.CurrentUnit;
    struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
 
-   if ( R200_DEBUG & DEBUG_STATE ) {
+   if ( R200_DEBUG & RADEON_STATE ) {
       fprintf( stderr, "%s( %s )\n",
 	       __FUNCTION__, _mesa_lookup_enum_by_nr( pname ) );
    }
@@ -978,7 +301,7 @@ static void r200TexEnv( GLcontext *ctx, GLenum target,
       GLubyte c[4];
       GLuint envColor;
       UNCLAMPED_FLOAT_TO_RGBA_CHAN( c, texUnit->EnvColor );
-      envColor = r200PackColor( 4, c[0], c[1], c[2], c[3] );
+      envColor = radeonPackColor( 4, c[0], c[1], c[2], c[3] );
       if ( rmesa->hw.tf.cmd[TF_TFACTOR_0 + unit] != envColor ) {
 	 R200_STATECHANGE( rmesa, tf );
 	 rmesa->hw.tf.cmd[TF_TFACTOR_0 + unit] = envColor;
@@ -997,7 +320,7 @@ static void r200TexEnv( GLcontext *ctx, GLenum target,
        * NOTE: Add a small bias to the bias for conform mipsel.c test.
        */
       bias = *param + .01;
-      min = driQueryOptionb (&rmesa->optionCache, "no_neg_lod_bias") ?
+      min = driQueryOptionb (&rmesa->radeon.optionCache, "no_neg_lod_bias") ?
 	  0.0 : -16.0;
       bias = CLAMP( bias, min, 16.0 );
       b = (int)(bias * fixed_one) & R200_LOD_BIAS_MASK;
@@ -1034,9 +357,9 @@ static void r200TexParameter( GLcontext *ctx, GLenum target,
 				struct gl_texture_object *texObj,
 				GLenum pname, const GLfloat *params )
 {
-   r200TexObjPtr t = (r200TexObjPtr) texObj->DriverData;
+   radeonTexObj* t = radeon_tex_obj(texObj);
 
-   if ( R200_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
+   if ( R200_DEBUG & (RADEON_STATE|RADEON_TEXTURE) ) {
       fprintf( stderr, "%s( %s )\n", __FUNCTION__,
 	       _mesa_lookup_enum_by_nr( pname ) );
    }
@@ -1056,7 +379,7 @@ static void r200TexParameter( GLcontext *ctx, GLenum target,
       break;
 
    case GL_TEXTURE_BORDER_COLOR:
-      r200SetTexBorderColor( t, texObj->_BorderChan );
+      r200SetTexBorderColor( t, texObj->BorderColor );
       break;
 
    case GL_TEXTURE_BASE_LEVEL:
@@ -1068,59 +391,46 @@ static void r200TexParameter( GLcontext *ctx, GLenum target,
        * we just have to rely on loading the right subset of mipmap levels
        * to simulate a clamped LOD.
        */
-      driSwapOutTextureObject( (driTextureObject *) t );
+      if (t->mt) {
+         radeon_miptree_unreference(t->mt);
+	 t->mt = 0;
+	 t->validated = GL_FALSE;
+      }
       break;
 
    default:
       return;
    }
-
-   /* Mark this texobj as dirty (one bit per tex unit)
-    */
-   t->dirty_state = TEX_ALL;
 }
 
 
-
-static void r200BindTexture( GLcontext *ctx, GLenum target,
-			       struct gl_texture_object *texObj )
-{
-   if ( R200_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
-      fprintf( stderr, "%s( %p ) unit=%d\n", __FUNCTION__, (void *)texObj,
-	       ctx->Texture.CurrentUnit );
-   }
-
-   if ( (target == GL_TEXTURE_1D)
-	|| (target == GL_TEXTURE_2D) 
-#if ENABLE_HW_3D_TEXTURE
-	|| (target == GL_TEXTURE_3D)
-#endif
-	|| (target == GL_TEXTURE_CUBE_MAP)
-	|| (target == GL_TEXTURE_RECTANGLE_NV) ) {
-      assert( texObj->DriverData != NULL );
-   }
-}
-
-
-static void r200DeleteTexture( GLcontext *ctx,
-				 struct gl_texture_object *texObj )
+static void r200DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-
-   if ( R200_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
-      fprintf( stderr, "%s( %p (target = %s) )\n", __FUNCTION__, (void *)texObj,
-	       _mesa_lookup_enum_by_nr( texObj->Target ) );
+   radeonTexObj* t = radeon_tex_obj(texObj);
+
+   if (RADEON_DEBUG & (RADEON_STATE | RADEON_TEXTURE)) {
+      fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
+	      (void *)texObj,
+	      _mesa_lookup_enum_by_nr(texObj->Target));
+   }
+   
+   if (rmesa) {
+      int i;
+      radeon_firevertices(&rmesa->radeon);
+      for ( i = 0 ; i < rmesa->radeon.glCtx->Const.MaxTextureUnits ; i++ ) {
+	 if ( t == rmesa->state.texture.unit[i].texobj ) {
+	    rmesa->state.texture.unit[i].texobj = NULL;
+	    rmesa->hw.tex[i].dirty = GL_FALSE;
+	    rmesa->hw.cube[i].dirty = GL_FALSE;
+	 }
+      }      
    }
-
-   if ( t != NULL ) {
-      if ( rmesa ) {
-         R200_FIREVERTICES( rmesa );
-      }
-
-      driDestroyTextureObject( t );
+   
+   if (t->mt) {
+      radeon_miptree_unreference(t->mt);
+      t->mt = 0;
    }
-   /* Free mipmap images and the texture object itself */
    _mesa_delete_texture_object(ctx, texObj);
 }
 
@@ -1150,46 +460,59 @@ static void r200TexGen( GLcontext *ctx,
  * Called via ctx->Driver.NewTextureObject.
  * Note: this function will be called during context creation to
  * allocate the default texture objects.
- * Note: we could use containment here to 'derive' the driver-specific
- * texture object from the core mesa gl_texture_object.  Not done at this time.
  * Fixup MaxAnisotropy according to user preference.
  */
-static struct gl_texture_object *
-r200NewTextureObject( GLcontext *ctx, GLuint name, GLenum target )
+static struct gl_texture_object *r200NewTextureObject(GLcontext * ctx,
+						      GLuint name,
+						      GLenum target)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   struct gl_texture_object *obj;
-   obj = _mesa_new_texture_object(ctx, name, target);
-   if (!obj)
-      return NULL;
-   obj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
-   r200AllocTexObj( obj );
-   return obj;
+   radeonTexObj* t = CALLOC_STRUCT(radeon_tex_obj);
+
+
+   if (RADEON_DEBUG & (RADEON_STATE | RADEON_TEXTURE)) {
+     fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
+	     t, _mesa_lookup_enum_by_nr(target));
+   }
+
+   _mesa_initialize_texture_object(&t->base, name, target);
+   t->base.MaxAnisotropy = rmesa->radeon.initialMaxAnisotropy;
+
+   /* Initialize hardware state */
+   r200SetTexWrap( t, t->base.WrapS, t->base.WrapT, t->base.WrapR );
+   r200SetTexMaxAnisotropy( t, t->base.MaxAnisotropy );
+   r200SetTexFilter(t, t->base.MinFilter, t->base.MagFilter);
+   r200SetTexBorderColor(t, t->base.BorderColor);
+
+   return &t->base;
 }
 
 
+
 void r200InitTextureFuncs( struct dd_function_table *functions )
 {
    /* Note: we only plug in the functions we implement in the driver
     * since _mesa_init_driver_functions() was already called.
     */
-   functions->ChooseTextureFormat	= r200ChooseTextureFormat;
-   functions->TexImage1D		= r200TexImage1D;
-   functions->TexImage2D		= r200TexImage2D;
+   functions->ChooseTextureFormat	= radeonChooseTextureFormat_mesa;
+   functions->TexImage1D		= radeonTexImage1D;
+   functions->TexImage2D		= radeonTexImage2D;
 #if ENABLE_HW_3D_TEXTURE
-   functions->TexImage3D		= r200TexImage3D;
+   functions->TexImage3D		= radeonTexImage3D;
 #else
    functions->TexImage3D		= _mesa_store_teximage3d;
 #endif
-   functions->TexSubImage1D		= r200TexSubImage1D;
-   functions->TexSubImage2D		= r200TexSubImage2D;
+   functions->TexSubImage1D		= radeonTexSubImage1D;
+   functions->TexSubImage2D		= radeonTexSubImage2D;
 #if ENABLE_HW_3D_TEXTURE
-   functions->TexSubImage3D		= r200TexSubImage3D;
+   functions->TexSubImage3D		= radeonTexSubImage3D;
 #else
    functions->TexSubImage3D		= _mesa_store_texsubimage3d;
 #endif
+   functions->GetTexImage               = radeonGetTexImage;
+   functions->GetCompressedTexImage     = radeonGetCompressedTexImage;
    functions->NewTextureObject		= r200NewTextureObject;
-   functions->BindTexture		= r200BindTexture;
+   //   functions->BindTexture		= r200BindTexture;
    functions->DeleteTexture		= r200DeleteTexture;
    functions->IsTextureResident		= driIsTextureResident;
 
@@ -1197,22 +520,16 @@ void r200InitTextureFuncs( struct dd_function_table *functions )
    functions->TexParameter		= r200TexParameter;
    functions->TexGen			= r200TexGen;
 
-   functions->CompressedTexImage2D	= r200CompressedTexImage2D;
-   functions->CompressedTexSubImage2D	= r200CompressedTexSubImage2D;
+   functions->CompressedTexImage2D	= radeonCompressedTexImage2D;
+   functions->CompressedTexSubImage2D	= radeonCompressedTexSubImage2D;
 
-   driInitTextureFormats();
+   functions->GenerateMipmap = radeonGenerateMipmap;
 
-#if 000
-   /* moved or obsolete code */
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   driInitTextureObjects( ctx, & rmesa->swapped,
-			  DRI_TEXMGR_DO_TEXTURE_1D
-			  | DRI_TEXMGR_DO_TEXTURE_2D );
+   functions->NewTextureImage = radeonNewTextureImage;
+   functions->FreeTexImageData = radeonFreeTexImageData;
+   functions->MapTexture = radeonMapTexture;
+   functions->UnmapTexture = radeonUnmapTexture;
+
+   driInitTextureFormats();
 
-   /* Hack: r200NewTextureObject is not yet installed when the
-    * default textures are created. Therefore set MaxAnisotropy of the
-    * default 2D texture now. */
-   ctx->Shared->Default2D->MaxAnisotropy = driQueryOptionf (&rmesa->optionCache,
-							    "def_max_anisotropy");
-#endif
 }
diff --git a/src/mesa/drivers/dri/r200/r200_tex.h b/src/mesa/drivers/dri/r200/r200_tex.h
index 10ff8e8a66..e122de6e5e 100644
--- a/src/mesa/drivers/dri/r200/r200_tex.h
+++ b/src/mesa/drivers/dri/r200/r200_tex.h
@@ -35,15 +35,18 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #ifndef __R200_TEX_H__
 #define __R200_TEX_H__
 
+extern void r200SetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv);
+extern void r200SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_format,
+			      __DRIdrawable *dPriv);
 extern void r200SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 			     unsigned long long offset, GLint depth,
 			     GLuint pitch);
 
 extern void r200UpdateTextureState( GLcontext *ctx );
 
-extern int r200UploadTexImages( r200ContextPtr rmesa, r200TexObjPtr t, GLuint face );
+extern int r200UploadTexImages( r200ContextPtr rmesa, radeonTexObjPtr t, GLuint face );
 
-extern void r200DestroyTexObj( r200ContextPtr rmesa, r200TexObjPtr t );
+extern void r200DestroyTexObj( r200ContextPtr rmesa, radeonTexObjPtr t );
 
 extern void r200InitTextureFuncs( struct dd_function_table *functions );
 
diff --git a/src/mesa/drivers/dri/r200/r200_texmem.c b/src/mesa/drivers/dri/r200/r200_texmem.c
deleted file mode 100644
index 3b81ac0c80..0000000000
--- a/src/mesa/drivers/dri/r200/r200_texmem.c
+++ /dev/null
@@ -1,530 +0,0 @@
-/**************************************************************************
-
-Copyright (C) Tungsten Graphics 2002.  All Rights Reserved.  
-The Weather Channel, Inc. funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86
-license. This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation on the rights to use, copy, modify, merge, publish,
-distribute, sub license, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NON-INFRINGEMENT. IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR THEIR
-SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
-IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Kevin E. Martin <martin@valinux.com>
- *   Gareth Hughes <gareth@valinux.com>
- *
- */
- 
-#include <errno.h>
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/context.h"
-#include "main/colormac.h"
-#include "main/macros.h"
-#include "r200_context.h"
-#include "r200_ioctl.h"
-#include "r200_tex.h"
-#include "radeon_reg.h"
-
-#include <unistd.h>  /* for usleep() */
-
-
-/**
- * Destroy any device-dependent state associated with the texture.  This may
- * include NULLing out hardware state that points to the texture.
- */
-void
-r200DestroyTexObj( r200ContextPtr rmesa, r200TexObjPtr t )
-{
-   if ( R200_DEBUG & DEBUG_TEXTURE ) {
-      fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, 
-	       (void *)t, (void *)t->base.tObj );
-   }
-
-   if ( rmesa != NULL ) {
-      unsigned   i;
-
-
-      for ( i = 0 ; i < rmesa->glCtx->Const.MaxTextureUnits ; i++ ) {
-	 if ( t == rmesa->state.texture.unit[i].texobj ) {
-	    rmesa->state.texture.unit[i].texobj = NULL;
-	    rmesa->hw.tex[i].dirty = GL_FALSE;
-	    rmesa->hw.cube[i].dirty = GL_FALSE;
-	 }
-      }
-   }
-}
-
-
-/* ------------------------------------------------------------
- * Texture image conversions
- */
-
-
-static void r200UploadGARTClientSubImage( r200ContextPtr rmesa,
-					  r200TexObjPtr t, 
-					  struct gl_texture_image *texImage,
-					  GLint hwlevel,
-					  GLint x, GLint y, 
-					  GLint width, GLint height )
-{
-   const struct gl_texture_format *texFormat = texImage->TexFormat;
-   GLuint srcPitch, dstPitch;
-   int blit_format;
-   int srcOffset;
-
-   /*
-    * XXX it appears that we always upload the full image, not a subimage.
-    * I.e. x==0, y==0, width=texWidth, height=texWidth.  If this is ever
-    * changed, the src pitch will have to change.
-    */
-   switch ( texFormat->TexelBytes ) {
-   case 1:
-      blit_format = R200_CP_COLOR_FORMAT_CI8;
-      srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-      dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-      break;
-   case 2:
-      blit_format = R200_CP_COLOR_FORMAT_RGB565;
-      srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-      dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-      break;
-   case 4:
-      blit_format = R200_CP_COLOR_FORMAT_ARGB8888;
-      srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-      dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-      break;
-   default:
-      return;
-   }
-
-   t->image[0][hwlevel].data = texImage->Data;
-   srcOffset = r200GartOffsetFromVirtual( rmesa, texImage->Data );
-
-   assert( srcOffset != ~0 );
-
-   /* Don't currently need to cope with small pitches?
-    */
-   width = texImage->Width;
-   height = texImage->Height;
-
-   r200EmitWait( rmesa, RADEON_WAIT_3D );
-
-   r200EmitBlit( rmesa, blit_format, 
-		 srcPitch,  
-		 srcOffset,   
-		 dstPitch,
-		 t->bufAddr,
-		 x, 
-		 y, 
-		 t->image[0][hwlevel].x + x,
-		 t->image[0][hwlevel].y + y, 
-		 width,
-		 height );
-
-   r200EmitWait( rmesa, RADEON_WAIT_2D );
-}
-
-static void r200UploadRectSubImage( r200ContextPtr rmesa,
-				    r200TexObjPtr t, 
-				    struct gl_texture_image *texImage,
-				    GLint x, GLint y, 
-				    GLint width, GLint height )
-{
-   const struct gl_texture_format *texFormat = texImage->TexFormat;
-   int blit_format, dstPitch, done;
-
-   switch ( texFormat->TexelBytes ) {
-   case 1:
-      blit_format = R200_CP_COLOR_FORMAT_CI8;
-      break;
-   case 2:
-      blit_format = R200_CP_COLOR_FORMAT_RGB565;
-      break;
-   case 4:
-      blit_format = R200_CP_COLOR_FORMAT_ARGB8888;
-      break;
-   default:
-      return;
-   }
-
-   t->image[0][0].data = texImage->Data;
-
-   /* Currently don't need to cope with small pitches.
-    */
-   width = texImage->Width;
-   height = texImage->Height;
-   dstPitch = t->pp_txpitch + 32;
-
-   if (rmesa->prefer_gart_client_texturing && texImage->IsClientData) {
-      /* In this case, could also use GART texturing.  This is
-       * currently disabled, but has been tested & works.
-       */
-      if ( !t->image_override )
-         t->pp_txoffset = r200GartOffsetFromVirtual( rmesa, texImage->Data );
-      t->pp_txpitch = texImage->RowStride * texFormat->TexelBytes - 32;
-
-      if (R200_DEBUG & DEBUG_TEXTURE)
-	 fprintf(stderr, 
-		 "Using GART texturing for rectangular client texture\n");
-
-      /* Release FB memory allocated for this image:
-       */
-      /* FIXME This may not be correct as driSwapOutTextureObject sets
-       * FIXME dirty_images.  It may be fine, though.
-       */
-      if ( t->base.memBlock ) {
-	 driSwapOutTextureObject( (driTextureObject *) t );
-      }
-   }
-   else if (texImage->IsClientData) {
-      /* Data already in GART memory, with usable pitch.
-       */
-      GLuint srcPitch;
-      srcPitch = texImage->RowStride * texFormat->TexelBytes;
-      r200EmitBlit( rmesa, 
-		    blit_format, 
-		    srcPitch,
-		    r200GartOffsetFromVirtual( rmesa, texImage->Data ),   
-		    dstPitch, t->bufAddr,
-		    0, 0, 
-		    0, 0, 
-		    width, height );
-   }
-   else {
-      /* Data not in GART memory, or bad pitch.
-       */
-      for (done = 0; done < height ; ) {
-	 struct r200_dma_region region;
-	 int lines = MIN2( height - done, RADEON_BUFFER_SIZE / dstPitch );
-	 int src_pitch;
-	 char *tex;
-
-         src_pitch = texImage->RowStride * texFormat->TexelBytes;
-
-	 tex = (char *)texImage->Data + done * src_pitch;
-
-	 memset(&region, 0, sizeof(region));
-	 r200AllocDmaRegion( rmesa, &region, lines * dstPitch, 1024 );
-
-	 /* Copy texdata to dma:
-	  */
-	 if (0)
-	    fprintf(stderr, "%s: src_pitch %d dst_pitch %d\n",
-		    __FUNCTION__, src_pitch, dstPitch);
-
-	 if (src_pitch == dstPitch) {
-	    memcpy( region.address + region.start, tex, lines * src_pitch );
-	 } 
-	 else {
-	    char *buf = region.address + region.start;
-	    int i;
-	    for (i = 0 ; i < lines ; i++) {
-	       memcpy( buf, tex, src_pitch );
-	       buf += dstPitch;
-	       tex += src_pitch;
-	    }
-	 }
-
-	 r200EmitWait( rmesa, RADEON_WAIT_3D );
-
-	 /* Blit to framebuffer
-	  */
-	 r200EmitBlit( rmesa,
-		       blit_format,
-		       dstPitch, GET_START( &region ),
-		       dstPitch | (t->tile_bits >> 16),
-		       t->bufAddr,
-		       0, 0,
-		       0, done,
-		       width, lines );
-	 
-	 r200EmitWait( rmesa, RADEON_WAIT_2D );
-
-	 r200ReleaseDmaRegion( rmesa, &region, __FUNCTION__ );
-	 done += lines;
-      }
-   }
-}
-
-
-/**
- * Upload the texture image associated with texture \a t at the specified
- * level at the address relative to \a start.
- */
-static void uploadSubImage( r200ContextPtr rmesa, r200TexObjPtr t, 
-			    GLint hwlevel,
-			    GLint x, GLint y, GLint width, GLint height,
-			    GLuint face )
-{
-   struct gl_texture_image *texImage = NULL;
-   GLuint offset;
-   GLint imageWidth, imageHeight;
-   GLint ret;
-   drm_radeon_texture_t tex;
-   drm_radeon_tex_image_t tmp;
-   const int level = hwlevel + t->base.firstLevel;
-
-   if ( R200_DEBUG & DEBUG_TEXTURE ) {
-      fprintf( stderr, "%s( %p, %p ) level/width/height/face = %d/%d/%d/%u\n", 
-	       __FUNCTION__, (void *)t, (void *)t->base.tObj,
-	       level, width, height, face );
-   }
-
-   ASSERT(face < 6);
-
-   /* Ensure we have a valid texture to upload */
-   if ( ( hwlevel < 0 ) || ( hwlevel >= RADEON_MAX_TEXTURE_LEVELS ) ) {
-      _mesa_problem(NULL, "bad texture level in %s", __FUNCTION__);
-      return;
-   }
-
-   texImage = t->base.tObj->Image[face][level];
-
-   if ( !texImage ) {
-      if ( R200_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: texImage %d is NULL!\n", __FUNCTION__, level );
-      return;
-   }
-   if ( !texImage->Data ) {
-      if ( R200_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: image data is NULL!\n", __FUNCTION__ );
-      return;
-   }
-
-
-   if (t->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-      assert(level == 0);
-      assert(hwlevel == 0);
-      if ( R200_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: image data is rectangular\n", __FUNCTION__);
-      r200UploadRectSubImage( rmesa, t, texImage, x, y, width, height );
-      return;
-   }
-   else if (texImage->IsClientData) {
-      if ( R200_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: image data is in GART client storage\n",
-		  __FUNCTION__);
-      r200UploadGARTClientSubImage( rmesa, t, texImage, hwlevel,
-				   x, y, width, height );
-      return;
-   }
-   else if ( R200_DEBUG & DEBUG_TEXTURE )
-      fprintf( stderr, "%s: image data is in normal memory\n",
-	       __FUNCTION__);
-      
-
-   imageWidth = texImage->Width;
-   imageHeight = texImage->Height;
-
-   offset = t->bufAddr + t->base.totalSize / 6 * face;
-
-   if ( R200_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
-      GLint imageX = 0;
-      GLint imageY = 0;
-      GLint blitX = t->image[face][hwlevel].x;
-      GLint blitY = t->image[face][hwlevel].y;
-      GLint blitWidth = t->image[face][hwlevel].width;
-      GLint blitHeight = t->image[face][hwlevel].height;
-      fprintf( stderr, "   upload image: %d,%d at %d,%d\n",
-	       imageWidth, imageHeight, imageX, imageY );
-      fprintf( stderr, "   upload  blit: %d,%d at %d,%d\n",
-	       blitWidth, blitHeight, blitX, blitY );
-      fprintf( stderr, "       blit ofs: 0x%07x level: %d/%d\n",
-	       (GLuint)offset, hwlevel, level );
-   }
-
-   t->image[face][hwlevel].data = texImage->Data;
-
-   /* Init the DRM_RADEON_TEXTURE command / drm_radeon_texture_t struct.
-    * NOTE: we're always use a 1KB-wide blit and I8 texture format.
-    * We used to use 1, 2 and 4-byte texels and used to use the texture
-    * width to dictate the blit width - but that won't work for compressed
-    * textures. (Brian)
-    * NOTE: can't do that with texture tiling. (sroland)
-    */
-   tex.offset = offset;
-   tex.image = &tmp;
-   /* copy (x,y,width,height,data) */
-   memcpy( &tmp, &t->image[face][hwlevel], sizeof(tmp) );
-   
-   if (texImage->TexFormat->TexelBytes) {
-      /* use multi-byte upload scheme */
-      tex.height = imageHeight;
-      tex.width = imageWidth;
-      tex.format = t->pp_txformat & R200_TXFORMAT_FORMAT_MASK;
-      if (tex.format == R200_TXFORMAT_ABGR8888) {
-	 /* drm will refuse abgr8888 textures. */
-	 tex.format = R200_TXFORMAT_ARGB8888;
-      }
-      tex.pitch = MAX2((texImage->Width * texImage->TexFormat->TexelBytes) / 64, 1);
-      tex.offset += tmp.x & ~1023;
-      tmp.x = tmp.x % 1024;
-      if (t->tile_bits & R200_TXO_MICRO_TILE) {
-	 /* need something like "tiled coordinates" ? */
-	 tmp.y = tmp.x / (tex.pitch * 128) * 2;
-	 tmp.x = tmp.x % (tex.pitch * 128) / 2 / texImage->TexFormat->TexelBytes;
-	 tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
-      }
-      else {
-	 tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
-      }
-      if ((t->tile_bits & R200_TXO_MACRO_TILE) &&
-	 (texImage->Width * texImage->TexFormat->TexelBytes >= 256) &&
-	 ((!(t->tile_bits & R200_TXO_MICRO_TILE) && (texImage->Height >= 8)) ||
-	    (texImage->Height >= 16))) {
-	 /* weird: R200 disables macro tiling if mip width is smaller than 256 bytes,
-	    OR if height is smaller than 8 automatically, but if micro tiling is active
-	    the limit is height 16 instead ? */
-	 tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
-      }
-   }
-   else {
-      /* In case of for instance 8x8 texture (2x2 dxt blocks), padding after the first two blocks is
-         needed (only with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). */
-      /* set tex.height to 1/4 since 1 "macropixel" (dxt-block) has 4 real pixels. Needed
-         so the kernel module reads the right amount of data. */
-      tex.format = R200_TXFORMAT_I8; /* any 1-byte texel format */
-      tex.pitch = (BLIT_WIDTH_BYTES / 64);
-      tex.height = (imageHeight + 3) / 4;
-      tex.width = (imageWidth + 3) / 4;
-      switch (t->pp_txformat & R200_TXFORMAT_FORMAT_MASK) {
-      case R200_TXFORMAT_DXT1:
-           tex.width *= 8;
-           break;
-      case R200_TXFORMAT_DXT23:
-      case R200_TXFORMAT_DXT45:
-           tex.width *= 16;
-           break;
-      default:
-          fprintf(stderr, "unknown compressed tex format in uploadSubImage\n");
-      }
-   }
-
-   LOCK_HARDWARE( rmesa );
-   do {
-      ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_TEXTURE,
-                                 &tex, sizeof(drm_radeon_texture_t) );
-      if (ret) {
-	 if (R200_DEBUG & DEBUG_IOCTL)
-	    fprintf(stderr, "DRM_RADEON_TEXTURE:  again!\n");
-	 usleep(1);
-      }
-   } while ( ret == -EAGAIN );
-
-   UNLOCK_HARDWARE( rmesa );
-
-   if ( ret ) {
-      fprintf( stderr, "DRM_RADEON_TEXTURE: return = %d\n", ret );
-      fprintf( stderr, "   offset=0x%08x\n",
-	       offset );
-      fprintf( stderr, "   image width=%d height=%d\n",
-	       imageWidth, imageHeight );
-      fprintf( stderr, "    blit width=%d height=%d data=%p\n",
-	       t->image[face][hwlevel].width, t->image[face][hwlevel].height,
-	       t->image[face][hwlevel].data );
-      exit( 1 );
-   }
-}
-
-
-/**
- * Upload the texture images associated with texture \a t.  This might
- * require the allocation of texture memory.
- * 
- * \param rmesa Context pointer
- * \param t Texture to be uploaded
- * \param face Cube map face to be uploaded.  Zero for non-cube maps.
- */
-
-int r200UploadTexImages( r200ContextPtr rmesa, r200TexObjPtr t, GLuint face )
-{
-   const int numLevels = t->base.lastLevel - t->base.firstLevel + 1;
-
-   if ( R200_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
-      fprintf( stderr, "%s( %p, %p ) sz=%d lvls=%d-%d\n", __FUNCTION__,
-	       (void *)rmesa->glCtx, (void *)t->base.tObj, t->base.totalSize,
-	       t->base.firstLevel, t->base.lastLevel );
-   }
-
-   if ( !t || t->base.totalSize == 0 || t->image_override )
-      return 0;
-
-   if (R200_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
-      r200Finish( rmesa->glCtx );
-   }
-
-   LOCK_HARDWARE( rmesa );
-
-   if ( t->base.memBlock == NULL ) {
-      int heap;
-
-      heap = driAllocateTexture( rmesa->texture_heaps, rmesa->nr_heaps,
-				 (driTextureObject *) t );
-      if ( heap == -1 ) {
-	 UNLOCK_HARDWARE( rmesa );
-	 return -1;
-      }
-
-      /* Set the base offset of the texture image */
-      t->bufAddr = rmesa->r200Screen->texOffset[heap] 
-	   + t->base.memBlock->ofs;
-      t->pp_txoffset = t->bufAddr;
-       
-      if (!(t->base.tObj->Image[0][0]->IsClientData)) {
-	 /* hope it's safe to add that here... */
-	 t->pp_txoffset |= t->tile_bits;
-      }
-
-      /* Mark this texobj as dirty on all units:
-       */
-      t->dirty_state = TEX_ALL;
-   }
-
-   /* Let the world know we've used this memory recently.
-    */
-   driUpdateTextureLRU( (driTextureObject *) t );
-   UNLOCK_HARDWARE( rmesa );
-
-   /* Upload any images that are new */
-   if (t->base.dirty_images[face]) {
-      int i;
-      for ( i = 0 ; i < numLevels ; i++ ) {
-         if ( (t->base.dirty_images[face] & (1 << (i+t->base.firstLevel))) != 0 ) {
-            uploadSubImage( rmesa, t, i, 0, 0, t->image[face][i].width,
-			    t->image[face][i].height, face );
-         }
-      }
-      t->base.dirty_images[face] = 0;
-   }
-
-
-   if (R200_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
-      r200Finish( rmesa->glCtx );
-   }
-
-   return 0;
-}
diff --git a/src/mesa/drivers/dri/r200/r200_texstate.c b/src/mesa/drivers/dri/r200/r200_texstate.c
index 3f9a2f4ac1..c94834752e 100644
--- a/src/mesa/drivers/dri/r200/r200_texstate.c
+++ b/src/mesa/drivers/dri/r200/r200_texstate.c
@@ -37,9 +37,12 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/context.h"
 #include "main/macros.h"
 #include "main/texformat.h"
+#include "main/teximage.h"
 #include "main/texobj.h"
 #include "main/enums.h"
 
+#include "radeon_common.h"
+#include "radeon_mipmap_tree.h"
 #include "r200_context.h"
 #include "r200_state.h"
 #include "r200_ioctl.h"
@@ -139,257 +142,6 @@ static const struct tx_table tx_table_le[] =
 #undef _ALPHA
 #undef _INVALID
 
-/**
- * This function computes the number of bytes of storage needed for
- * the given texture object (all mipmap levels, all cube faces).
- * The \c image[face][level].x/y/width/height parameters for upload/blitting
- * are computed here.  \c pp_txfilter, \c pp_txformat, etc. will be set here
- * too.
- * 
- * \param rmesa Context pointer
- * \param tObj GL texture object whose images are to be posted to
- *                 hardware state.
- */
-static void r200SetTexImages( r200ContextPtr rmesa,
-			      struct gl_texture_object *tObj )
-{
-   r200TexObjPtr t = (r200TexObjPtr)tObj->DriverData;
-   const struct gl_texture_image *baseImage = tObj->Image[0][tObj->BaseLevel];
-   GLint curOffset, blitWidth;
-   GLint i, texelBytes;
-   GLint numLevels;
-   GLint log2Width, log2Height, log2Depth;
-
-   /* Set the hardware texture format
-    */
-   if ( !t->image_override ) {
-      if ( VALID_FORMAT( baseImage->TexFormat->MesaFormat ) ) {
-	 const struct tx_table *table = _mesa_little_endian() ? tx_table_le :
-								tx_table_be;
-
-         t->pp_txformat &= ~(R200_TXFORMAT_FORMAT_MASK |
-                             R200_TXFORMAT_ALPHA_IN_MAP);
-         t->pp_txfilter &= ~R200_YUV_TO_RGB;
-
-	 t->pp_txformat |= table[ baseImage->TexFormat->MesaFormat ].format;
-	 t->pp_txfilter |= table[ baseImage->TexFormat->MesaFormat ].filter;
-      }
-      else {
-         _mesa_problem(NULL, "unexpected texture format in %s", __FUNCTION__);
-         return;
-      }
-   }
-
-   texelBytes = baseImage->TexFormat->TexelBytes;
-
-   /* Compute which mipmap levels we really want to send to the hardware.
-    */
-
-   driCalculateTextureFirstLastLevel( (driTextureObject *) t );
-   log2Width  = tObj->Image[0][t->base.firstLevel]->WidthLog2;
-   log2Height = tObj->Image[0][t->base.firstLevel]->HeightLog2;
-   log2Depth  = tObj->Image[0][t->base.firstLevel]->DepthLog2;
-
-   numLevels = t->base.lastLevel - t->base.firstLevel + 1;
-
-   assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
-
-   /* Calculate mipmap offsets and dimensions for blitting (uploading)
-    * The idea is that we lay out the mipmap levels within a block of
-    * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
-    */
-   curOffset = 0;
-   blitWidth = BLIT_WIDTH_BYTES;
-   t->tile_bits = 0;
-
-   /* figure out if this texture is suitable for tiling. */
-   if (texelBytes) {
-      if (rmesa->texmicrotile  && (tObj->Target != GL_TEXTURE_RECTANGLE_NV) &&
-      /* texrect might be able to use micro tiling too in theory? */
-	 (baseImage->Height > 1)) {
-	 /* allow 32 (bytes) x 1 mip (which will use two times the space
-	 the non-tiled version would use) max if base texture is large enough */
-	 if ((numLevels == 1) ||
-	   (((baseImage->Width * texelBytes / baseImage->Height) <= 32) &&
-	       (baseImage->Width * texelBytes > 64)) ||
-	    ((baseImage->Width * texelBytes / baseImage->Height) <= 16)) {
-	    t->tile_bits |= R200_TXO_MICRO_TILE;
-	 }
-      }
-      if (tObj->Target != GL_TEXTURE_RECTANGLE_NV) {
-	 /* we can set macro tiling even for small textures, they will be untiled anyway */
-	 t->tile_bits |= R200_TXO_MACRO_TILE;
-      }
-   }
-
-   for (i = 0; i < numLevels; i++) {
-      const struct gl_texture_image *texImage;
-      GLuint size;
-
-      texImage = tObj->Image[0][i + t->base.firstLevel];
-      if ( !texImage )
-	 break;
-
-      /* find image size in bytes */
-      if (texImage->IsCompressed) {
-      /* need to calculate the size AFTER padding even though the texture is
-         submitted without padding.
-         Only handle pot textures currently - don't know if npot is even possible,
-         size calculation would certainly need (trivial) adjustments.
-         Align (and later pad) to 32byte, not sure what that 64byte blit width is
-         good for? */
-         if ((t->pp_txformat & R200_TXFORMAT_FORMAT_MASK) == R200_TXFORMAT_DXT1) {
-            /* RGB_DXT1/RGBA_DXT1, 8 bytes per block */
-            if ((texImage->Width + 3) < 8) /* width one block */
-               size = texImage->CompressedSize * 4;
-            else if ((texImage->Width + 3) < 16)
-               size = texImage->CompressedSize * 2;
-            else size = texImage->CompressedSize;
-         }
-         else /* DXT3/5, 16 bytes per block */
-            if ((texImage->Width + 3) < 8)
-               size = texImage->CompressedSize * 2;
-            else size = texImage->CompressedSize;
-      }
-      else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-	 size = ((texImage->Width * texelBytes + 63) & ~63) * texImage->Height;
-      }
-      else if (t->tile_bits & R200_TXO_MICRO_TILE) {
-	 /* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
-	    though the actual offset may be different (if texture is less than
-	    32 bytes width) to the untiled case */
-	 int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
-	 size = (w * ((texImage->Height + 1) / 2)) * texImage->Depth;
-	 blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-      }
-      else {
-	 int w = (texImage->Width * texelBytes + 31) & ~31;
-	 size = w * texImage->Height * texImage->Depth;
-	 blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-      }
-      assert(size > 0);
-
-      /* Align to 32-byte offset.  It is faster to do this unconditionally
-       * (no branch penalty).
-       */
-
-      curOffset = (curOffset + 0x1f) & ~0x1f;
-
-      if (texelBytes) {
-	 t->image[0][i].x = curOffset; /* fix x and y coords up later together with offset */
-	 t->image[0][i].y = 0;
-	 t->image[0][i].width = MIN2(size / texelBytes, blitWidth);
-	 t->image[0][i].height = (size / texelBytes) / t->image[0][i].width;
-      }
-      else {
-         t->image[0][i].x = curOffset % BLIT_WIDTH_BYTES;
-         t->image[0][i].y = curOffset / BLIT_WIDTH_BYTES;
-         t->image[0][i].width  = MIN2(size, BLIT_WIDTH_BYTES);
-         t->image[0][i].height = size / t->image[0][i].width;     
-      }
-
-#if 0
-      /* for debugging only and only  applicable to non-rectangle targets */
-      assert(size % t->image[0][i].width == 0);
-      assert(t->image[0][i].x == 0
-             || (size < BLIT_WIDTH_BYTES && t->image[0][i].height == 1));
-#endif
-
-      if (0)
-         fprintf(stderr,
-                 "level %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
-                 i, texImage->Width, texImage->Height,
-                 t->image[0][i].x, t->image[0][i].y,
-                 t->image[0][i].width, t->image[0][i].height, size, curOffset);
-
-      curOffset += size;
-
-   }
-
-   /* Align the total size of texture memory block.
-    */
-   t->base.totalSize = (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
-
-   /* Setup remaining cube face blits, if needed */
-   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
-      const GLuint faceSize = t->base.totalSize;
-      GLuint face;
-      /* reuse face 0 x/y/width/height - just update the offset when uploading */
-      for (face = 1; face < 6; face++) {
-         for (i = 0; i < numLevels; i++) {
-            t->image[face][i].x =  t->image[0][i].x;
-            t->image[face][i].y =  t->image[0][i].y;
-            t->image[face][i].width  = t->image[0][i].width;
-            t->image[face][i].height = t->image[0][i].height;
-         }
-      }
-      t->base.totalSize = 6 * faceSize; /* total texmem needed */
-   }
-
-
-   /* Hardware state:
-    */
-   t->pp_txfilter &= ~R200_MAX_MIP_LEVEL_MASK;
-   t->pp_txfilter |= (numLevels - 1) << R200_MAX_MIP_LEVEL_SHIFT;
-
-   t->pp_txformat &= ~(R200_TXFORMAT_WIDTH_MASK |
-		       R200_TXFORMAT_HEIGHT_MASK |
-                       R200_TXFORMAT_CUBIC_MAP_ENABLE |
-                       R200_TXFORMAT_F5_WIDTH_MASK |
-                       R200_TXFORMAT_F5_HEIGHT_MASK);
-   t->pp_txformat |= ((log2Width << R200_TXFORMAT_WIDTH_SHIFT) |
-		      (log2Height << R200_TXFORMAT_HEIGHT_SHIFT));
-
-   t->pp_txformat_x &= ~(R200_DEPTH_LOG2_MASK | R200_TEXCOORD_MASK);
-   if (tObj->Target == GL_TEXTURE_3D) {
-      t->pp_txformat_x |= (log2Depth << R200_DEPTH_LOG2_SHIFT);
-      t->pp_txformat_x |= R200_TEXCOORD_VOLUME;
-   }
-   else if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
-      ASSERT(log2Width == log2Height);
-      t->pp_txformat |= ((log2Width << R200_TXFORMAT_F5_WIDTH_SHIFT) |
-                         (log2Height << R200_TXFORMAT_F5_HEIGHT_SHIFT) |
-/* don't think we need this bit, if it exists at all - fglrx does not set it */
-                         (R200_TXFORMAT_CUBIC_MAP_ENABLE));
-      t->pp_txformat_x |= R200_TEXCOORD_CUBIC_ENV;
-      t->pp_cubic_faces = ((log2Width << R200_FACE_WIDTH_1_SHIFT) |
-                           (log2Height << R200_FACE_HEIGHT_1_SHIFT) |
-                           (log2Width << R200_FACE_WIDTH_2_SHIFT) |
-                           (log2Height << R200_FACE_HEIGHT_2_SHIFT) |
-                           (log2Width << R200_FACE_WIDTH_3_SHIFT) |
-                           (log2Height << R200_FACE_HEIGHT_3_SHIFT) |
-                           (log2Width << R200_FACE_WIDTH_4_SHIFT) |
-                           (log2Height << R200_FACE_HEIGHT_4_SHIFT));
-   }
-   else {
-      /* If we don't in fact send enough texture coordinates, q will be 1,
-       * making TEXCOORD_PROJ act like TEXCOORD_NONPROJ (Right?)
-       */
-      t->pp_txformat_x |= R200_TEXCOORD_PROJ;
-   }
-
-   t->pp_txsize = (((tObj->Image[0][t->base.firstLevel]->Width - 1) << 0) |
-                   ((tObj->Image[0][t->base.firstLevel]->Height - 1) << 16));
-
-   /* Only need to round to nearest 32 for textures, but the blitter
-    * requires 64-byte aligned pitches, and we may/may not need the
-    * blitter.   NPOT only!
-    */
-   if ( !t->image_override ) {
-      if (baseImage->IsCompressed)
-         t->pp_txpitch = (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
-      else
-         t->pp_txpitch = ((tObj->Image[0][t->base.firstLevel]->Width * texelBytes) + 63) & ~(63);
-      t->pp_txpitch -= 32;
-   }
-
-   t->dirty_state = TEX_ALL;
-
-   /* FYI: r200UploadTexImages( rmesa, t ) used to be called here */
-}
-
-
-
 /* ================================================================
  * Texture combine functions
  */
@@ -569,7 +321,7 @@ static GLboolean r200UpdateTextureEnv( GLcontext *ctx, int unit, int slot, GLuin
    assert( (texUnit->_ReallyEnabled == 0)
 	   || (texUnit->_Current != NULL) );
 
-   if ( R200_DEBUG & DEBUG_TEXTURE ) {
+   if ( R200_DEBUG & RADEON_TEXTURE ) {
       fprintf( stderr, "%s( %p, %d )\n", __FUNCTION__, (void *)ctx, unit );
    }
 
@@ -981,20 +733,19 @@ void r200SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 {
 	r200ContextPtr rmesa = pDRICtx->driverPrivate;
 	struct gl_texture_object *tObj =
-	    _mesa_lookup_texture(rmesa->glCtx, texname);
-	r200TexObjPtr t;
+	    _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
+	radeonTexObjPtr t = radeon_tex_obj(tObj);
 
 	if (!tObj)
 		return;
 
-	t = (r200TexObjPtr) tObj->DriverData;
-
 	t->image_override = GL_TRUE;
 
 	if (!offset)
 		return;
 
-	t->pp_txoffset = offset;
+	t->bo = NULL;
+	t->override_offset = offset;
 	t->pp_txpitch = pitch - 32;
 
 	switch (depth) {
@@ -1014,6 +765,125 @@ void r200SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 	}
 }
 
+/**
+ * Make the buffer object of \a dPriv's color_rb[0] the level-0 image of the
+ * texture selected by \a target on the current texture unit.  The drawable's
+ * depth and back-left buffer objects are released along the way.
+ *
+ * \param glx_texture_format  GLX_TEXTURE_FORMAT_RGB_EXT picks the RGB layout;
+ *                            any other value is treated as RGBA.
+ */
+void r200SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_format,
+		       __DRIdrawable *dPriv)
+{
+	struct gl_texture_unit *texUnit;
+	struct gl_texture_object *texObj;
+	struct gl_texture_image *texImage;
+	struct radeon_renderbuffer *rb;
+	radeon_texture_image *rImage;
+	radeonContextPtr radeon;
+	struct radeon_framebuffer *rfb;
+	radeonTexObjPtr t;
+	uint32_t internalFormat, type, format;
+
+	/* GL_BGRA is the pixel format, GL_UNSIGNED_BYTE the component type. */
+	format = GL_BGRA;
+	type = GL_UNSIGNED_BYTE;
+	internalFormat = (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT ? 3 : 4);
+
+	radeon = pDRICtx->driverPrivate;
+	rfb = dPriv->driverPrivate;
+	texUnit = &radeon->glCtx->Texture.Unit[radeon->glCtx->Texture.CurrentUnit];
+	texObj = _mesa_select_tex_object(radeon->glCtx, texUnit, target);
+	texImage = _mesa_get_tex_image(radeon->glCtx, texObj, target, 0);
+
+	t = radeon_tex_obj(texObj);
+	/* Both the texture object and its level-0 image are dereferenced below. */
+	if (t == NULL || texImage == NULL)
+		return;
+	rImage = get_radeon_texture_image(texImage);
+
+	radeon_update_renderbuffers(pDRICtx, dPriv);
+	/* back & depth buffer are useless free them right away */
+	rb = (void*)rfb->base.Attachment[BUFFER_DEPTH].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+		rb->bo = NULL;
+	}
+	rb = (void*)rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+		rb->bo = NULL;
+	}
+	rb = rfb->color_rb[0];
+	if (rb == NULL || rb->bo == NULL) {
+		/* No buffer object backs the color buffer; nothing to bind. */
+		return;
+	}
+
+	_mesa_lock_texture(radeon->glCtx, texObj);
+	if (t->bo) {
+		radeon_bo_unref(t->bo);
+		t->bo = NULL;
+	}
+	if (rImage->bo) {
+		radeon_bo_unref(rImage->bo);
+		rImage->bo = NULL;
+	}
+	if (t->mt) {
+		radeon_miptree_unreference(t->mt);
+		t->mt = NULL;
+	}
+	if (rImage->mt) {
+		radeon_miptree_unreference(rImage->mt);
+		rImage->mt = NULL;
+	}
+	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
+				   rb->base.Width, rb->base.Height, 1, 0, rb->cpp);
+	texImage->RowStride = rb->pitch / rb->cpp;
+	texImage->TexFormat = radeonChooseTextureFormat(radeon->glCtx,
+							internalFormat,
+							format, type, 0);
+	rImage->bo = rb->bo;
+	radeon_bo_ref(rImage->bo);
+	t->bo = rb->bo;
+	radeon_bo_ref(t->bo);
+	t->tile_bits = 0;
+	t->image_override = GL_TRUE;
+	t->override_offset = 0;
+	switch (rb->cpp) {
+	case 4:
+		if (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT)
+			t->pp_txformat = tx_table_le[MESA_FORMAT_RGB888].format;
+		else
+			t->pp_txformat = tx_table_le[MESA_FORMAT_ARGB8888].format;
+		t->pp_txfilter |= tx_table_le[MESA_FORMAT_ARGB8888].filter;
+		break;
+	case 3:
+	default:
+		t->pp_txformat = tx_table_le[MESA_FORMAT_RGB888].format;
+		t->pp_txfilter |= tx_table_le[MESA_FORMAT_RGB888].filter;
+		break;
+	case 2:
+		t->pp_txformat = tx_table_le[MESA_FORMAT_RGB565].format;
+		t->pp_txfilter |= tx_table_le[MESA_FORMAT_RGB565].filter;
+		break;
+	}
+	t->pp_txsize = ((rb->base.Width - 1) << RADEON_TEX_USIZE_SHIFT)
+		   | ((rb->base.Height - 1) << RADEON_TEX_VSIZE_SHIFT);
+	t->pp_txformat |= R200_TXFORMAT_NON_POWER2;
+	t->pp_txpitch = rb->pitch - 32;
+	t->validated = GL_TRUE;
+	_mesa_unlock_texture(radeon->glCtx, texObj);
+}
+
+/* Forwards to r200SetTexBuffer2() with GLX_TEXTURE_FORMAT_RGBA_EXT as the format. */
+void r200SetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
+{
+        r200SetTexBuffer2(pDRICtx, target, GLX_TEXTURE_FORMAT_RGBA_EXT, dPriv);
+}
+
+
 #define REF_COLOR 1
 #define REF_ALPHA 2
 
@@ -1207,12 +1077,43 @@ static GLboolean r200UpdateAllTexEnv( GLcontext *ctx )
                                 R200_VOLUME_FILTER_MASK)
 
 
+/* Switch texture unit 'unit' off in the vtx/ctx state atoms and drop its texgen state. */
+static void disable_tex_obj_state( r200ContextPtr rmesa, int unit )
+{
+   GLuint prevTexGen;
+
+   /* Clear this unit's 3-bit field in the TCL output vertex format word. */
+   R200_STATECHANGE( rmesa, vtx );
+   rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] &= ~(7 << (unit * 3));
+
+   /* Clear this unit's texture enable bit in PP_CNTL. */
+   R200_STATECHANGE( rmesa, ctx );
+   rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~(R200_TEX_0_ENABLE << unit);
+
+   /* Pass GL_FALSE for this unit's texgen TCL fallback bit if it is currently set. */
+   if (rmesa->radeon.TclFallback & (R200_TCL_FALLBACK_TEXGEN_0<<unit)) {
+      TCL_FALLBACK( rmesa->radeon.glCtx, (R200_TCL_FALLBACK_TEXGEN_0<<unit), GL_FALSE);
+   }
+
+   /* Open question from the original author: keep all units below the max
+    * active texture enabled?  Fix this for >2 texunits. */
+   prevTexGen = rmesa->TexGenEnabled;
+   rmesa->TexGenEnabled &= ~((R200_TEXGEN_TEXMAT_0_ENABLE<<unit) | (R200_TEXMAT_0_ENABLE<<unit));
+   rmesa->TexGenNeedNormals[unit] = GL_FALSE;
+   rmesa->TexGenCompSel &= ~(R200_OUTPUT_TEX_0 << unit);
+
+   /* Only an actual change of the texgen enables requests a recheck. */
+   if (rmesa->TexGenEnabled != prevTexGen) {
+      rmesa->recheck_texgen[unit] = GL_TRUE;
+      rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
+   }
+}
 static void import_tex_obj_state( r200ContextPtr rmesa,
 				  int unit,
-				  r200TexObjPtr texobj )
+				  radeonTexObjPtr texobj )
 {
 /* do not use RADEON_DB_STATE to avoid stale texture caches */
-   int *cmd = &rmesa->hw.tex[unit].cmd[TEX_CMD_0];
+   GLuint *cmd = &rmesa->hw.tex[unit].cmd[TEX_CMD_0];
 
    R200_STATECHANGE( rmesa, tex[unit] );
 
@@ -1225,36 +1126,21 @@ static void import_tex_obj_state( r200ContextPtr rmesa,
    cmd[TEX_PP_TXSIZE] = texobj->pp_txsize; /* NPOT only! */
    cmd[TEX_PP_TXPITCH] = texobj->pp_txpitch; /* NPOT only! */
    cmd[TEX_PP_BORDER_COLOR] = texobj->pp_border_color;
-   if (rmesa->r200Screen->drmSupportsFragShader) {
-      cmd[TEX_PP_TXOFFSET_NEWDRM] = texobj->pp_txoffset;
-   }
-   else {
-      cmd[TEX_PP_TXOFFSET_OLDDRM] = texobj->pp_txoffset;
-   }
 
-   if (texobj->base.tObj->Target == GL_TEXTURE_CUBE_MAP) {
-      int *cube_cmd = &rmesa->hw.cube[unit].cmd[CUBE_CMD_0];
-      GLuint bytesPerFace = texobj->base.totalSize / 6;
-      ASSERT(texobj->base.totalSize % 6 == 0);
+   if (texobj->base.Target == GL_TEXTURE_CUBE_MAP) {
+      GLuint *cube_cmd = &rmesa->hw.cube[unit].cmd[CUBE_CMD_0];
 
       R200_STATECHANGE( rmesa, cube[unit] );
       cube_cmd[CUBE_PP_CUBIC_FACES] = texobj->pp_cubic_faces;
-      if (rmesa->r200Screen->drmSupportsFragShader) {
+      if (rmesa->radeon.radeonScreen->drmSupportsFragShader) {
 	 /* that value is submitted twice. could change cube atom
 	    to not include that command when new drm is used */
 	 cmd[TEX_PP_CUBIC_FACES] = texobj->pp_cubic_faces;
       }
-      cube_cmd[CUBE_PP_CUBIC_OFFSET_F1] = texobj->pp_txoffset + 1 * bytesPerFace;
-      cube_cmd[CUBE_PP_CUBIC_OFFSET_F2] = texobj->pp_txoffset + 2 * bytesPerFace;
-      cube_cmd[CUBE_PP_CUBIC_OFFSET_F3] = texobj->pp_txoffset + 3 * bytesPerFace;
-      cube_cmd[CUBE_PP_CUBIC_OFFSET_F4] = texobj->pp_txoffset + 4 * bytesPerFace;
-      cube_cmd[CUBE_PP_CUBIC_OFFSET_F5] = texobj->pp_txoffset + 5 * bytesPerFace;
    }
 
-   texobj->dirty_state &= ~(1<<unit);
 }
 
-
 static void set_texgen_matrix( r200ContextPtr rmesa, 
 			       GLuint unit,
 			       const GLfloat *s_plane,
@@ -1366,37 +1252,36 @@ static GLboolean r200_validate_texgen( GLcontext *ctx, GLuint unit )
       fprintf(stderr, "%s unit %d\n", __FUNCTION__, unit);
 
    if (texUnit->TexGenEnabled & S_BIT) {
-      mode = texUnit->GenModeS;
+      mode = texUnit->GenS.Mode;
    } else {
       tgcm |= R200_TEXGEN_COMP_S << (unit * 4);
    }
 
    if (texUnit->TexGenEnabled & T_BIT) {
-      if (texUnit->GenModeT != mode)
+      if (texUnit->GenT.Mode != mode)
 	 mixed_fallback = GL_TRUE;
    } else {
       tgcm |= R200_TEXGEN_COMP_T << (unit * 4);
    }
-
    if (texUnit->TexGenEnabled & R_BIT) {
-      if (texUnit->GenModeR != mode)
+      if (texUnit->GenR.Mode != mode)
 	 mixed_fallback = GL_TRUE;
    } else {
       tgcm |= R200_TEXGEN_COMP_R << (unit * 4);
    }
 
    if (texUnit->TexGenEnabled & Q_BIT) {
-      if (texUnit->GenModeQ != mode)
+      if (texUnit->GenQ.Mode != mode)
 	 mixed_fallback = GL_TRUE;
    } else {
       tgcm |= R200_TEXGEN_COMP_Q << (unit * 4);
    }
 
    if (mixed_fallback) {
-      if (R200_DEBUG & DEBUG_FALLBACKS)
+      if (R200_DEBUG & RADEON_FALLBACKS)
 	 fprintf(stderr, "fallback mixed texgen, 0x%x (0x%x 0x%x 0x%x 0x%x)\n",
-		 texUnit->TexGenEnabled, texUnit->GenModeS, texUnit->GenModeT,
-		 texUnit->GenModeR, texUnit->GenModeQ);
+		 texUnit->TexGenEnabled, texUnit->GenS.Mode, texUnit->GenT.Mode,
+		 texUnit->GenR.Mode, texUnit->GenQ.Mode);
       return GL_FALSE;
    }
 
@@ -1414,10 +1299,12 @@ static GLboolean r200_validate_texgen( GLcontext *ctx, GLuint unit )
    switch (mode) {
    case GL_OBJECT_LINEAR: {
       GLuint needtgenable = r200_need_dis_texgen( texUnit->TexGenEnabled,
-				texUnit->ObjectPlaneS, texUnit->ObjectPlaneT,
-				texUnit->ObjectPlaneR, texUnit->ObjectPlaneQ );
+                                                  texUnit->GenS.ObjectPlane,
+                                                  texUnit->GenT.ObjectPlane,
+                                                  texUnit->GenR.ObjectPlane,
+                                                  texUnit->GenQ.ObjectPlane );
       if (needtgenable & (S_BIT | T_BIT)) {
-	 if (R200_DEBUG & DEBUG_FALLBACKS)
+	 if (R200_DEBUG & RADEON_FALLBACKS)
 	 fprintf(stderr, "fallback mixed texgen / obj plane, 0x%x\n",
 		 texUnit->TexGenEnabled);
 	 return GL_FALSE;
@@ -1431,19 +1318,21 @@ static GLboolean r200_validate_texgen( GLcontext *ctx, GLuint unit )
 
       tgi |= R200_TEXGEN_INPUT_OBJ << inputshift;
       set_texgen_matrix( rmesa, unit, 
-	 (texUnit->TexGenEnabled & S_BIT) ? texUnit->ObjectPlaneS : I,
-	 (texUnit->TexGenEnabled & T_BIT) ? texUnit->ObjectPlaneT : I + 4,
-	 (texUnit->TexGenEnabled & R_BIT) ? texUnit->ObjectPlaneR : I + 8,
-	 (texUnit->TexGenEnabled & Q_BIT) ? texUnit->ObjectPlaneQ : I + 12);
+	 (texUnit->TexGenEnabled & S_BIT) ? texUnit->GenS.ObjectPlane : I,
+	 (texUnit->TexGenEnabled & T_BIT) ? texUnit->GenT.ObjectPlane : I + 4,
+	 (texUnit->TexGenEnabled & R_BIT) ? texUnit->GenR.ObjectPlane : I + 8,
+	 (texUnit->TexGenEnabled & Q_BIT) ? texUnit->GenQ.ObjectPlane : I + 12);
       }
       break;
 
    case GL_EYE_LINEAR: {
       GLuint needtgenable = r200_need_dis_texgen( texUnit->TexGenEnabled,
-				texUnit->EyePlaneS, texUnit->EyePlaneT,
-				texUnit->EyePlaneR, texUnit->EyePlaneQ );
+                                                  texUnit->GenS.EyePlane,
+                                                  texUnit->GenT.EyePlane,
+                                                  texUnit->GenR.EyePlane,
+                                                  texUnit->GenQ.EyePlane );
       if (needtgenable & (S_BIT | T_BIT)) {
-	 if (R200_DEBUG & DEBUG_FALLBACKS)
+	 if (R200_DEBUG & RADEON_FALLBACKS)
 	 fprintf(stderr, "fallback mixed texgen / eye plane, 0x%x\n",
 		 texUnit->TexGenEnabled);
 	 return GL_FALSE;
@@ -1456,10 +1345,10 @@ static GLboolean r200_validate_texgen( GLcontext *ctx, GLuint unit )
       }
       tgi |= R200_TEXGEN_INPUT_EYE << inputshift;
       set_texgen_matrix( rmesa, unit,
-	 (texUnit->TexGenEnabled & S_BIT) ? texUnit->EyePlaneS : I,
-	 (texUnit->TexGenEnabled & T_BIT) ? texUnit->EyePlaneT : I + 4,
-	 (texUnit->TexGenEnabled & R_BIT) ? texUnit->EyePlaneR : I + 8,
-	 (texUnit->TexGenEnabled & Q_BIT) ? texUnit->EyePlaneQ : I + 12);
+	 (texUnit->TexGenEnabled & S_BIT) ? texUnit->GenS.EyePlane : I,
+	 (texUnit->TexGenEnabled & T_BIT) ? texUnit->GenT.EyePlane : I + 4,
+	 (texUnit->TexGenEnabled & R_BIT) ? texUnit->GenR.EyePlane : I + 8,
+	 (texUnit->TexGenEnabled & Q_BIT) ? texUnit->GenQ.EyePlane : I + 12);
       }
       break;
 
@@ -1493,9 +1382,9 @@ static GLboolean r200_validate_texgen( GLcontext *ctx, GLuint unit )
    default:
       /* Unsupported mode, fallback:
        */
-      if (R200_DEBUG & DEBUG_FALLBACKS)
+      if (R200_DEBUG & RADEON_FALLBACKS)
 	 fprintf(stderr, "fallback unsupported texgen, %d\n",
-		 texUnit->GenModeS);
+		 texUnit->GenS.Mode);
       return GL_FALSE;
    }
 
@@ -1513,52 +1402,6 @@ static GLboolean r200_validate_texgen( GLcontext *ctx, GLuint unit )
    return GL_TRUE;
 }
 
-
-static void disable_tex( GLcontext *ctx, int unit )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if (rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (R200_TEX_0_ENABLE<<unit)) {
-      /* Texture unit disabled */
-      if ( rmesa->state.texture.unit[unit].texobj != NULL ) {
-	 /* The old texture is no longer bound to this texture unit.
-	  * Mark it as such.
-	  */
-
-	 rmesa->state.texture.unit[unit].texobj->base.bound &= ~(1UL << unit);
-	 rmesa->state.texture.unit[unit].texobj = NULL;
-      }
-
-      R200_STATECHANGE( rmesa, ctx );
-      rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~(R200_TEX_0_ENABLE << unit);
-	 
-      R200_STATECHANGE( rmesa, vtx );
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] &= ~(7 << (unit * 3));
-	 
-      if (rmesa->TclFallback & (R200_TCL_FALLBACK_TEXGEN_0<<unit)) {
-	 TCL_FALLBACK( ctx, (R200_TCL_FALLBACK_TEXGEN_0<<unit), GL_FALSE);
-      }
-
-      /* Actually want to keep all units less than max active texture
-       * enabled, right?  Fix this for >2 texunits.
-       */
-
-      {
-	 GLuint tmp = rmesa->TexGenEnabled;
-
-	 rmesa->TexGenEnabled &= ~(R200_TEXGEN_TEXMAT_0_ENABLE<<unit);
-	 rmesa->TexGenEnabled &= ~(R200_TEXMAT_0_ENABLE<<unit);
-	 rmesa->TexGenNeedNormals[unit] = GL_FALSE;
-	 rmesa->TexGenCompSel &= ~(R200_OUTPUT_TEX_0 << unit);
-
-	 if (tmp != rmesa->TexGenEnabled) {
-	    rmesa->recheck_texgen[unit] = GL_TRUE;
-	    rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
-	 }
-      }
-   }
-}
-
 void set_re_cntl_d3d( GLcontext *ctx, int unit, GLboolean use_d3d )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
@@ -1575,237 +1418,169 @@ void set_re_cntl_d3d( GLcontext *ctx, int unit, GLboolean use_d3d )
    }
 }
 
-static GLboolean enable_tex_2d( GLcontext *ctx, int unit )
+/**
+ * Compute the cached hardware register values for the given texture object.
+ *
+ * \param rmesa Context pointer
+ * \param t the r300 texture object
+ */
+static void setup_hardware_state(r200ContextPtr rmesa, radeonTexObj *t)
 {
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
-
-   /* Need to load the 2d images associated with this unit.
-    */
-   if (t->pp_txformat & R200_TXFORMAT_NON_POWER2) {
-      t->pp_txformat &= ~R200_TXFORMAT_NON_POWER2;
-      t->base.dirty_images[0] = ~0;
+   int firstlevel = t->mt ? t->mt->firstLevel : 0;
+   const struct gl_texture_image *firstImage = t->base.Image[0][firstlevel];
+   GLint log2Width, log2Height, log2Depth, texelBytes;
+   
+   if ( t->bo ) {
+       return;
    }
 
-   ASSERT(tObj->Target == GL_TEXTURE_2D || tObj->Target == GL_TEXTURE_1D);
+   log2Width  = firstImage->WidthLog2;
+   log2Height = firstImage->HeightLog2;
+   log2Depth  = firstImage->DepthLog2;
+   texelBytes = firstImage->TexFormat->TexelBytes;
 
-   if ( t->base.dirty_images[0] ) {
-      R200_FIREVERTICES( rmesa );
-      r200SetTexImages( rmesa, tObj );
-      r200UploadTexImages( rmesa, (r200TexObjPtr) tObj->DriverData, 0 );
-      if ( !t->base.memBlock && !t->image_override ) 
-	 return GL_FALSE;
-   }
 
-   set_re_cntl_d3d( ctx, unit, GL_FALSE );
-
-   return GL_TRUE;
-}
-
-#if ENABLE_HW_3D_TEXTURE
-static GLboolean enable_tex_3d( GLcontext *ctx, int unit )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
-
-   /* Need to load the 3d images associated with this unit.
-    */
-   if (t->pp_txformat & R200_TXFORMAT_NON_POWER2) {
-      t->pp_txformat &= ~R200_TXFORMAT_NON_POWER2;
-      t->base.dirty_images[0] = ~0;
+   if (!t->image_override) {
+      if (VALID_FORMAT(firstImage->TexFormat->MesaFormat)) {
+	 const struct tx_table *table = _mesa_little_endian() ? tx_table_le :
+	    tx_table_be;
+	 
+	 t->pp_txformat &= ~(R200_TXFORMAT_FORMAT_MASK |
+			     R200_TXFORMAT_ALPHA_IN_MAP);
+	 t->pp_txfilter &= ~R200_YUV_TO_RGB;
+	 
+	 t->pp_txformat |= table[ firstImage->TexFormat->MesaFormat ].format;
+	 t->pp_txfilter |= table[ firstImage->TexFormat->MesaFormat ].filter;
+      } else {
+	 _mesa_problem(NULL, "unexpected texture format in %s",
+		       __FUNCTION__);
+	 return;
+      }
    }
+   
+   t->pp_txfilter &= ~R200_MAX_MIP_LEVEL_MASK;
+   t->pp_txfilter |= (t->mt->lastLevel - t->mt->firstLevel) << R200_MAX_MIP_LEVEL_SHIFT;
+	
+   t->pp_txformat &= ~(R200_TXFORMAT_WIDTH_MASK |
+		       R200_TXFORMAT_HEIGHT_MASK |
+		       R200_TXFORMAT_CUBIC_MAP_ENABLE |
+		       R200_TXFORMAT_F5_WIDTH_MASK |
+		       R200_TXFORMAT_F5_HEIGHT_MASK);
+   t->pp_txformat |= ((log2Width << R200_TXFORMAT_WIDTH_SHIFT) |
+		      (log2Height << R200_TXFORMAT_HEIGHT_SHIFT));
+   
+   t->tile_bits = 0;
+   
+   t->pp_txformat_x &= ~(R200_DEPTH_LOG2_MASK | R200_TEXCOORD_MASK);
+   if (t->base.Target == GL_TEXTURE_3D) {
+      t->pp_txformat_x |= (log2Depth << R200_DEPTH_LOG2_SHIFT);
+      t->pp_txformat_x |= R200_TEXCOORD_VOLUME;
 
-   ASSERT(tObj->Target == GL_TEXTURE_3D);
-
-   /* R100 & R200 do not support mipmaps for 3D textures.
-    */
-   if ( (tObj->MinFilter != GL_NEAREST) && (tObj->MinFilter != GL_LINEAR) ) {
-      return GL_FALSE;
    }
-
-   if ( t->base.dirty_images[0] ) {
-      R200_FIREVERTICES( rmesa );
-      r200SetTexImages( rmesa, tObj );
-      r200UploadTexImages( rmesa, (r200TexObjPtr) tObj->DriverData, 0 );
-      if ( !t->base.memBlock ) 
-	 return GL_FALSE;
+   else if (t->base.Target == GL_TEXTURE_CUBE_MAP) {
+      ASSERT(log2Width == log2Height);
+      t->pp_txformat |= ((log2Width << R200_TXFORMAT_F5_WIDTH_SHIFT) |
+			 (log2Height << R200_TXFORMAT_F5_HEIGHT_SHIFT) |
+			 /* don't think we need this bit, if it exists at all - fglrx does not set it */
+			 (R200_TXFORMAT_CUBIC_MAP_ENABLE));
+      t->pp_txformat_x |= R200_TEXCOORD_CUBIC_ENV;
+      t->pp_cubic_faces = ((log2Width << R200_FACE_WIDTH_1_SHIFT) |
+                           (log2Height << R200_FACE_HEIGHT_1_SHIFT) |
+                           (log2Width << R200_FACE_WIDTH_2_SHIFT) |
+                           (log2Height << R200_FACE_HEIGHT_2_SHIFT) |
+                           (log2Width << R200_FACE_WIDTH_3_SHIFT) |
+                           (log2Height << R200_FACE_HEIGHT_3_SHIFT) |
+                           (log2Width << R200_FACE_WIDTH_4_SHIFT) |
+                           (log2Height << R200_FACE_HEIGHT_4_SHIFT));
    }
-
-   set_re_cntl_d3d( ctx, unit, GL_TRUE );
-
-   return GL_TRUE;
-}
-#endif
-
-static GLboolean enable_tex_cube( GLcontext *ctx, int unit )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
-   GLuint face;
-
-   /* Need to load the 2d images associated with this unit.
-    */
-   if (t->pp_txformat & R200_TXFORMAT_NON_POWER2) {
-      t->pp_txformat &= ~R200_TXFORMAT_NON_POWER2;
-      for (face = 0; face < 6; face++)
-         t->base.dirty_images[face] = ~0;
+   else {
+      /* If we don't in fact send enough texture coordinates, q will be 1,
+       * making TEXCOORD_PROJ act like TEXCOORD_NONPROJ (Right?)
+       */
+      t->pp_txformat_x |= R200_TEXCOORD_PROJ;
    }
 
-   ASSERT(tObj->Target == GL_TEXTURE_CUBE_MAP);
+   t->pp_txsize = (((firstImage->Width - 1) << R200_PP_TX_WIDTHMASK_SHIFT)
+		   | ((firstImage->Height - 1) << R200_PP_TX_HEIGHTMASK_SHIFT));
 
-   if ( t->base.dirty_images[0] || t->base.dirty_images[1] ||
-        t->base.dirty_images[2] || t->base.dirty_images[3] ||
-        t->base.dirty_images[4] || t->base.dirty_images[5] ) {
-      /* flush */
-      R200_FIREVERTICES( rmesa );
-      /* layout memory space, once for all faces */
-      r200SetTexImages( rmesa, tObj );
-   }
-
-   /* upload (per face) */
-   for (face = 0; face < 6; face++) {
-      if (t->base.dirty_images[face]) {
-         r200UploadTexImages( rmesa, (r200TexObjPtr) tObj->DriverData, face );
-      }
-   }
-      
-   if ( !t->base.memBlock ) {
-      /* texmem alloc failed, use s/w fallback */
-      return GL_FALSE;
+   if ( !t->image_override ) {
+      if (firstImage->IsCompressed)
+         t->pp_txpitch = (firstImage->Width + 63) & ~(63);
+      else
+         t->pp_txpitch = ((firstImage->Width * texelBytes) + 63) & ~(63);
+      t->pp_txpitch -= 32;
    }
 
-   set_re_cntl_d3d( ctx, unit, GL_TRUE );
-
-   return GL_TRUE;
-}
-
-static GLboolean enable_tex_rect( GLcontext *ctx, int unit )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
-
-   if (!(t->pp_txformat & R200_TXFORMAT_NON_POWER2)) {
+   if (t->base.Target == GL_TEXTURE_RECTANGLE_NV) {
       t->pp_txformat |= R200_TXFORMAT_NON_POWER2;
-      t->base.dirty_images[0] = ~0;
-   }
-
-   ASSERT(tObj->Target == GL_TEXTURE_RECTANGLE_NV);
-
-   if ( t->base.dirty_images[0] ) {
-      R200_FIREVERTICES( rmesa );
-      r200SetTexImages( rmesa, tObj );
-      r200UploadTexImages( rmesa, (r200TexObjPtr) tObj->DriverData, 0 );
-      if ( !t->base.memBlock &&
-           !t->image_override &&
-           !rmesa->prefer_gart_client_texturing ) 
-	 return GL_FALSE;
    }
 
-   set_re_cntl_d3d( ctx, unit, GL_FALSE );
-
-   return GL_TRUE;
 }
 
-
-static GLboolean update_tex_common( GLcontext *ctx, int unit )
+static GLboolean r200_validate_texture(GLcontext *ctx, struct gl_texture_object *texObj, int unit)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
-
-   /* Fallback if there's a texture border */
-   if ( tObj->Image[0][tObj->BaseLevel]->Border > 0 )
-       return GL_FALSE;
-
-   /* Update state if this is a different texture object to last
-    * time.
-    */
-   if ( rmesa->state.texture.unit[unit].texobj != t ) {
-      if ( rmesa->state.texture.unit[unit].texobj != NULL ) {
-	 /* The old texture is no longer bound to this texture unit.
-	  * Mark it as such.
-	  */
+   radeonTexObj *t = radeon_tex_obj(texObj);
 
-	 rmesa->state.texture.unit[unit].texobj->base.bound &= 
-	     ~(1UL << unit);
-      }
-
-      rmesa->state.texture.unit[unit].texobj = t;
-      t->base.bound |= (1UL << unit);
-      t->dirty_state |= 1<<unit;
-      driUpdateTextureLRU( (driTextureObject *) t ); /* XXX: should be locked! */
-   }
-
-
-   /* Newly enabled?
-    */
-   if ( 1|| !(rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (R200_TEX_0_ENABLE<<unit))) {
-      R200_STATECHANGE( rmesa, ctx );
-      rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_0_ENABLE << unit;
-
-      R200_STATECHANGE( rmesa, vtx );
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] &= ~(7 << (unit * 3));
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] |= 4 << (unit * 3);
+   if (!radeon_validate_texture_miptree(ctx, texObj))
+      return GL_FALSE;
 
-      rmesa->recheck_texgen[unit] = GL_TRUE;
-   }
+   r200_validate_texgen(ctx, unit);
+   /* Configure the hardware registers (more precisely, the cached version
+    * of the hardware registers). */
+   setup_hardware_state(rmesa, t);
+
+   if (texObj->Target == GL_TEXTURE_RECTANGLE_NV ||
+       texObj->Target == GL_TEXTURE_2D ||
+       texObj->Target == GL_TEXTURE_1D)
+      set_re_cntl_d3d( ctx, unit, GL_FALSE );
+   else
+      set_re_cntl_d3d( ctx, unit, GL_TRUE );
+   R200_STATECHANGE( rmesa, ctx );
+   rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_0_ENABLE << unit;
+   
+   R200_STATECHANGE( rmesa, vtx );
+   rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] &= ~(7 << (unit * 3));
+   rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] |= 4 << (unit * 3);
 
-   if (t->dirty_state & (1<<unit)) {
-      import_tex_obj_state( rmesa, unit, t );
-   }
+   rmesa->recheck_texgen[unit] = GL_TRUE;
+   import_tex_obj_state( rmesa, unit, t );
 
    if (rmesa->recheck_texgen[unit]) {
       GLboolean fallback = !r200_validate_texgen( ctx, unit );
       TCL_FALLBACK( ctx, (R200_TCL_FALLBACK_TEXGEN_0<<unit), fallback);
       rmesa->recheck_texgen[unit] = 0;
-      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+      rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
    }
 
-   FALLBACK( rmesa, R200_FALLBACK_BORDER_MODE, t->border_fallback );
-   return !t->border_fallback;
-}
+   t->validated = GL_TRUE;
 
+   FALLBACK( rmesa, RADEON_FALLBACK_BORDER_MODE, t->border_fallback );
 
+   return !t->border_fallback;
+}
 
-static GLboolean r200UpdateTextureUnit( GLcontext *ctx, int unit )
+static GLboolean r200UpdateTextureUnit(GLcontext *ctx, int unit)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    GLuint unitneeded = rmesa->state.texture.unit[unit].unitneeded;
 
-   if ( unitneeded & (TEXTURE_RECT_BIT) ) {
-      return (enable_tex_rect( ctx, unit ) &&
-	      update_tex_common( ctx, unit ));
-   }
-   else if ( unitneeded & (TEXTURE_1D_BIT | TEXTURE_2D_BIT) ) {
-      return (enable_tex_2d( ctx, unit ) &&
-	      update_tex_common( ctx, unit ));
-   }
-#if ENABLE_HW_3D_TEXTURE
-   else if ( unitneeded & (TEXTURE_3D_BIT) ) {
-      return (enable_tex_3d( ctx, unit ) &&
-	      update_tex_common( ctx, unit ));
-   }
-#endif
-   else if ( unitneeded & (TEXTURE_CUBE_BIT) ) {
-      return (enable_tex_cube( ctx, unit ) &&
-	      update_tex_common( ctx, unit ));
-   }
-   else if ( unitneeded ) {
-      return GL_FALSE;
-   }
-   else {
-      disable_tex( ctx, unit );
-      return GL_TRUE;
+   if (!unitneeded) {
+      /* disable the unit */
+     disable_tex_obj_state(rmesa, unit);
+     return GL_TRUE;
    }
+
+   if (!r200_validate_texture(ctx, ctx->Texture.Unit[unit]._Current, unit)) {
+    _mesa_warning(ctx,
+		  "failed to validate texture for unit %d.\n",
+		  unit);
+    rmesa->state.texture.unit[unit].texobj = NULL;
+    return GL_FALSE;
+  }
+
+   rmesa->state.texture.unit[unit].texobj = radeon_tex_obj(ctx->Texture.Unit[unit]._Current);
+  return GL_TRUE;
 }
 
 
@@ -1846,11 +1621,11 @@ void r200UpdateTextureState( GLcontext *ctx )
 
    FALLBACK( rmesa, R200_FALLBACK_TEXTURE, !ok );
 
-   if (rmesa->TclFallback)
+   if (rmesa->radeon.TclFallback)
       r200ChooseVertexState( ctx );
 
 
-   if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200) {
+   if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R200) {
 
       /*
        * T0 hang workaround -------------
@@ -1863,7 +1638,7 @@ void r200UpdateTextureState( GLcontext *ctx )
 	 R200_STATECHANGE(rmesa, tex[1]);
 	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_1_ENABLE;
 	 if (!(rmesa->hw.cst.cmd[CST_PP_CNTL_X] & R200_PPX_TEX_1_ENABLE))
-	    rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] &= ~TEXOBJ_TXFORMAT_MASK;
+	   rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] &= ~TEXOBJ_TXFORMAT_MASK;
 	 rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] |= R200_TXFORMAT_LOOKUP_DISABLE;
       }
       else if (!ctx->ATIFragmentShader._Enabled) {
diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.c b/src/mesa/drivers/dri/r200/r200_vertprog.c
index 562992fbb5..11405d7cae 100644
--- a/src/mesa/drivers/dri/r200/r200_vertprog.c
+++ b/src/mesa/drivers/dri/r200/r200_vertprog.c
@@ -202,7 +202,7 @@ static unsigned long t_dst(struct prog_dst_register *dst)
    }
 }
 
-static unsigned long t_src_class(enum register_file file)
+static unsigned long t_src_class(gl_register_file file)
 {
 
    switch(file){
@@ -290,7 +290,7 @@ static unsigned long t_src(struct r200_vertex_program *vp, struct prog_src_regis
 			t_swizzle(GET_SWZ(src->Swizzle, 2)),
 			t_swizzle(GET_SWZ(src->Swizzle, 3)),
 			t_src_class(src->File),
-			src->NegateBase) | (src->RelAddr << 4);
+			src->Negate) | (src->RelAddr << 4);
 }
 
 static unsigned long t_src_scalar(struct r200_vertex_program *vp, struct prog_src_register *src)
@@ -302,7 +302,7 @@ static unsigned long t_src_scalar(struct r200_vertex_program *vp, struct prog_sr
 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
 			t_swizzle(GET_SWZ(src->Swizzle, 0)),
 			t_src_class(src->File),
-			src->NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src->RelAddr << 4);
+			src->Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src->RelAddr << 4);
 }
 
 static unsigned long t_opcode(enum prog_opcode opcode)
@@ -423,7 +423,7 @@ static GLboolean r200_translate_vertex_program(GLcontext *ctx, struct r200_verte
       ~(VERT_BIT_POS | VERT_BIT_NORMAL | VERT_BIT_COLOR0 | VERT_BIT_COLOR1 |
       VERT_BIT_FOG | VERT_BIT_TEX0 | VERT_BIT_TEX1 | VERT_BIT_TEX2 |
       VERT_BIT_TEX3 | VERT_BIT_TEX4 | VERT_BIT_TEX5)) != 0) {
-      if (R200_DEBUG & DEBUG_FALLBACKS) {
+      if (R200_DEBUG & RADEON_FALLBACKS) {
 	 fprintf(stderr, "can't handle vert prog inputs 0x%x\n",
 	    mesa_vp->Base.InputsRead);
       }
@@ -436,7 +436,7 @@ static GLboolean r200_translate_vertex_program(GLcontext *ctx, struct r200_verte
       (1 << VERT_RESULT_FOGC) | (1 << VERT_RESULT_TEX0) | (1 << VERT_RESULT_TEX1) |
       (1 << VERT_RESULT_TEX2) | (1 << VERT_RESULT_TEX3) | (1 << VERT_RESULT_TEX4) |
       (1 << VERT_RESULT_TEX5) | (1 << VERT_RESULT_PSIZ))) != 0) {
-      if (R200_DEBUG & DEBUG_FALLBACKS) {
+      if (R200_DEBUG & RADEON_FALLBACKS) {
 	 fprintf(stderr, "can't handle vert prog outputs 0x%x\n",
 	    mesa_vp->Base.OutputsWritten);
       }
@@ -551,7 +551,7 @@ static GLboolean r200_translate_vertex_program(GLcontext *ctx, struct r200_verte
       if (mesa_vp->Base.InputsRead & (1 << i)) {
 	 array_count++;
 	 if (array_count > 12) {
-	    if (R200_DEBUG & DEBUG_FALLBACKS) {
+	    if (R200_DEBUG & RADEON_FALLBACKS) {
 	       fprintf(stderr, "more than 12 attribs used in vert prog\n");
 	    }
 	    return GL_FALSE;
@@ -571,13 +571,13 @@ static GLboolean r200_translate_vertex_program(GLcontext *ctx, struct r200_verte
    }
 
    if (!(mesa_vp->Base.OutputsWritten & (1 << VERT_RESULT_HPOS))) {
-      if (R200_DEBUG & DEBUG_FALLBACKS) {
+      if (R200_DEBUG & RADEON_FALLBACKS) {
 	 fprintf(stderr, "can't handle vert prog without position output\n");
       }
       return GL_FALSE;
    }
    if (free_inputs & 1) {
-      if (R200_DEBUG & DEBUG_FALLBACKS) {
+      if (R200_DEBUG & RADEON_FALLBACKS) {
 	 fprintf(stderr, "can't handle vert prog without position input\n");
       }
       return GL_FALSE;
@@ -700,7 +700,7 @@ static GLboolean r200_translate_vertex_program(GLcontext *ctx, struct r200_verte
 		   t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
 		   SWIZZLE_ZERO,
 		   t_src_class(src[0].File),
-		   src[0].NegateBase) | (src[0].RelAddr << 4);
+		   src[0].Negate) | (src[0].RelAddr << 4);
 	    o_inst->src1 = UNUSED_SRC_0;
 	    o_inst->src2 = UNUSED_SRC_0;
 	 }
@@ -712,12 +712,12 @@ static GLboolean r200_translate_vertex_program(GLcontext *ctx, struct r200_verte
 		   t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
 		   SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO,
 		   t_src_class(src[0].File),
-		   src[0].NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
+		   src[0].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
 	    o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
 		   SWIZZLE_ZERO, SWIZZLE_ZERO,
 		   t_swizzle(GET_SWZ(src[1].Swizzle, 0)), SWIZZLE_ZERO,
 		   t_src_class(src[1].File),
-		   src[1].NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
+		   src[1].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
 	    o_inst->src2 = UNUSED_SRC_1;
 	    o_inst++;
 
@@ -766,11 +766,11 @@ if ((o_inst - vp->instr) == 31) {
 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
 			SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X,
 			t_src_class(src[1].File),
-			src[1].NegateBase) | (src[1].RelAddr << 4);
+			src[1].Negate) | (src[1].RelAddr << 4);
 o_inst->src2 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
 			SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y,
 			t_src_class(src[1].File),
-			src[1].NegateBase) | (src[1].RelAddr << 4);
+			src[1].Negate) | (src[1].RelAddr << 4);
 }
 else {
 	 o_inst->src1 = t_src(vp, &src[1]);
@@ -792,7 +792,7 @@ else {
 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
 		SWIZZLE_ZERO,
 		t_src_class(src[0].File),
-		src[0].NegateBase) | (src[0].RelAddr << 4);
+		src[0].Negate) | (src[0].RelAddr << 4);
 
 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
 		t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
@@ -800,7 +800,7 @@ else {
 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
 		SWIZZLE_ZERO,
 		t_src_class(src[1].File),
-		src[1].NegateBase) | (src[1].RelAddr << 4);
+		src[1].Negate) | (src[1].RelAddr << 4);
 
 	 o_inst->src2 = UNUSED_SRC_1;
 	 goto next;
@@ -815,7 +815,7 @@ else {
 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
 		VSF_IN_COMPONENT_ONE,
 		t_src_class(src[0].File),
-		src[0].NegateBase) | (src[0].RelAddr << 4);
+		src[0].Negate) | (src[0].RelAddr << 4);
 	 o_inst->src1 = t_src(vp, &src[1]);
 	 o_inst->src2 = UNUSED_SRC_1;
 	 goto next;
@@ -831,7 +831,7 @@ else {
 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
 		t_swizzle(GET_SWZ(src[1].Swizzle, 3)),
 		t_src_class(src[1].File),
-		(!src[1].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
+		(!src[1].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
 	 o_inst->src2 = UNUSED_SRC_1;
 	 goto next;
 
@@ -846,7 +846,7 @@ else {
 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
 		t_swizzle(GET_SWZ(src[0].Swizzle, 3)),
 		t_src_class(src[0].File),
-		(!src[0].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
+		(!src[0].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
 	 o_inst->src2 = UNUSED_SRC_1;
 	 goto next;
 
@@ -874,7 +874,7 @@ else {
 		VSF_IN_COMPONENT_W,
 		VSF_IN_CLASS_TMP,
 		/* Not 100% sure about this */
-		(!src[0].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE/*VSF_FLAG_ALL*/);
+		(!src[0].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE/*VSF_FLAG_ALL*/);
 
 	 o_inst->src2 = UNUSED_SRC_0;
 	 u_temp_i--;
@@ -899,7 +899,7 @@ else {
 		t_swizzle(GET_SWZ(src[0].Swizzle, 0)), // x
 		t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w
 		t_src_class(src[0].File),
-		src[0].NegateBase) | (src[0].RelAddr << 4);
+		src[0].Negate) | (src[0].RelAddr << 4);
 
 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
 		t_swizzle(GET_SWZ(src[1].Swizzle, 2)), // z
@@ -907,7 +907,7 @@ else {
 		t_swizzle(GET_SWZ(src[1].Swizzle, 1)), // y
 		t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w
 		t_src_class(src[1].File),
-		src[1].NegateBase) | (src[1].RelAddr << 4);
+		src[1].Negate) | (src[1].RelAddr << 4);
 
 	 o_inst->src2 = UNUSED_SRC_1;
 	 o_inst++;
@@ -922,7 +922,7 @@ else {
 		t_swizzle(GET_SWZ(src[1].Swizzle, 0)), // x
 		t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w
 		t_src_class(src[1].File),
-		(!src[1].NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
+		(!src[1].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
 
 	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
 		t_swizzle(GET_SWZ(src[0].Swizzle, 2)), // z
@@ -930,7 +930,7 @@ else {
 		t_swizzle(GET_SWZ(src[0].Swizzle, 1)), // y
 		t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w
 		t_src_class(src[0].File),
-		src[0].NegateBase) | (src[0].RelAddr << 4);
+		src[0].Negate) | (src[0].RelAddr << 4);
 
 	 o_inst->src2 = MAKE_VSF_SOURCE(u_temp_i+1,
 		VSF_IN_COMPONENT_X,
@@ -1070,7 +1070,7 @@ else {
 	    mesa_vp->Base.NumTemporaries + u_temp_used;
       }
       if ((mesa_vp->Base.NumTemporaries + u_temp_used) > R200_VSF_MAX_TEMPS) {
-	 if (R200_DEBUG & DEBUG_FALLBACKS) {
+	 if (R200_DEBUG & RADEON_FALLBACKS) {
 	    fprintf(stderr, "Ran out of temps, num temps %d, us %d\n", mesa_vp->Base.NumTemporaries, u_temp_used);
 	 }
 	 return GL_FALSE;
@@ -1078,7 +1078,7 @@ else {
       u_temp_i = R200_VSF_MAX_TEMPS - 1;
       if(o_inst - vp->instr >= R200_VSF_MAX_INST) {
 	 mesa_vp->Base.NumNativeInstructions = 129;
-	 if (R200_DEBUG & DEBUG_FALLBACKS) {
+	 if (R200_DEBUG & RADEON_FALLBACKS) {
 	    fprintf(stderr, "more than 128 native instructions\n");
 	 }
 	 return GL_FALSE;
@@ -1110,9 +1110,9 @@ void r200SetupVertexProg( GLcontext *ctx ) {
    }
    /* could optimize setting up vertex progs away for non-tcl hw */
    fallback = !(vp->native && r200VertexProgUpdateParams(ctx, vp) &&
-      rmesa->r200Screen->drmSupportsVertexProgram);
+      rmesa->radeon.radeonScreen->drmSupportsVertexProgram);
    TCL_FALLBACK(ctx, R200_TCL_FALLBACK_VERTEX_PROGRAM, fallback);
-   if (rmesa->TclFallback) return;
+   if (rmesa->radeon.TclFallback) return;
 
    R200_STATECHANGE( rmesa, vap );
    /* FIXME: fglrx sets R200_VAP_SINGLE_BUF_STATE_ENABLE too. Do we need it?
diff --git a/src/mesa/drivers/dri/r200/radeon_bo_legacy.c b/src/mesa/drivers/dri/r200/radeon_bo_legacy.c
new file mode 120000
index 0000000000..79ad050e6b
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_bo_legacy.c
@@ -0,0 +1 @@
+../radeon/radeon_bo_legacy.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_bo_legacy.h b/src/mesa/drivers/dri/r200/radeon_bo_legacy.h
new file mode 120000
index 0000000000..83b0f7ffab
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_bo_legacy.h
@@ -0,0 +1 @@
+../radeon/radeon_bo_legacy.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_bocs_wrapper.h b/src/mesa/drivers/dri/r200/radeon_bocs_wrapper.h
new file mode 120000
index 0000000000..ca894b2443
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_bocs_wrapper.h
@@ -0,0 +1 @@
+../radeon/radeon_bocs_wrapper.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_chipset.h b/src/mesa/drivers/dri/r200/radeon_chipset.h
new file mode 120000
index 0000000000..eba99001ff
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_chipset.h
@@ -0,0 +1 @@
+../radeon/radeon_chipset.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_cmdbuf.h b/src/mesa/drivers/dri/r200/radeon_cmdbuf.h
new file mode 120000
index 0000000000..a799e1dc6d
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_cmdbuf.h
@@ -0,0 +1 @@
+../radeon/radeon_cmdbuf.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_common.c b/src/mesa/drivers/dri/r200/radeon_common.c
new file mode 120000
index 0000000000..67b19ba940
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_common.c
@@ -0,0 +1 @@
+../radeon/radeon_common.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_common.h b/src/mesa/drivers/dri/r200/radeon_common.h
new file mode 120000
index 0000000000..5bcb696a9f
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_common.h
@@ -0,0 +1 @@
+../radeon/radeon_common.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_common_context.c b/src/mesa/drivers/dri/r200/radeon_common_context.c
new file mode 120000
index 0000000000..86800f3819
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_common_context.c
@@ -0,0 +1 @@
+../radeon/radeon_common_context.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_common_context.h b/src/mesa/drivers/dri/r200/radeon_common_context.h
new file mode 120000
index 0000000000..4d66312550
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_common_context.h
@@ -0,0 +1 @@
+../radeon/radeon_common_context.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_cs_legacy.c b/src/mesa/drivers/dri/r200/radeon_cs_legacy.c
new file mode 120000
index 0000000000..006720f8a4
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_cs_legacy.c
@@ -0,0 +1 @@
+../radeon/radeon_cs_legacy.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_cs_legacy.h b/src/mesa/drivers/dri/r200/radeon_cs_legacy.h
new file mode 120000
index 0000000000..a5f95e0a3d
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_cs_legacy.h
@@ -0,0 +1 @@
+../radeon/radeon_cs_legacy.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_cs_space_drm.c b/src/mesa/drivers/dri/r200/radeon_cs_space_drm.c
new file mode 120000
index 0000000000..c248ea7d1a
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_cs_space_drm.c
@@ -0,0 +1 @@
+../radeon/radeon_cs_space_drm.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_debug.c b/src/mesa/drivers/dri/r200/radeon_debug.c
new file mode 120000
index 0000000000..c98c2e074c
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_debug.c
@@ -0,0 +1 @@
+../radeon/radeon_debug.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_debug.h b/src/mesa/drivers/dri/r200/radeon_debug.h
new file mode 120000
index 0000000000..bd8aa28e89
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_debug.h
@@ -0,0 +1 @@
+../radeon/radeon_debug.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_dma.c b/src/mesa/drivers/dri/r200/radeon_dma.c
new file mode 120000
index 0000000000..43be000625
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_dma.c
@@ -0,0 +1 @@
+../radeon/radeon_dma.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_dma.h b/src/mesa/drivers/dri/r200/radeon_dma.h
new file mode 120000
index 0000000000..82e50634e3
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_dma.h
@@ -0,0 +1 @@
+../radeon/radeon_dma.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_fbo.c b/src/mesa/drivers/dri/r200/radeon_fbo.c
new file mode 120000
index 0000000000..0d738d8d78
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_fbo.c
@@ -0,0 +1 @@
+../radeon/radeon_fbo.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_lock.c b/src/mesa/drivers/dri/r200/radeon_lock.c
new file mode 120000
index 0000000000..af4108a8e3
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_lock.c
@@ -0,0 +1 @@
+../radeon/radeon_lock.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_lock.h b/src/mesa/drivers/dri/r200/radeon_lock.h
new file mode 120000
index 0000000000..64bdf94ee7
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_lock.h
@@ -0,0 +1 @@
+../radeon/radeon_lock.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_mipmap_tree.c b/src/mesa/drivers/dri/r200/radeon_mipmap_tree.c
new file mode 120000
index 0000000000..31c0cfbe94
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_mipmap_tree.c
@@ -0,0 +1 @@
+../radeon/radeon_mipmap_tree.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_mipmap_tree.h b/src/mesa/drivers/dri/r200/radeon_mipmap_tree.h
new file mode 120000
index 0000000000..254d50cf8c
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_mipmap_tree.h
@@ -0,0 +1 @@
+../radeon/radeon_mipmap_tree.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_queryobj.c b/src/mesa/drivers/dri/r200/radeon_queryobj.c
new file mode 120000
index 0000000000..1d6ebc1c48
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_queryobj.c
@@ -0,0 +1 @@
+../radeon/radeon_queryobj.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_queryobj.h b/src/mesa/drivers/dri/r200/radeon_queryobj.h
new file mode 120000
index 0000000000..8f6f842b0a
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_queryobj.h
@@ -0,0 +1 @@
+../radeon/radeon_queryobj.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_screen.c b/src/mesa/drivers/dri/r200/radeon_screen.c
new file mode 120000
index 0000000000..86161118dd
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_screen.c
@@ -0,0 +1 @@
+../radeon/radeon_screen.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_screen.h b/src/mesa/drivers/dri/r200/radeon_screen.h
new file mode 120000
index 0000000000..23bb6bd459
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_screen.h
@@ -0,0 +1 @@
+../radeon/radeon_screen.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_span.c b/src/mesa/drivers/dri/r200/radeon_span.c
new file mode 120000
index 0000000000..232868c4c9
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_span.c
@@ -0,0 +1 @@
+../radeon/radeon_span.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_span.h b/src/mesa/drivers/dri/r200/radeon_span.h
new file mode 120000
index 0000000000..f9d634508c
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_span.h
@@ -0,0 +1 @@
+../radeon/radeon_span.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_texture.c b/src/mesa/drivers/dri/r200/radeon_texture.c
new file mode 120000
index 0000000000..a822710915
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_texture.c
@@ -0,0 +1 @@
+../radeon/radeon_texture.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/radeon_texture.h b/src/mesa/drivers/dri/r200/radeon_texture.h
new file mode 120000
index 0000000000..17fac3d5ea
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/radeon_texture.h
@@ -0,0 +1 @@
+../radeon/radeon_texture.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/server/radeon.h b/src/mesa/drivers/dri/r200/server/radeon.h
new file mode 120000
index 0000000000..81274a54f1
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/server/radeon.h
@@ -0,0 +1 @@
+../../radeon/server/radeon.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/server/radeon_dri.c b/src/mesa/drivers/dri/r200/server/radeon_dri.c
new file mode 120000
index 0000000000..d05847d650
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/server/radeon_dri.c
@@ -0,0 +1 @@
+../../radeon/server/radeon_dri.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/server/radeon_dri.h b/src/mesa/drivers/dri/r200/server/radeon_dri.h
new file mode 120000
index 0000000000..27c591d3c9
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/server/radeon_dri.h
@@ -0,0 +1 @@
+../../radeon/server/radeon_dri.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/server/radeon_egl.c b/src/mesa/drivers/dri/r200/server/radeon_egl.c
new file mode 120000
index 0000000000..d7735a7643
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/server/radeon_egl.c
@@ -0,0 +1 @@
+../../radeon/server/radeon_egl.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/server/radeon_macros.h b/src/mesa/drivers/dri/r200/server/radeon_macros.h
new file mode 120000
index 0000000000..c56cd735b8
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/server/radeon_macros.h
@@ -0,0 +1 @@
+../../radeon/server/radeon_macros.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r200/server/radeon_reg.h b/src/mesa/drivers/dri/r200/server/radeon_reg.h
new file mode 120000
index 0000000000..e2349dcb68
--- /dev/null
+++ b/src/mesa/drivers/dri/r200/server/radeon_reg.h
@@ -0,0 +1 @@
+../../radeon/server/radeon_reg.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/.gitignore b/src/mesa/drivers/dri/r300/.gitignore
deleted file mode 100644
index 3689a6a78e..0000000000
--- a/src/mesa/drivers/dri/r300/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-radeon_chipset.h
-radeon_screen.[ch]
-radeon_span.h
-server
diff --git a/src/mesa/drivers/dri/r300/Makefile b/src/mesa/drivers/dri/r300/Makefile
index 6ca934204f..5d8d6f6658 100644
--- a/src/mesa/drivers/dri/r300/Makefile
+++ b/src/mesa/drivers/dri/r300/Makefile
@@ -3,6 +3,8 @@
 TOP = ../../../../..
 include $(TOP)/configs/current
 
+CFLAGS += $(RADEON_CFLAGS)
+
 LIBNAME = r300_dri.so
 
 MINIGLX_SOURCES = server/radeon_dri.c
@@ -11,6 +13,10 @@ ifeq ($(USING_EGL), 1)
 EGL_SOURCES = server/radeon_egl.c
 endif
 
+ifeq ($(RADEON_LDFLAGS),)
+CS_SOURCES = radeon_cs_space_drm.c
+endif
+
 COMMON_SOURCES = \
 	../../common/driverfuncs.c \
 	../common/mm.c \
@@ -20,67 +26,59 @@ COMMON_SOURCES = \
 	../common/xmlconfig.c \
 	../common/dri_util.c
 
+RADEON_COMMON_SOURCES = \
+	radeon_bo_legacy.c \
+	radeon_buffer_objects.c \
+	radeon_common_context.c \
+	radeon_common.c \
+	radeon_cs_legacy.c \
+	radeon_dma.c \
+	radeon_debug.c \
+	radeon_fbo.c \
+	radeon_lock.c \
+	radeon_mipmap_tree.c \
+	radeon_span.c \
+	radeon_queryobj.c \
+	radeon_texture.c
+
 DRIVER_SOURCES = \
 		 radeon_screen.c \
-		 radeon_context.c \
-		 radeon_ioctl.c \
-		 radeon_lock.c \
-		 radeon_span.c \
-		 radeon_state.c \
-		 r300_mem.c \
 		 r300_context.c \
+		 r300_draw.c \
 		 r300_ioctl.c \
 		 r300_cmdbuf.c \
 		 r300_state.c \
 		 r300_render.c \
-		 r300_texmem.c \
 		 r300_tex.c \
 		 r300_texstate.c \
-		 radeon_program.c \
-		 radeon_program_alu.c \
-		 radeon_program_pair.c \
-		 radeon_nqssadce.c \
 		 r300_vertprog.c \
-		 r300_fragprog.c \
-		 r300_fragprog_swizzle.c \
-		 r300_fragprog_emit.c \
-		 r500_fragprog.c \
-		 r500_fragprog_emit.c \
+		 r300_fragprog_common.c \
 		 r300_shader.c \
 		 r300_emit.c \
 		 r300_swtcl.c \
-		 $(EGL_SOURCES)
+		 $(RADEON_COMMON_SOURCES) \
+		 $(EGL_SOURCES) \
+		 $(CS_SOURCES)
 
 C_SOURCES = $(COMMON_SOURCES) $(DRIVER_SOURCES)
 
 DRIVER_DEFINES = -DCOMPILE_R300 -DR200_MERGED=0 \
-	-DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R300
-
-SYMLINKS = \
-	server/radeon_dri.c \
-	server/radeon_dri.h \
-	server/radeon.h \
-	server/radeon_macros.h \
-	server/radeon_reg.h \
-	server/radeon_egl.c
-
-COMMON_SYMLINKS = \
-	radeon_chipset.h \
-	radeon_screen.c \
-	radeon_screen.h \
-	radeon_span.h
+	-DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R300 \
+	-Wall
+#	-DRADEON_BO_TRACK  (optional define; kept after the variable so this comment cannot swallow -Wall)
+
+DRI_LIB_DEPS += $(RADEON_LDFLAGS)
+
+PIPE_DRIVERS =  compiler/libr300compiler.a
 
 ##### TARGETS #####
 
 include ../Makefile.template
 
-server:
-	mkdir -p server
-
-$(SYMLINKS): server
-	@[ -e $@ ] || ln -sf ../../radeon/$@ server/
+symlinks:
 
-$(COMMON_SYMLINKS):
-	@[ -e $@ ] || ln -sf ../radeon/$@ ./
+# Mark the archive phony so that we always check for recompilation
+.PHONY : compiler/libr300compiler.a
 
-symlinks: $(SYMLINKS) $(COMMON_SYMLINKS)
+compiler/libr300compiler.a:
+	cd compiler && $(MAKE)
diff --git a/src/mesa/drivers/dri/r300/compiler/Makefile b/src/mesa/drivers/dri/r300/compiler/Makefile
new file mode 100644
index 0000000000..d973844192
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/Makefile
@@ -0,0 +1,75 @@
+# src/mesa/drivers/dri/r300/compiler/Makefile
+
+TOP = ../../../../../..
+include $(TOP)/configs/current
+
+LIBNAME = r300compiler
+
+C_SOURCES = \
+		radeon_code.c \
+		radeon_compiler.c \
+		radeon_nqssadce.c \
+		radeon_program.c \
+		radeon_program_alu.c \
+		radeon_program_pair.c \
+		r3xx_fragprog.c \
+		r300_fragprog.c \
+		r300_fragprog_swizzle.c \
+		r300_fragprog_emit.c \
+		r500_fragprog.c \
+		r500_fragprog_emit.c \
+		r3xx_vertprog.c \
+		r3xx_vertprog_dump.c \
+		\
+		memory_pool.c
+
+
+### Basic defines ###
+
+OBJECTS = $(C_SOURCES:.c=.o) \
+	$(CPP_SOURCES:.cpp=.o) \
+	$(ASM_SOURCES:.S=.o)
+
+INCLUDES = \
+	-I. \
+	-I$(TOP)/include \
+	-I$(TOP)/src/mesa \
+
+
+##### TARGETS #####
+
+default: depend lib$(LIBNAME).a
+# Static archive; also rebuilt when this Makefile or the active config changes
+lib$(LIBNAME).a: $(OBJECTS) Makefile $(TOP)/configs/current
+	$(MKLIB) -o $(LIBNAME) -static $(OBJECTS)
+# Start a fresh `depend` file and run $(MKDEP) over all sources (`depend` is read back by the sinclude at the end)
+depend: $(C_SOURCES) $(CPP_SOURCES) $(ASM_SOURCES) $(SYMLINKS)
+	rm -f depend
+	touch depend
+	$(MKDEP) $(MKDEP_OPTIONS) $(INCLUDES) $(C_SOURCES) $(CPP_SOURCES) $(ASM_SOURCES) 2> /dev/null
+
+# Emacs tags (etags) over the .c/.h files below this directory
+tags:
+	etags `find . -name \*.[ch]` `find ../include`
+
+# Remove objects, the static archive and the dependency files
+clean:
+	rm -f $(OBJECTS) lib$(LIBNAME).a depend depend.bak
+
+# Dummy target: nothing is installed from this directory
+install:
+	@echo -n ""
+
+##### RULES #####
+
+.c.o:
+	$(CC) -c $(INCLUDES) $(CFLAGS) $(LIBRARY_DEFINES) $< -o $@
+
+.cpp.o:
+	$(CXX) -c $(INCLUDES) $(CXXFLAGS) $(LIBRARY_DEFINES) $< -o $@
+
+.S.o:
+	$(CC) -c $(INCLUDES) $(CFLAGS) $(LIBRARY_DEFINES)  $< -o $@
+
+
+sinclude depend
diff --git a/src/mesa/drivers/dri/r300/compiler/memory_pool.c b/src/mesa/drivers/dri/r300/compiler/memory_pool.c
new file mode 100644
index 0000000000..37aa2b6579
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/memory_pool.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "memory_pool.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+#define POOL_LARGE_ALLOC 4096 /* requests of at least this many bytes get a dedicated block */
+#define POOL_ALIGN 8 /* power of two; 8 keeps pointer-bearing pool objects aligned on 64-bit targets */
+
+
+struct memory_block { /* header placed at the start of every malloc'ed block; the usable bytes follow it */
+	struct memory_block * next; /* next block in the owning pool's list, or NULL at the tail */
+};
+
+void memory_pool_init(struct memory_pool * pool)
+{
+	memset(pool, 0, sizeof(*pool)); /* every field zeroed: no blocks, empty bump region */
+}
+
+
+void memory_pool_destroy(struct memory_pool * pool)
+{
+	struct memory_block * cur; /* block currently being released */
+	while ((cur = pool->blocks) != NULL) { /* pop and free until the list is empty */
+		pool->blocks = cur->next;
+		free(cur);
+	}
+}
+
+/* Chain a fresh bump block onto the pool; the pool is left untouched if malloc fails. */
+static void refill_pool(struct memory_pool * pool)
+{
+	unsigned int blocksize = pool->total_allocated; /* new block matches everything allocated so far */
+	struct memory_block * newblock;
+
+	if (!blocksize)
+		blocksize = 2*POOL_LARGE_ALLOC;
+	newblock = (struct memory_block*)malloc(blocksize);
+	if (!newblock)
+		return;
+	newblock->next = pool->blocks;
+	pool->blocks = newblock;
+	pool->head = (unsigned char*)(newblock + 1);
+	pool->end = ((unsigned char*)newblock) + blocksize;
+	pool->total_allocated += blocksize;
+}
+
+/* Allocate bytes from the pool (never freed individually); returns NULL if malloc fails. */
+void * memory_pool_malloc(struct memory_pool * pool, unsigned int bytes)
+{
+	if (bytes < POOL_LARGE_ALLOC) {
+		void * ptr;
+		if (pool->head + bytes > pool->end)
+			refill_pool(pool);
+		if (pool->head + bytes > pool->end)
+			return NULL; /* refill could not obtain memory */
+		ptr = pool->head;
+		pool->head += bytes;
+		pool->head = (unsigned char*)(((unsigned long)pool->head + POOL_ALIGN - 1) & ~(POOL_ALIGN - 1));
+		return ptr;
+	} else {
+		/* Large request: use a dedicated block, still linked in so destroy frees it. */
+		struct memory_block * block = (struct memory_block*)malloc(bytes + sizeof(struct memory_block));
+		if (!block)
+			return NULL;
+		block->next = pool->blocks;
+		pool->blocks = block;
+		return (block + 1);
+	}
+}
+
+
diff --git a/src/mesa/drivers/dri/r300/compiler/memory_pool.h b/src/mesa/drivers/dri/r300/compiler/memory_pool.h
new file mode 100644
index 0000000000..ce23c319ad
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/memory_pool.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef MEMORY_POOL_H
+#define MEMORY_POOL_H
+
+struct memory_block;
+
+/**
+ * Provides a pool of memory that can quickly be allocated from, at the
+ * cost of being unable to explicitly free one of the allocated blocks.
+ * Instead, the entire pool can be freed at once.
+ *
+ * The idea is to allow one to quickly allocate a flexible amount of
+ * memory during operations like shader compilation while avoiding
+ * reference counting headaches.
+ */
+struct memory_pool {
+	unsigned char * head; /* next free byte in the current bump block */
+	unsigned char * end; /* one past the last byte of the current bump block */
+	unsigned int total_allocated; /* bytes obtained for bump blocks so far; dedicated large blocks are not counted */
+	struct memory_block * blocks; /* singly linked list of every malloc'ed block, newest first */
+};
+
+
+void memory_pool_init(struct memory_pool * pool); /* zero-fills *pool */
+void memory_pool_destroy(struct memory_pool * pool); /* frees every block, invalidating all pointers handed out */
+void * memory_pool_malloc(struct memory_pool * pool, unsigned int bytes); /* bump-allocates; requests of 4096+ bytes get their own block */
+
+#endif /* MEMORY_POOL_H */
diff --git a/src/mesa/drivers/dri/r300/compiler/r300_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r300_fragprog.c
new file mode 100644
index 0000000000..6c9fba4914
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/r300_fragprog.c
@@ -0,0 +1,416 @@
+/*
+ * Copyright (C) 2005 Ben Skeggs.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "r300_fragprog.h"
+
+#include "shader/prog_parameter.h"
+
+#include "../r300_reg.h"
+
+/* Source operand that replicates .w of the RC_STATE_SHADOW_AMBIENT state constant for texture unit tmu. */
+static struct prog_src_register shadow_ambient(struct radeon_compiler * c, int tmu)
+{
+	struct prog_src_register ambient = { 0, };
+	ambient.Swizzle = SWIZZLE_WWWW;
+	ambient.Index = rc_constants_add_state(&c->Program.Constants, RC_STATE_SHADOW_AMBIENT, tmu);
+	ambient.File = PROGRAM_STATE_VAR;
+	return ambient;
+}
+
+/**
+ * Lower TEX, TXB, TXP and KIL for the hardware; returns GL_FALSE for any other opcode, GL_TRUE otherwise:
+ *  - shadow samplers get a depth compare (RCP/MAD/CMP), or a plain MOV for GL_NEVER/GL_ALWAYS
+ *  - RECT coordinates are premultiplied by the RC_STATE_R300_TEXRECT_FACTOR constant
+ *  - temporaries are introduced when the destination or coordinate register is unsuitable
+ */
+GLboolean r300_transform_TEX(
+	struct radeon_compiler * c,
+	struct rc_instruction* inst,
+	void* data) /* the struct r300_fragment_program_compiler being compiled */
+{
+	struct r300_fragment_program_compiler *compiler =
+		(struct r300_fragment_program_compiler*)data;
+
+	if (inst->I.Opcode != OPCODE_TEX &&
+	    inst->I.Opcode != OPCODE_TXB &&
+	    inst->I.Opcode != OPCODE_TXP &&
+	    inst->I.Opcode != OPCODE_KIL)
+		return GL_FALSE;
+
+	/* ARB_shadow & EXT_shadow_funcs */
+	if (inst->I.Opcode != OPCODE_KIL &&
+	    c->Program.ShadowSamplers & (1 << inst->I.TexSrcUnit)) {
+		GLuint comparefunc = GL_NEVER + compiler->state.unit[inst->I.TexSrcUnit].texture_compare_func; /* stored as an offset from GL_NEVER */
+
+		if (comparefunc == GL_NEVER || comparefunc == GL_ALWAYS) {
+			inst->I.Opcode = OPCODE_MOV; /* constant result: no texture fetch is needed at all */
+
+			if (comparefunc == GL_ALWAYS) {
+				inst->I.SrcReg[0].File = PROGRAM_BUILTIN;
+				inst->I.SrcReg[0].Swizzle = SWIZZLE_1111;
+			} else {
+				inst->I.SrcReg[0] = shadow_ambient(c, inst->I.TexSrcUnit);
+			}
+
+			return GL_TRUE;
+		} else {
+			GLuint comparefunc = GL_NEVER + compiler->state.unit[inst->I.TexSrcUnit].texture_compare_func; /* same value as the outer comparefunc, which it shadows */
+			GLuint depthmode = compiler->state.unit[inst->I.TexSrcUnit].depth_texture_mode;
+			struct rc_instruction * inst_rcp = rc_insert_new_instruction(c, inst);
+			struct rc_instruction * inst_mad = rc_insert_new_instruction(c, inst_rcp);
+			struct rc_instruction * inst_cmp = rc_insert_new_instruction(c, inst_mad);
+			int pass, fail;
+
+			inst_rcp->I.Opcode = OPCODE_RCP; /* temp.w = 1 / coord.w */
+			inst_rcp->I.DstReg.File = PROGRAM_TEMPORARY;
+			inst_rcp->I.DstReg.Index = rc_find_free_temporary(c);
+			inst_rcp->I.DstReg.WriteMask = WRITEMASK_W;
+			inst_rcp->I.SrcReg[0] = inst->I.SrcReg[0];
+			inst_rcp->I.SrcReg[0].Swizzle = SWIZZLE_WWWW;
+
+			inst_cmp->I.DstReg = inst->I.DstReg; /* the final CMP writes the original destination... */
+			inst->I.DstReg.File = PROGRAM_TEMPORARY; /* ...while the texture fetch now lands in a full scratch temporary */
+			inst->I.DstReg.Index = rc_find_free_temporary(c);
+			inst->I.DstReg.WriteMask = WRITEMASK_XYZW;
+
+			inst_mad->I.Opcode = OPCODE_MAD;
+			inst_mad->I.DstReg.File = PROGRAM_TEMPORARY;
+			inst_mad->I.DstReg.Index = rc_find_free_temporary(c);
+			inst_mad->I.SrcReg[0] = inst->I.SrcReg[0];
+			inst_mad->I.SrcReg[0].Swizzle = SWIZZLE_ZZZZ;
+			inst_mad->I.SrcReg[1].File = PROGRAM_TEMPORARY;
+			inst_mad->I.SrcReg[1].Index = inst_rcp->I.DstReg.Index;
+			inst_mad->I.SrcReg[1].Swizzle = SWIZZLE_WWWW;
+			inst_mad->I.SrcReg[2].File = PROGRAM_TEMPORARY;
+			inst_mad->I.SrcReg[2].Index = inst->I.DstReg.Index;
+			if (depthmode == 0) /* GL_LUMINANCE */
+				inst_mad->I.SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z);
+			else if (depthmode == 2) /* GL_ALPHA */
+				inst_mad->I.SrcReg[2].Swizzle = SWIZZLE_WWWW;
+
+			/* SrcReg[0]*SrcReg[1] is r (coord.z / coord.w), SrcReg[2] is the fetched tex value, and:
+			 *   r  < tex  <=>      -tex+r < 0
+			 *   r >= tex  <=> not (-tex+r < 0) */
+			if (comparefunc == GL_LESS || comparefunc == GL_GEQUAL)
+				inst_mad->I.SrcReg[2].Negate = inst_mad->I.SrcReg[2].Negate ^ NEGATE_XYZW;
+			else
+				inst_mad->I.SrcReg[0].Negate = inst_mad->I.SrcReg[0].Negate ^ NEGATE_XYZW;
+
+			inst_cmp->I.Opcode = OPCODE_CMP;
+			/* DstReg has been filled out above */
+			inst_cmp->I.SrcReg[0].File = PROGRAM_TEMPORARY;
+			inst_cmp->I.SrcReg[0].Index = inst_mad->I.DstReg.Index;
+
+			if (comparefunc == GL_LESS || comparefunc == GL_GREATER) {
+				pass = 1;
+				fail = 2;
+			} else {
+				pass = 2;
+				fail = 1;
+			}
+
+			inst_cmp->I.SrcReg[pass].File = PROGRAM_BUILTIN;
+			inst_cmp->I.SrcReg[pass].Swizzle = SWIZZLE_1111;
+			inst_cmp->I.SrcReg[fail] = shadow_ambient(c, inst->I.TexSrcUnit);
+		}
+	}
+
+	/* Hardware uses [0..1]x[0..1] range for rectangle textures
+	 * instead of [0..Width]x[0..Height].
+	 * Add a scaling instruction.
+	 */
+	if (inst->I.Opcode != OPCODE_KIL && inst->I.TexSrcTarget == TEXTURE_RECT_INDEX) {
+		struct rc_instruction * inst_mul = rc_insert_new_instruction(c, inst->Prev); /* NOTE(review): presumably lands just before inst - confirm */
+
+		inst_mul->I.Opcode = OPCODE_MUL;
+		inst_mul->I.DstReg.File = PROGRAM_TEMPORARY;
+		inst_mul->I.DstReg.Index = rc_find_free_temporary(c);
+		inst_mul->I.SrcReg[0] = inst->I.SrcReg[0];
+		inst_mul->I.SrcReg[1].File = PROGRAM_STATE_VAR;
+		inst_mul->I.SrcReg[1].Index = rc_constants_add_state(&c->Program.Constants, RC_STATE_R300_TEXRECT_FACTOR, inst->I.TexSrcUnit);
+
+		reset_srcreg(&inst->I.SrcReg[0]);
+		inst->I.SrcReg[0].File = PROGRAM_TEMPORARY;
+		inst->I.SrcReg[0].Index = inst_mul->I.DstReg.Index;
+	}
+
+	/* Cannot write texture to output registers or with masks */
+	if (inst->I.Opcode != OPCODE_KIL &&
+	    (inst->I.DstReg.File != PROGRAM_TEMPORARY || inst->I.DstReg.WriteMask != WRITEMASK_XYZW)) {
+		struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst);
+
+		inst_mov->I.Opcode = OPCODE_MOV;
+		inst_mov->I.DstReg = inst->I.DstReg;
+		inst_mov->I.SrcReg[0].File = PROGRAM_TEMPORARY;
+		inst_mov->I.SrcReg[0].Index = rc_find_free_temporary(c);
+
+		inst->I.DstReg.File = PROGRAM_TEMPORARY;
+		inst->I.DstReg.Index = inst_mov->I.SrcReg[0].Index;
+		inst->I.DstReg.WriteMask = WRITEMASK_XYZW;
+	}
+
+
+	/* Cannot read texture coordinate from constants file */
+	if (inst->I.SrcReg[0].File != PROGRAM_TEMPORARY && inst->I.SrcReg[0].File != PROGRAM_INPUT) {
+		struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
+
+		inst_mov->I.Opcode = OPCODE_MOV;
+		inst_mov->I.DstReg.File = PROGRAM_TEMPORARY;
+		inst_mov->I.DstReg.Index = rc_find_free_temporary(c);
+		inst_mov->I.SrcReg[0] = inst->I.SrcReg[0];
+
+		reset_srcreg(&inst->I.SrcReg[0]);
+		inst->I.SrcReg[0].File = PROGRAM_TEMPORARY;
+		inst->I.SrcReg[0].Index = inst_mov->I.DstReg.Index;
+	}
+
+	return GL_TRUE;
+}
+
+/* Debug aid: print a decoded listing of the r300 hardware fragment program (nodes, TEX and ALU instructions) to stderr. */
+void r300FragmentProgramDump(struct rX00_fragment_program_code *c)
+{
+	struct r300_fragment_program_code *code = &c->code.r300;
+	int n, i, j;
+	static int pc = 0; /* number of dumps so far; only shown in the banner line */
+
+	fprintf(stderr, "pc=%d*************************************\n", pc++);
+
+	fprintf(stderr, "Hardware program\n");
+	fprintf(stderr, "----------------\n");
+
+	for (n = 0; n <= (code->config & 3); n++) { /* (config & 3) + 1 nodes, read from the last entries of code_addr[] */
+		uint32_t code_addr = code->code_addr[3 - (code->config & 3) + n];
+		int alu_offset = (code_addr & R300_ALU_START_MASK) >> R300_ALU_START_SHIFT;
+		int alu_end = (code_addr & R300_ALU_SIZE_MASK) >> R300_ALU_SIZE_SHIFT;
+		int tex_offset = (code_addr & R300_TEX_START_MASK) >> R300_TEX_START_SHIFT;
+		int tex_end = (code_addr & R300_TEX_SIZE_MASK) >> R300_TEX_SIZE_SHIFT;
+
+		fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "
+			"alu_end: %d, tex_end: %d  (code_addr: %08x)\n", n,
+			alu_offset, tex_offset, alu_end, tex_end, code_addr);
+
+		if (n > 0 || (code->config & R300_PFS_CNTL_FIRST_NODE_HAS_TEX)) {
+			fprintf(stderr, "  TEX:\n");
+			for (i = tex_offset;
+			     i <= tex_offset + tex_end;
+			     ++i) {
+				const char *instr;
+
+				switch ((code->tex.
+					 inst[i] >> R300_TEX_INST_SHIFT) &
+					15) {
+				case R300_TEX_OP_LD:
+					instr = "TEX";
+					break;
+				case R300_TEX_OP_KIL:
+					instr = "KIL";
+					break;
+				case R300_TEX_OP_TXP:
+					instr = "TXP";
+					break;
+				case R300_TEX_OP_TXB:
+					instr = "TXB";
+					break;
+				default:
+					instr = "UNKNOWN";
+				}
+
+				fprintf(stderr,
+					"    %s t%i, %c%i, texture[%i]   (%08x)\n",
+					instr,
+					(code->tex.
+					 inst[i] >> R300_DST_ADDR_SHIFT) & 31,
+					't',
+					(code->tex.
+					 inst[i] >> R300_SRC_ADDR_SHIFT) & 31,
+					(code->tex.
+					 inst[i] & R300_TEX_ID_MASK) >>
+					R300_TEX_ID_SHIFT,
+					code->tex.inst[i]);
+			}
+		}
+
+		for (i = alu_offset;
+		     i <= alu_offset + alu_end; ++i) {
+			char srcc[3][10], dstc[20];
+			char srca[3][10], dsta[20];
+			char argc[3][20];
+			char arga[3][20];
+			char flags[5], tmp[10];
+
+			for (j = 0; j < 3; ++j) { /* source addresses: 6 bits each, 5-bit index plus bit 5 choosing 'c' over 't' */
+				int regc = code->alu.inst[i].rgb_addr >> (j * 6);
+				int rega = code->alu.inst[i].alpha_addr >> (j * 6);
+
+				sprintf(srcc[j], "%c%i",
+					(regc & 32) ? 'c' : 't', regc & 31);
+				sprintf(srca[j], "%c%i",
+					(rega & 32) ? 'c' : 't', rega & 31);
+			}
+
+			dstc[0] = 0; /* RGB destination text: optional "tN.mask " followed by optional "oN.mask" */
+			sprintf(flags, "%s%s%s",
+				(code->alu.inst[i].
+				 rgb_addr & R300_ALU_DSTC_REG_X) ? "x" : "",
+				(code->alu.inst[i].
+				 rgb_addr & R300_ALU_DSTC_REG_Y) ? "y" : "",
+				(code->alu.inst[i].
+				 rgb_addr & R300_ALU_DSTC_REG_Z) ? "z" : "");
+			if (flags[0] != 0) {
+				sprintf(dstc, "t%i.%s ",
+					(code->alu.inst[i].
+					 rgb_addr >> R300_ALU_DSTC_SHIFT) & 31,
+					flags);
+			}
+			sprintf(flags, "%s%s%s",
+				(code->alu.inst[i].
+				 rgb_addr & R300_ALU_DSTC_OUTPUT_X) ? "x" : "",
+				(code->alu.inst[i].
+				 rgb_addr & R300_ALU_DSTC_OUTPUT_Y) ? "y" : "",
+				(code->alu.inst[i].
+				 rgb_addr & R300_ALU_DSTC_OUTPUT_Z) ? "z" : "");
+			if (flags[0] != 0) {
+				sprintf(tmp, "o%i.%s",
+					(code->alu.inst[i].
+					 rgb_addr >> R300_ALU_DSTC_SHIFT) & 31,
+					flags);
+				strcat(dstc, tmp);
+			}
+
+			dsta[0] = 0; /* alpha destination text: any of "tN.w ", "oN.w " and "Z" */
+			if (code->alu.inst[i].alpha_addr & R300_ALU_DSTA_REG) {
+				sprintf(dsta, "t%i.w ",
+					(code->alu.inst[i].
+					 alpha_addr >> R300_ALU_DSTA_SHIFT) & 31);
+			}
+			if (code->alu.inst[i].alpha_addr & R300_ALU_DSTA_OUTPUT) {
+				sprintf(tmp, "o%i.w ",
+					(code->alu.inst[i].
+					 alpha_addr >> R300_ALU_DSTA_SHIFT) & 31);
+				strcat(dsta, tmp);
+			}
+			if (code->alu.inst[i].alpha_addr & R300_ALU_DSTA_DEPTH) {
+				strcat(dsta, "Z");
+			}
+
+			fprintf(stderr,
+				"%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
+				"       w: %3s %3s %3s -> %-20s (%08x)\n", i,
+				srcc[0], srcc[1], srcc[2], dstc,
+				code->alu.inst[i].rgb_addr, srca[0], srca[1],
+				srca[2], dsta, code->alu.inst[i].alpha_addr);
+
+			for (j = 0; j < 3; ++j) { /* arguments: 7 bits each, 5-bit selector, bit 5 printed as "-", bit 6 as "|...|" */
+				int regc = code->alu.inst[i].rgb_inst >> (j * 7);
+				int rega = code->alu.inst[i].alpha_inst >> (j * 7);
+				int d;
+				char buf[20];
+
+				d = regc & 31;
+				if (d < 12) {
+					switch (d % 4) {
+					case R300_ALU_ARGC_SRC0C_XYZ:
+						sprintf(buf, "%s.xyz",
+							srcc[d / 4]);
+						break;
+					case R300_ALU_ARGC_SRC0C_XXX:
+						sprintf(buf, "%s.xxx",
+							srcc[d / 4]);
+						break;
+					case R300_ALU_ARGC_SRC0C_YYY:
+						sprintf(buf, "%s.yyy",
+							srcc[d / 4]);
+						break;
+					case R300_ALU_ARGC_SRC0C_ZZZ:
+						sprintf(buf, "%s.zzz",
+							srcc[d / 4]);
+						break;
+					}
+				} else if (d < 15) {
+					sprintf(buf, "%s.www", srca[d - 12]);
+				} else if (d == 20) {
+					sprintf(buf, "0.0");
+				} else if (d == 21) {
+					sprintf(buf, "1.0");
+				} else if (d == 22) {
+					sprintf(buf, "0.5");
+				} else if (d >= 23 && d < 32) {
+					d -= 23;
+					switch (d / 3) {
+					case 0:
+						sprintf(buf, "%s.yzx",
+							srcc[d % 3]);
+						break;
+					case 1:
+						sprintf(buf, "%s.zxy",
+							srcc[d % 3]);
+						break;
+					case 2:
+						sprintf(buf, "%s.Wzy",
+							srcc[d % 3]);
+						break;
+					}
+				} else {
+					sprintf(buf, "%i", d); /* selector without a symbolic name: print it raw */
+				}
+
+				sprintf(argc[j], "%s%s%s%s",
+					(regc & 32) ? "-" : "",
+					(regc & 64) ? "|" : "",
+					buf, (regc & 64) ? "|" : "");
+
+				d = rega & 31;
+				if (d < 9) {
+					sprintf(buf, "%s.%c", srcc[d / 3],
+						'x' + (char)(d % 3));
+				} else if (d < 12) {
+					sprintf(buf, "%s.w", srca[d - 9]);
+				} else if (d == 16) {
+					sprintf(buf, "0.0");
+				} else if (d == 17) {
+					sprintf(buf, "1.0");
+				} else if (d == 18) {
+					sprintf(buf, "0.5");
+				} else {
+					sprintf(buf, "%i", d);
+				}
+
+				sprintf(arga[j], "%s%s%s%s",
+					(rega & 32) ? "-" : "",
+					(rega & 64) ? "|" : "",
+					buf, (rega & 64) ? "|" : "");
+			}
+
+			fprintf(stderr, "     xyz: %8s %8s %8s    op: %08x\n"
+				"       w: %8s %8s %8s    op: %08x\n",
+				argc[0], argc[1], argc[2],
+				code->alu.inst[i].rgb_inst, arga[0], arga[1],
+				arga[2], code->alu.inst[i].alpha_inst);
+		}
+	}
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/r300_fragprog.h b/src/mesa/drivers/dri/r300/compiler/r300_fragprog.h
new file mode 100644
index 0000000000..0ac46dbd9c
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/r300_fragprog.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2005 Ben Skeggs.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/*
+ * Authors:
+ *   Ben Skeggs <darktama@iinet.net.au>
+ *   Jerome Glisse <j.glisse@gmail.com>
+ */
+#ifndef __R300_FRAGPROG_H_
+#define __R300_FRAGPROG_H_
+
+#include "shader/program.h"
+#include "shader/prog_instruction.h"
+
+#include "radeon_compiler.h"
+#include "radeon_program.h"
+
+
+extern void r300BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler);
+/* Print a decoded listing of the r300 hardware fragment program in c to stderr (debug aid). */
+extern void r300FragmentProgramDump(struct rX00_fragment_program_code *c);
+/* Program transform for TEX/TXB/TXP/KIL; returns GL_FALSE when inst is any other opcode. */
+extern GLboolean r300_transform_TEX(struct radeon_compiler * c, struct rc_instruction* inst, void* data);
+
+#endif
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c
index 9f0b7e3534..305dc074ee 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c
@@ -40,57 +40,43 @@
 
 #include "r300_fragprog.h"
 
+#include "../r300_reg.h"
+
 #include "radeon_program_pair.h"
 #include "r300_fragprog_swizzle.h"
-#include "r300_reg.h"
 
 
+struct r300_emit_state {
+	struct r300_fragment_program_compiler * compiler; /* reached by the emit callbacks through the PROG_CODE macro */
+
+	unsigned current_node : 2; /* NOTE(review): presumably the hardware node being filled - confirm */
+	unsigned node_first_tex : 8; /* NOTE(review): presumably the first TEX instruction of that node - confirm */
+	unsigned node_first_alu : 8; /* NOTE(review): presumably the first ALU instruction of that node - confirm */
+	uint32_t node_flags; /* usage is not visible in this part of the patch */
+};
+
 #define PROG_CODE \
-	struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)data; \
-	struct r300_fragment_program_code *code = c->code
+	struct r300_emit_state * emit = (struct r300_emit_state*)data; \
+	struct r300_fragment_program_compiler *c = emit->compiler; \
+	struct r300_fragment_program_code *code = &c->code->code.r300
 
 #define error(fmt, args...) do {			\
-		fprintf(stderr, "%s::%s(): " fmt "\n",	\
+		rc_error(&c->Base, "%s::%s(): " fmt "\n",	\
 			__FILE__, __FUNCTION__, ##args);	\
 	} while(0)
 
 
-static GLboolean emit_const(void* data, GLuint file, GLuint index, GLuint *hwindex)
-{
-	PROG_CODE;
-
-	for (*hwindex = 0; *hwindex < code->const_nr; ++*hwindex) {
-		if (code->constant[*hwindex].File == file &&
-		    code->constant[*hwindex].Index == index)
-			break;
-	}
-
-	if (*hwindex >= code->const_nr) {
-		if (*hwindex >= PFS_NUM_CONST_REGS) {
-			error("Out of hw constants!\n");
-			return GL_FALSE;
-		}
-
-		code->const_nr++;
-		code->constant[*hwindex].File = file;
-		code->constant[*hwindex].Index = index;
-	}
-
-	return GL_TRUE;
-}
-
-
 /**
  * Mark a temporary register as used.
  */
 static void use_temporary(struct r300_fragment_program_code *code, GLuint index)
 {
-	if (index > code->max_temp_idx)
-		code->max_temp_idx = index;
+	if (index > code->pixsize) /* pixsize only ever grows */
+		code->pixsize = index; /* so it ends up as the highest temporary index referenced */
 }
 
 
-static GLuint translate_rgb_opcode(GLuint opcode)
+static GLuint translate_rgb_opcode(struct r300_fragment_program_compiler * c, GLuint opcode)
 {
 	switch(opcode) {
 	case OPCODE_CMP: return R300_ALU_OUTC_CMP;
@@ -109,7 +95,7 @@ static GLuint translate_rgb_opcode(GLuint opcode)
 	}
 }
 
-static GLuint translate_alpha_opcode(GLuint opcode)
+static GLuint translate_alpha_opcode(struct r300_fragment_program_compiler * c, GLuint opcode)
 {
 	switch(opcode) {
 	case OPCODE_CMP: return R300_ALU_OUTA_CMP;
@@ -138,70 +124,69 @@ static GLboolean emit_alu(void* data, struct radeon_pair_instruction* inst)
 {
 	PROG_CODE;
 
-	if (code->alu.length >= PFS_MAX_ALU_INST) {
+	if (code->alu.length >= R300_PFS_MAX_ALU_INST) {
 		error("Too many ALU instructions");
 		return GL_FALSE;
 	}
 
 	int ip = code->alu.length++;
 	int j;
-	code->node[code->cur_node].alu_end++;
 
-	code->alu.inst[ip].inst0 = translate_rgb_opcode(inst->RGB.Opcode);
-	code->alu.inst[ip].inst2 = translate_alpha_opcode(inst->Alpha.Opcode);
+	code->alu.inst[ip].rgb_inst = translate_rgb_opcode(c, inst->RGB.Opcode);
+	code->alu.inst[ip].alpha_inst = translate_alpha_opcode(c, inst->Alpha.Opcode);
 
 	for(j = 0; j < 3; ++j) {
 		GLuint src = inst->RGB.Src[j].Index | (inst->RGB.Src[j].Constant << 5);
 		if (!inst->RGB.Src[j].Constant)
 			use_temporary(code, inst->RGB.Src[j].Index);
-		code->alu.inst[ip].inst1 |= src << (6*j);
+		code->alu.inst[ip].rgb_addr |= src << (6*j);
 
 		src = inst->Alpha.Src[j].Index | (inst->Alpha.Src[j].Constant << 5);
 		if (!inst->Alpha.Src[j].Constant)
 			use_temporary(code, inst->Alpha.Src[j].Index);
-		code->alu.inst[ip].inst3 |= src << (6*j);
+		code->alu.inst[ip].alpha_addr |= src << (6*j);
 
 		GLuint arg = r300FPTranslateRGBSwizzle(inst->RGB.Arg[j].Source, inst->RGB.Arg[j].Swizzle);
 		arg |= inst->RGB.Arg[j].Abs << 6;
 		arg |= inst->RGB.Arg[j].Negate << 5;
-		code->alu.inst[ip].inst0 |= arg << (7*j);
+		code->alu.inst[ip].rgb_inst |= arg << (7*j);
 
 		arg = r300FPTranslateAlphaSwizzle(inst->Alpha.Arg[j].Source, inst->Alpha.Arg[j].Swizzle);
 		arg |= inst->Alpha.Arg[j].Abs << 6;
 		arg |= inst->Alpha.Arg[j].Negate << 5;
-		code->alu.inst[ip].inst2 |= arg << (7*j);
+		code->alu.inst[ip].alpha_inst |= arg << (7*j);
 	}
 
 	if (inst->RGB.Saturate)
-		code->alu.inst[ip].inst0 |= R300_ALU_OUTC_CLAMP;
+		code->alu.inst[ip].rgb_inst |= R300_ALU_OUTC_CLAMP;
 	if (inst->Alpha.Saturate)
-		code->alu.inst[ip].inst2 |= R300_ALU_OUTA_CLAMP;
+		code->alu.inst[ip].alpha_inst |= R300_ALU_OUTA_CLAMP;
 
 	if (inst->RGB.WriteMask) {
 		use_temporary(code, inst->RGB.DestIndex);
-		code->alu.inst[ip].inst1 |=
+		code->alu.inst[ip].rgb_addr |=
 			(inst->RGB.DestIndex << R300_ALU_DSTC_SHIFT) |
 			(inst->RGB.WriteMask << R300_ALU_DSTC_REG_MASK_SHIFT);
 	}
 	if (inst->RGB.OutputWriteMask) {
-		code->alu.inst[ip].inst1 |= (inst->RGB.OutputWriteMask << R300_ALU_DSTC_OUTPUT_MASK_SHIFT);
-		code->node[code->cur_node].flags |= R300_RGBA_OUT;
+		code->alu.inst[ip].rgb_addr |= (inst->RGB.OutputWriteMask << R300_ALU_DSTC_OUTPUT_MASK_SHIFT);
+		emit->node_flags |= R300_RGBA_OUT;
 	}
 
 	if (inst->Alpha.WriteMask) {
 		use_temporary(code, inst->Alpha.DestIndex);
-		code->alu.inst[ip].inst3 |=
+		code->alu.inst[ip].alpha_addr |=
 			(inst->Alpha.DestIndex << R300_ALU_DSTA_SHIFT) |
 			R300_ALU_DSTA_REG;
 	}
 	if (inst->Alpha.OutputWriteMask) {
-		code->alu.inst[ip].inst3 |= R300_ALU_DSTA_OUTPUT;
-		code->node[code->cur_node].flags |= R300_RGBA_OUT;
+		code->alu.inst[ip].alpha_addr |= R300_ALU_DSTA_OUTPUT;
+		emit->node_flags |= R300_RGBA_OUT;
 	}
 	if (inst->Alpha.DepthWriteMask) {
-		code->alu.inst[ip].inst3 |= R300_ALU_DSTA_DEPTH;
-		code->node[code->cur_node].flags |= R300_W_OUT;
-		c->fp->WritesDepth = GL_TRUE;
+		code->alu.inst[ip].alpha_addr |= R300_ALU_DSTA_DEPTH;
+		emit->node_flags |= R300_W_OUT;
+		c->code->writes_depth = GL_TRUE;
 	}
 
 	return GL_TRUE;
@@ -211,31 +196,50 @@ static GLboolean emit_alu(void* data, struct radeon_pair_instruction* inst)
 /**
  * Finish the current node without advancing to the next one.
  */
-static GLboolean finish_node(struct r300_fragment_program_compiler *c)
+static GLboolean finish_node(struct r300_emit_state * emit)
 {
-	struct r300_fragment_program_code *code = c->code;
-	struct r300_fragment_program_node *node = &code->node[code->cur_node];
+	struct r300_fragment_program_compiler * c = emit->compiler;
+	struct r300_fragment_program_code *code = &emit->compiler->code->code.r300;
 
-	if (node->alu_end < 0) {
+	if (code->alu.length == emit->node_first_alu) {
 		/* Generate a single NOP for this node */
 		struct radeon_pair_instruction inst;
 		_mesa_bzero(&inst, sizeof(inst));
-		if (!emit_alu(c, &inst))
+		if (!emit_alu(emit, &inst))
 			return GL_FALSE;
 	}
 
-	if (node->tex_end < 0) {
-		if (code->cur_node == 0) {
-			node->tex_end = 0;
-		} else {
-			error("Node %i has no TEX instructions", code->cur_node);
+	unsigned alu_offset = emit->node_first_alu;
+	unsigned alu_end = code->alu.length - alu_offset - 1;
+	unsigned tex_offset = emit->node_first_tex;
+	unsigned tex_end = code->tex.length - tex_offset - 1;
+
+	if (code->tex.length == emit->node_first_tex) {
+		if (emit->current_node > 0) {
+			error("Node %i has no TEX instructions", emit->current_node);
 			return GL_FALSE;
 		}
+
+		tex_end = 0;
 	} else {
-		if (code->cur_node == 0)
-			code->first_node_has_tex = 1;
+		if (emit->current_node == 0)
+			code->config |= R300_PFS_CNTL_FIRST_NODE_HAS_TEX;
 	}
 
+	/* Write the config register.
+	 * Note: The order in which the words for each node are written
+	 * is not correct here; r300BuildFragmentProgramHwCode() fixes it up
+	 * once the whole program has been emitted.
+	 *
+	 * Also note that the register specification from AMD is slightly
+	 * incorrect in its description of this register. */
+	code->code_addr[emit->current_node] =
+			(alu_offset << R300_ALU_START_SHIFT) |
+			(alu_end << R300_ALU_SIZE_SHIFT) |
+			(tex_offset << R300_TEX_START_SHIFT) |
+			(tex_end << R300_TEX_SIZE_SHIFT) |
+			emit->node_flags;
+
 	return GL_TRUE;
 }
 
@@ -248,64 +252,61 @@ static GLboolean begin_tex(void* data)
 {
 	PROG_CODE;
 
-	if (code->cur_node == 0) {
-		if (code->node[0].alu_end < 0 &&
-		    code->node[0].tex_end < 0)
-			return GL_TRUE;
+	if (code->alu.length == emit->node_first_alu &&
+	    code->tex.length == emit->node_first_tex) {
+		return GL_TRUE;
 	}
 
-	if (code->cur_node == 3) {
+	if (emit->current_node == 3) {
 		error("Too many texture indirections");
 		return GL_FALSE;
 	}
 
-	if (!finish_node(c))
+	if (!finish_node(emit))
 		return GL_FALSE;
 
-	struct r300_fragment_program_node *node = &code->node[++code->cur_node];
-	node->alu_offset = code->alu.length;
-	node->alu_end = -1;
-	node->tex_offset = code->tex.length;
-	node->tex_end = -1;
+	emit->current_node++;
+	emit->node_first_tex = code->tex.length;
+	emit->node_first_alu = code->alu.length;
+	emit->node_flags = 0;
 	return GL_TRUE;
 }
 
 
-static GLboolean emit_tex(void* data, struct prog_instruction* inst)
+static GLboolean emit_tex(void* data, struct radeon_pair_texture_instruction* inst)
 {
 	PROG_CODE;
 
-	if (code->tex.length >= PFS_MAX_TEX_INST) {
+	if (code->tex.length >= R300_PFS_MAX_TEX_INST) {
 		error("Too many TEX instructions");
 		return GL_FALSE;
 	}
 
 	GLuint unit = inst->TexSrcUnit;
-	GLuint dest = inst->DstReg.Index;
+	GLuint dest = inst->DestIndex;
 	GLuint opcode;
 
 	switch(inst->Opcode) {
-	case OPCODE_KIL: opcode = R300_TEX_OP_KIL; break;
-	case OPCODE_TEX: opcode = R300_TEX_OP_LD; break;
-	case OPCODE_TXB: opcode = R300_TEX_OP_TXB; break;
-	case OPCODE_TXP: opcode = R300_TEX_OP_TXP; break;
+	case RADEON_OPCODE_KIL: opcode = R300_TEX_OP_KIL; break;
+	case RADEON_OPCODE_TEX: opcode = R300_TEX_OP_LD; break;
+	case RADEON_OPCODE_TXB: opcode = R300_TEX_OP_TXB; break;
+	case RADEON_OPCODE_TXP: opcode = R300_TEX_OP_TXP; break;
 	default:
 		error("Unknown texture opcode %i", inst->Opcode);
 		return GL_FALSE;
 	}
 
-	if (inst->Opcode == OPCODE_KIL) {
+	if (inst->Opcode == RADEON_OPCODE_KIL) {
 		unit = 0;
 		dest = 0;
 	} else {
 		use_temporary(code, dest);
 	}
 
-	use_temporary(code, inst->SrcReg[0].Index);
+	use_temporary(code, inst->SrcIndex);
 
-	code->node[code->cur_node].tex_end++;
 	code->tex.inst[code->tex.length++] =
-		(inst->SrcReg[0].Index << R300_SRC_ADDR_SHIFT) |
+		(inst->SrcIndex << R300_SRC_ADDR_SHIFT) |
 		(dest << R300_DST_ADDR_SHIFT) |
 		(unit << R300_TEX_ID_SHIFT) |
 		(opcode << R300_TEX_INST_SHIFT);
@@ -314,31 +315,46 @@ static GLboolean emit_tex(void* data, struct prog_instruction* inst)
 
 
 static const struct radeon_pair_handler pair_handler = {
-	.EmitConst = &emit_const,
 	.EmitPaired = &emit_alu,
 	.EmitTex = &emit_tex,
 	.BeginTexBlock = &begin_tex,
-	.MaxHwTemps = PFS_NUM_TEMP_REGS
+	.MaxHwTemps = R300_PFS_NUM_TEMP_REGS
 };
 
 /**
  * Final compilation step: Turn the intermediate radeon_program into
  * machine-readable instructions.
  */
-GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler)
+void r300BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler)
 {
-	struct r300_fragment_program_code *code = compiler->code;
-
-	_mesa_bzero(code, sizeof(struct r300_fragment_program_code));
-	code->node[0].alu_end = -1;
-	code->node[0].tex_end = -1;
+	struct r300_emit_state emit;
+	struct r300_fragment_program_code *code = &compiler->code->code.r300;
 
-	if (!radeonPairProgram(compiler->r300->radeon.glCtx, compiler->program, &pair_handler, compiler))
-		return GL_FALSE;
+	memset(&emit, 0, sizeof(emit));
+	emit.compiler = compiler;
 
-	if (!finish_node(compiler))
-		return GL_FALSE;
+	_mesa_bzero(code, sizeof(struct r300_fragment_program_code));
 
-	return GL_TRUE;
+	radeonPairProgram(compiler, &pair_handler, &emit);
+	if (compiler->Base.Error)
+		return;
+
+	/* Finish the program */
+	finish_node(&emit);
+
+	code->config |= emit.current_node; /* FIRST_NODE_HAS_TEX set by finish_node */
+	code->code_offset =
+		(0 << R300_PFS_CNTL_ALU_OFFSET_SHIFT) |
+		((code->alu.length-1) << R300_PFS_CNTL_ALU_END_SHIFT) |
+		(0 << R300_PFS_CNTL_TEX_OFFSET_SHIFT) |
+		((code->tex.length ? code->tex.length-1 : 0) << R300_PFS_CNTL_TEX_END_SHIFT);
+
+	if (emit.current_node < 3) {
+		int shift = 3 - emit.current_node;
+		int i;
+		for(i = 0; i <= emit.current_node; ++i)
+			code->code_addr[shift + i] = code->code_addr[i];
+		for(i = 0; i < shift; ++i)
+			code->code_addr[i] = 0;
+	}
 }
-
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_swizzle.c b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.c
index a86d2bd471..1b14cc3888 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog_swizzle.c
+++ b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.c
@@ -33,8 +33,9 @@
 
 #include "r300_fragprog_swizzle.h"
 
-#include "r300_reg.h"
+#include "../r300_reg.h"
 #include "radeon_nqssadce.h"
+#include "radeon_compiler.h"
 
 #define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, SWIZZLE_##y, SWIZZLE_##z, SWIZZLE_ZERO))
 
@@ -92,7 +93,7 @@ static const struct swizzle_data* lookup_native_swizzle(GLuint swizzle)
 GLboolean r300FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg)
 {
 	if (reg.Abs)
-		reg.NegateBase = 0;
+		reg.Negate = NEGATE_NONE;
 
 	if (opcode == OPCODE_KIL ||
 	    opcode == OPCODE_TEX ||
@@ -100,7 +101,7 @@ GLboolean r300FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg)
 	    opcode == OPCODE_TXP) {
 		int j;
 
-		if (reg.Abs || reg.NegateBase != (15*reg.NegateAbs))
+		if (reg.Abs || reg.Negate)
 			return GL_FALSE;
 
 		for(j = 0; j < 4; ++j) {
@@ -121,7 +122,7 @@ GLboolean r300FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg)
 		if (GET_SWZ(reg.Swizzle, j) != SWIZZLE_NIL)
 			relevant |= 1 << j;
 
-	if ((reg.NegateBase & relevant) && (reg.NegateBase & relevant) != relevant)
+	if ((reg.Negate & relevant) && ((reg.Negate & relevant) != relevant))
 		return GL_FALSE;
 
 	if (!lookup_native_swizzle(reg.Swizzle))
@@ -137,13 +138,12 @@ GLboolean r300FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg)
 void r300FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst, struct prog_src_register src)
 {
 	if (src.Abs)
-		src.NegateBase = 0;
+		src.Negate = NEGATE_NONE;
 
 	while(dst.WriteMask) {
 		const struct swizzle_data *best_swizzle = 0;
 		GLuint best_matchcount = 0;
 		GLuint best_matchmask = 0;
-		GLboolean rgbnegate;
 		int i, comp;
 
 		for(i = 0; i < num_native_swizzles; ++i) {
@@ -157,6 +157,11 @@ void r300FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst,
 				if (swz == SWIZZLE_NIL)
 					continue;
 				if (swz == GET_SWZ(sd->hash, comp)) {
+					/* check if the negate bit of current component
+					 * is the same for already matched components */
+					if (matchmask && (!!(src.Negate & matchmask) != !!(src.Negate & (1 << comp))))
+						continue;
+
 					matchcount++;
 					matchmask |= 1 << comp;
 				}
@@ -170,24 +175,15 @@ void r300FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst,
 			}
 		}
 
-		if ((src.NegateBase & best_matchmask) != 0) {
-			best_matchmask &= src.NegateBase;
-			rgbnegate = !src.NegateAbs;
-		} else {
-			rgbnegate = src.NegateAbs;
-		}
-
-		struct prog_instruction *inst;
-
-		_mesa_insert_instructions(s->Program, s->IP, 1);
-		inst = s->Program->Instructions + s->IP++;
-		inst->Opcode = OPCODE_MOV;
-		inst->DstReg = dst;
-		inst->DstReg.WriteMask &= (best_matchmask | WRITEMASK_W);
-		inst->SrcReg[0] = src;
+		struct rc_instruction *inst = rc_insert_new_instruction(s->Compiler, s->IP->Prev);
+		inst->I.Opcode = OPCODE_MOV;
+		inst->I.DstReg = dst;
+		inst->I.DstReg.WriteMask &= (best_matchmask | WRITEMASK_W);
+		inst->I.SrcReg[0] = src;
+		inst->I.SrcReg[0].Negate = (best_matchmask & src.Negate) ? NEGATE_XYZW : NEGATE_NONE;
 		/* Note: We rely on NqSSA/DCE to set unused swizzle components to NIL */
 
-		dst.WriteMask &= ~inst->DstReg.WriteMask;
+		dst.WriteMask &= ~inst->I.DstReg.WriteMask;
 	}
 }
 
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_swizzle.h b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.h
index 231bf4eef5..231bf4eef5 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog_swizzle.h
+++ b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.h
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
new file mode 100644
index 0000000000..76c3a7ecfd
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "radeon_compiler.h"
+
+#include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+#include "shader/prog_statevars.h"
+
+#include "radeon_nqssadce.h"
+#include "radeon_program_alu.h"
+#include "r300_fragprog.h"
+#include "r300_fragprog_swizzle.h"
+#include "r500_fragprog.h"
+
+
+static void nqssadce_init(struct nqssadce_state* s)
+{
+	struct r300_fragment_program_compiler * c = s->UserData;
+	s->Outputs[c->OutputColor].Sourced = WRITEMASK_XYZW;
+	s->Outputs[c->OutputDepth].Sourced = WRITEMASK_W;
+}
+
+static void rewrite_depth_out(struct r300_fragment_program_compiler * c)
+{
+	struct rc_instruction *rci;
+
+	for (rci = c->Base.Program.Instructions.Next; rci != &c->Base.Program.Instructions; rci = rci->Next) {
+		struct prog_instruction * inst = &rci->I;
+
+		if (inst->DstReg.File != PROGRAM_OUTPUT || inst->DstReg.Index != c->OutputDepth)
+			continue;
+
+		if (inst->DstReg.WriteMask & WRITEMASK_Z) {
+			inst->DstReg.WriteMask = WRITEMASK_W;
+		} else {
+			inst->DstReg.WriteMask = 0;
+			continue;
+		}
+
+		switch (inst->Opcode) {
+			case OPCODE_FRC:
+			case OPCODE_MOV:
+				inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
+				break;
+			case OPCODE_ADD:
+			case OPCODE_MAX:
+			case OPCODE_MIN:
+			case OPCODE_MUL:
+				inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
+				inst->SrcReg[1] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[1]);
+				break;
+			case OPCODE_CMP:
+			case OPCODE_MAD:
+				inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
+				inst->SrcReg[1] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[1]);
+				inst->SrcReg[2] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[2]);
+				break;
+			default:
+				// Scalar instructions needn't be reswizzled
+				break;
+		}
+	}
+}
+
+void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
+{
+	rewrite_depth_out(c);
+
+	if (c->is_r500) {
+		struct radeon_program_transformation transformations[] = {
+			{ &r500_transform_TEX, c },
+			{ &radeonTransformALU, 0 },
+			{ &radeonTransformDeriv, 0 },
+			{ &radeonTransformTrigScale, 0 }
+		};
+		radeonLocalTransform(&c->Base, 4, transformations);
+	} else {
+		struct radeon_program_transformation transformations[] = {
+			{ &r300_transform_TEX, c },
+			{ &radeonTransformALU, 0 },
+			{ &radeonTransformTrigSimple, 0 }
+		};
+		radeonLocalTransform(&c->Base, 3, transformations);
+	}
+
+	if (c->Base.Debug) {
+		_mesa_printf("Fragment Program: After native rewrite:\n");
+		rc_print_program(&c->Base.Program);
+		fflush(stderr);
+	}
+
+	if (c->is_r500) {
+		struct radeon_nqssadce_descr nqssadce = {
+			.Init = &nqssadce_init,
+			.IsNativeSwizzle = &r500FPIsNativeSwizzle,
+			.BuildSwizzle = &r500FPBuildSwizzle
+		};
+		radeonNqssaDce(&c->Base, &nqssadce, c);
+	} else {
+		struct radeon_nqssadce_descr nqssadce = {
+			.Init = &nqssadce_init,
+			.IsNativeSwizzle = &r300FPIsNativeSwizzle,
+			.BuildSwizzle = &r300FPBuildSwizzle
+		};
+		radeonNqssaDce(&c->Base, &nqssadce, c);
+	}
+
+	if (c->Base.Debug) {
+		_mesa_printf("Compiler: after NqSSA-DCE:\n");
+		rc_print_program(&c->Base.Program);
+		fflush(stderr);
+	}
+
+	if (c->is_r500) {
+		r500BuildFragmentProgramHwCode(c);
+	} else {
+		r300BuildFragmentProgramHwCode(c);
+	}
+
+	rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
+
+	if (c->Base.Debug) {
+		if (c->is_r500) {
+			r500FragmentProgramDump(c->code);
+		} else {
+			r300FragmentProgramDump(c->code);
+		}
+	}
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
new file mode 100644
index 0000000000..93a516105e
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
@@ -0,0 +1,655 @@
+/*
+ * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "radeon_compiler.h"
+
+#include "../r300_reg.h"
+
+#include "radeon_nqssadce.h"
+#include "radeon_program.h"
+#include "radeon_program_alu.h"
+
+#include "shader/prog_print.h"
+
+
+/*
+ * Take an already-setup and valid source then swizzle it appropriately to
+ * obtain a constant ZERO or ONE source.
+ */
+#define __CONST(x, y)	\
+	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]),	\
+			   t_swizzle(y),	\
+			   t_swizzle(y),	\
+			   t_swizzle(y),	\
+			   t_swizzle(y),	\
+			   t_src_class(vpi->SrcReg[x].File), \
+			   NEGATE_NONE) | (vpi->SrcReg[x].RelAddr << 4))
+
+
+static unsigned long t_dst_mask(GLuint mask)
+{
+	/* WRITEMASK_* is equivalent to VSF_FLAG_* */
+	return mask & WRITEMASK_XYZW;
+}
+
+static unsigned long t_dst_class(gl_register_file file)
+{
+
+	switch (file) {
+	case PROGRAM_TEMPORARY:
+		return PVS_DST_REG_TEMPORARY;
+	case PROGRAM_OUTPUT:
+		return PVS_DST_REG_OUT;
+	case PROGRAM_ADDRESS:
+		return PVS_DST_REG_A0;
+		/*
+		   case PROGRAM_INPUT:
+		   case PROGRAM_LOCAL_PARAM:
+		   case PROGRAM_ENV_PARAM:
+		   case PROGRAM_NAMED_PARAM:
+		   case PROGRAM_STATE_VAR:
+		   case PROGRAM_WRITE_ONLY:
+		   case PROGRAM_ADDRESS:
+		 */
+	default:
+		fprintf(stderr, "problem in %s", __FUNCTION__);
+		_mesa_exit(-1);
+		return -1;
+	}
+}
+
+static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
+				 struct prog_dst_register *dst)
+{
+	if (dst->File == PROGRAM_OUTPUT)
+		return vp->outputs[dst->Index];
+
+	return dst->Index;
+}
+
+static unsigned long t_src_class(gl_register_file file)
+{
+	switch (file) {
+	case PROGRAM_TEMPORARY:
+		return PVS_SRC_REG_TEMPORARY;
+	case PROGRAM_INPUT:
+		return PVS_SRC_REG_INPUT;
+	case PROGRAM_LOCAL_PARAM:
+	case PROGRAM_ENV_PARAM:
+	case PROGRAM_NAMED_PARAM:
+	case PROGRAM_CONSTANT:
+	case PROGRAM_STATE_VAR:
+		return PVS_SRC_REG_CONSTANT;
+		/*
+		   case PROGRAM_OUTPUT:
+		   case PROGRAM_WRITE_ONLY:
+		   case PROGRAM_ADDRESS:
+		 */
+	default:
+		fprintf(stderr, "problem in %s", __FUNCTION__);
+		_mesa_exit(-1);
+		return -1;
+	}
+}
+
+static GLboolean t_src_conflict(struct prog_src_register a, struct prog_src_register b)
+{
+	unsigned long aclass = t_src_class(a.File);
+	unsigned long bclass = t_src_class(b.File);
+
+	if (aclass != bclass)
+		return GL_FALSE;
+	if (aclass == PVS_SRC_REG_TEMPORARY)
+		return GL_FALSE;
+
+	if (a.RelAddr || b.RelAddr)
+		return GL_TRUE;
+	if (a.Index != b.Index)
+		return GL_TRUE;
+
+	return GL_FALSE;
+}
+
+static INLINE unsigned long t_swizzle(GLubyte swizzle)
+{
+	/* this is in fact a NOP as the Mesa SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
+	return swizzle;
+}
+
+static unsigned long t_src_index(struct r300_vertex_program_code *vp,
+				 struct prog_src_register *src)
+{
+	if (src->File == PROGRAM_INPUT) {
+		assert(vp->inputs[src->Index] != -1);
+		return vp->inputs[src->Index];
+	} else {
+		if (src->Index < 0) {
+			fprintf(stderr,
+				"negative offsets for indirect addressing do not work.\n");
+			return 0;
+		}
+		return src->Index;
+	}
+}
+
+/* these two functions should probably be merged... */
+
+static unsigned long t_src(struct r300_vertex_program_code *vp,
+			   struct prog_src_register *src)
+{
+	/* src->Negate uses the NEGATE_ flags from prog_instruction.h,
+	 * which equal our VSF_FLAG_ values, so it's safe to just pass it here.
+	 */
+	return PVS_SRC_OPERAND(t_src_index(vp, src),
+			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 1)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 2)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 3)),
+			       t_src_class(src->File),
+			       src->Negate) | (src->RelAddr << 4);
+}
+
+static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
+				  struct prog_src_register *src)
+{
+	/* The NEGATE_ flags from prog_instruction.h equal our VSF_FLAG_ values.
+	 * A scalar source broadcasts one component, so any set bit negates all.
+	 */
+	return PVS_SRC_OPERAND(t_src_index(vp, src),
+			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
+			       t_src_class(src->File),
+			       src->Negate ? NEGATE_XYZW : NEGATE_NONE) |
+	    (src->RelAddr << 4);
+}
+
+static GLboolean valid_dst(struct r300_vertex_program_code *vp,
+			   struct prog_dst_register *dst)
+{
+	if (dst->File == PROGRAM_OUTPUT && vp->outputs[dst->Index] == -1) {
+		return GL_FALSE;
+	} else if (dst->File == PROGRAM_ADDRESS) {
+		assert(dst->Index == 0);
+	}
+
+	return GL_TRUE;
+}
+
+static void ei_vector1(struct r300_vertex_program_code *vp,
+				GLuint hw_opcode,
+				struct prog_instruction *vpi,
+				GLuint * inst)
+{
+	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
+				     GL_FALSE,
+				     GL_FALSE,
+				     t_dst_index(vp, &vpi->DstReg),
+				     t_dst_mask(vpi->DstReg.WriteMask),
+				     t_dst_class(vpi->DstReg.File));
+	inst[1] = t_src(vp, &vpi->SrcReg[0]);
+	inst[2] = __CONST(0, SWIZZLE_ZERO);
+	inst[3] = __CONST(0, SWIZZLE_ZERO);
+}
+
+static void ei_vector2(struct r300_vertex_program_code *vp,
+				GLuint hw_opcode,
+				struct prog_instruction *vpi,
+				GLuint * inst)
+{
+	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
+				     GL_FALSE,
+				     GL_FALSE,
+				     t_dst_index(vp, &vpi->DstReg),
+				     t_dst_mask(vpi->DstReg.WriteMask),
+				     t_dst_class(vpi->DstReg.File));
+	inst[1] = t_src(vp, &vpi->SrcReg[0]);
+	inst[2] = t_src(vp, &vpi->SrcReg[1]);
+	inst[3] = __CONST(1, SWIZZLE_ZERO);
+}
+
+static void ei_math1(struct r300_vertex_program_code *vp,
+				GLuint hw_opcode,
+				struct prog_instruction *vpi,
+				GLuint * inst)
+{
+	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
+				     GL_TRUE,
+				     GL_FALSE,
+				     t_dst_index(vp, &vpi->DstReg),
+				     t_dst_mask(vpi->DstReg.WriteMask),
+				     t_dst_class(vpi->DstReg.File));
+	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
+	inst[2] = __CONST(0, SWIZZLE_ZERO);
+	inst[3] = __CONST(0, SWIZZLE_ZERO);
+}
+
+static void ei_lit(struct r300_vertex_program_code *vp,
+				      struct prog_instruction *vpi,
+				      GLuint * inst)
+{
+	//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
+
+	inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
+				     GL_TRUE,
+				     GL_FALSE,
+				     t_dst_index(vp, &vpi->DstReg),
+				     t_dst_mask(vpi->DstReg.WriteMask),
+				     t_dst_class(vpi->DstReg.File));
+	/* NOTE: User-specified swizzles might not work. */
+	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
+				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
+				  PVS_SRC_SELECT_FORCE_0,	// Z
+				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
+				  t_src_class(vpi->SrcReg[0].File),
+				  vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
+	    (vpi->SrcReg[0].RelAddr << 4);
+	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
+				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
+				  PVS_SRC_SELECT_FORCE_0,	// Z
+				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
+				  t_src_class(vpi->SrcReg[0].File),
+				  vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
+	    (vpi->SrcReg[0].RelAddr << 4);
+	inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),	// Y
+				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),	// X
+				  PVS_SRC_SELECT_FORCE_0,	// Z
+				  t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),	// W
+				  t_src_class(vpi->SrcReg[0].File),
+				  vpi->SrcReg[0].Negate ? NEGATE_XYZW : NEGATE_NONE) |
+	    (vpi->SrcReg[0].RelAddr << 4);
+}
+
+static void ei_mad(struct r300_vertex_program_code *vp,
+				      struct prog_instruction *vpi,
+				      GLuint * inst)
+{
+	/* Remarks about hardware limitations of MAD
+	 * (please preserve this comment, as this information is _NOT_
+	 * in the documentation provided by AMD).
+	 *
+	 * As described in the documentation, MAD with three unique temporary
+	 * source registers requires the use of the macro version.
+	 *
+	 * However (and this is not mentioned in the documentation), apparently
+	 * the macro version is _NOT_ a full superset of the normal version.
+	 * In particular, the macro version does not always work when relative
+	 * addressing is used in the source operands.
+	 *
+	 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
+	 * assembly shader path when using medium quality animations
+	 * (i.e. animations with matrix blending instead of quaternion blending).
+	 *
+	 * Unfortunately, I (nha) have been unable to extract a Piglit regression
+	 * test for this issue - for some reason, it is possible to have vertex
+	 * programs whose prefix is *exactly* the same as the prefix of the
+	 * offending program in Sauerbraten up to the offending instruction
+	 * without causing any trouble.
+	 *
+	 * Bottom line: Use the macro version only when really necessary;
+	 * according to AMD docs, this should improve performance by one clock
+	 * as a nice side bonus.
+	 */
+	if (vpi->SrcReg[0].File == PROGRAM_TEMPORARY &&
+	    vpi->SrcReg[1].File == PROGRAM_TEMPORARY &&
+	    vpi->SrcReg[2].File == PROGRAM_TEMPORARY &&
+	    vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
+	    vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
+	    vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
+		inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
+				GL_FALSE,
+				GL_TRUE,
+				t_dst_index(vp, &vpi->DstReg),
+				t_dst_mask(vpi->DstReg.WriteMask),
+				t_dst_class(vpi->DstReg.File));
+	} else {
+		inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
+				GL_FALSE,
+				GL_FALSE,
+				t_dst_index(vp, &vpi->DstReg),
+				t_dst_mask(vpi->DstReg.WriteMask),
+				t_dst_class(vpi->DstReg.File));
+	}
+	inst[1] = t_src(vp, &vpi->SrcReg[0]);
+	inst[2] = t_src(vp, &vpi->SrcReg[1]);
+	inst[3] = t_src(vp, &vpi->SrcReg[2]);
+}
+
+static void ei_pow(struct r300_vertex_program_code *vp,
+				      struct prog_instruction *vpi,
+				      GLuint * inst)
+{
+	inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
+				     GL_TRUE,
+				     GL_FALSE,
+				     t_dst_index(vp, &vpi->DstReg),
+				     t_dst_mask(vpi->DstReg.WriteMask),
+				     t_dst_class(vpi->DstReg.File));
+	inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
+	inst[2] = __CONST(0, SWIZZLE_ZERO);
+	inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
+}
+
+
+static void translate_vertex_program(struct r300_vertex_program_compiler * compiler)
+{	/* Encode each instruction of compiler->Base.Program into compiler->code->body.d, four dwords per instruction. */
+	struct rc_instruction *rci;
+
+	compiler->code->pos_end = 0;	/* Not supported yet */
+	compiler->code->length = 0;	/* counted in dwords; advanced by 4 for every emitted instruction */
+
+	compiler->SetHwInputOutput(compiler);	/* caller-supplied hook, invoked before any encoding happens */
+
+	for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {	/* walk the list until it loops back to its head node */
+		struct prog_instruction *vpi = &rci->I;
+		GLuint *inst = compiler->code->body.d + compiler->code->length;	/* next free slot; nothing is stored through it before the length check below */
+
+		/* Skip instructions writing to non-existing destination */
+		if (!valid_dst(compiler->code, &vpi->DstReg))
+			continue;
+
+		if (compiler->code->length >= VSF_MAX_FRAGMENT_LENGTH) {	/* no room for another instruction: report and stop */
+			rc_error(&compiler->Base, "Vertex program has too many instructions\n");
+			return;
+		}
+
+		switch (vpi->Opcode) {	/* one emit helper per supported opcode; each fills inst[0..3] */
+		case OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
+		case OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
+		case OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
+		case OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
+		case OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
+		case OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
+		case OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
+		case OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
+		case OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
+		case OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
+		case OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
+		case OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
+		case OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
+		case OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;	/* MOV is issued as VE_ADD through the one-source helper */
+		case OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
+		case OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
+		case OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
+		case OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
+		case OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
+		case OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
+		default:	/* any opcode not listed above aborts the translation */
+			rc_error(&compiler->Base, "Unknown opcode %i\n", vpi->Opcode);
+			return;
+		}
+
+		compiler->code->length += 4;	/* commit the four dwords just written */
+
+		if (compiler->Base.Error)	/* stop as soon as the compiler's error flag is set */
+			return;
+	}
+}
+
+struct temporary_allocation {	/* per-temporary bookkeeping used by allocate_temporary_registers() */
+	GLuint Allocated:1;	/* set once a hardware temporary has been assigned */
+	GLuint HwTemp:15;	/* index of the assigned hardware temporary */
+	struct rc_instruction * LastRead;	/* last instruction reading this temporary; stays zeroed if it is never read */
+};
+
+static void allocate_temporary_registers(struct r300_vertex_program_compiler * compiler)
+{	/* Renumber PROGRAM_TEMPORARY registers onto hardware temporaries, recycling a slot after its last read. */
+	struct rc_instruction *inst;
+	GLuint num_orig_temps = 0;
+	GLboolean hwtemps[VSF_MAX_FRAGMENT_TEMPS];	/* GL_TRUE while that hardware slot is occupied */
+	struct temporary_allocation * ta;	/* one entry per original temporary index */
+	GLuint i, j;
+
+	compiler->code->num_temporaries = 0;
+	memset(hwtemps, 0, sizeof(hwtemps));
+
+	/* Pass 1: Count original temporaries and allocate structures */
+	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
+		GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
+		GLuint numdsts = _mesa_num_inst_dst_regs(inst->I.Opcode);
+
+		for (i = 0; i < numsrcs; ++i) {
+			if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY) {
+				if (inst->I.SrcReg[i].Index >= num_orig_temps)
+					num_orig_temps = inst->I.SrcReg[i].Index + 1;
+			}
+		}
+
+		if (numdsts) {
+			if (inst->I.DstReg.File == PROGRAM_TEMPORARY) {
+				if (inst->I.DstReg.Index >= num_orig_temps)
+					num_orig_temps = inst->I.DstReg.Index + 1;
+			}
+		}
+	}
+
+	ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
+			sizeof(struct temporary_allocation) * num_orig_temps);
+	memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);	/* nothing allocated, no LastRead yet */
+
+	/* Pass 2: Determine original temporary lifetimes */
+	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
+		GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
+
+		for (i = 0; i < numsrcs; ++i) {
+			if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY)
+				ta[inst->I.SrcReg[i].Index].LastRead = inst;	/* later readers overwrite earlier ones */
+		}
+	}
+
+	/* Pass 3: Register allocation */
+	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
+		GLuint numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
+		GLuint numdsts = _mesa_num_inst_dst_regs(inst->I.Opcode);
+
+		for (i = 0; i < numsrcs; ++i) {
+			if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY) {
+				GLuint orig = inst->I.SrcReg[i].Index;
+				inst->I.SrcReg[i].Index = ta[orig].HwTemp;
+
+				if (ta[orig].Allocated && inst == ta[orig].LastRead)
+					hwtemps[ta[orig].HwTemp] = GL_FALSE;	/* freed before the destination is handled, so it may reuse this slot */
+			}
+		}
+
+		if (numdsts) {
+			if (inst->I.DstReg.File == PROGRAM_TEMPORARY) {
+				GLuint orig = inst->I.DstReg.Index;
+
+				if (!ta[orig].Allocated) {
+					for(j = 0; j < VSF_MAX_FRAGMENT_TEMPS; ++j) {	/* first free hardware slot */
+						if (!hwtemps[j])
+							break;
+					}
+					if (j >= VSF_MAX_FRAGMENT_TEMPS) {
+						rc_error(&compiler->Base, "Out of hw temporaries\n");
+						return;	/* report the failure instead of silently aliasing hardware temporary 0 */
+					}
+					ta[orig].Allocated = GL_TRUE;
+					ta[orig].HwTemp = j;
+					hwtemps[j] = GL_TRUE;
+
+					if (j >= compiler->code->num_temporaries)
+						compiler->code->num_temporaries = j + 1;	/* track the highest slot ever used */
+				}
+
+				inst->I.DstReg.Index = ta[orig].HwTemp;
+			}
+		}
+	}
+}
+
+
+/**
+ * Vertex engine cannot read two inputs or two constants at the same time.
+ * Introduce intermediate MOVs to temporary registers to account for this.
+ */
+static GLboolean transform_source_conflicts(
+	struct radeon_compiler *c,
+	struct rc_instruction* inst,
+	void* unused)	/* not referenced in this function */
+{
+	GLuint num_operands = _mesa_num_inst_src_regs(inst->I.Opcode);
+
+	if (num_operands == 3) {	/* third source first: move it out if it conflicts with either other source */
+		if (t_src_conflict(inst->I.SrcReg[1], inst->I.SrcReg[2])
+		    || t_src_conflict(inst->I.SrcReg[0], inst->I.SrcReg[2])) {
+			int tmpreg = rc_find_free_temporary(c);
+			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);	/* NOTE(review): presumably inserts after inst->Prev, i.e. right before inst -- confirm */
+			inst_mov->I.Opcode = OPCODE_MOV;
+			inst_mov->I.DstReg.File = PROGRAM_TEMPORARY;
+			inst_mov->I.DstReg.Index = tmpreg;
+			inst_mov->I.SrcReg[0] = inst->I.SrcReg[2];	/* the MOV takes over the complete original operand */
+
+			reset_srcreg(&inst->I.SrcReg[2]);
+			inst->I.SrcReg[2].File = PROGRAM_TEMPORARY;
+			inst->I.SrcReg[2].Index = tmpreg;
+		}
+	}
+
+	if (num_operands >= 2) {	/* then the first two sources, with SrcReg[2] possibly already replaced above */
+		if (t_src_conflict(inst->I.SrcReg[1], inst->I.SrcReg[0])) {
+			int tmpreg = rc_find_free_temporary(c);
+			struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
+			inst_mov->I.Opcode = OPCODE_MOV;
+			inst_mov->I.DstReg.File = PROGRAM_TEMPORARY;
+			inst_mov->I.DstReg.Index = tmpreg;
+			inst_mov->I.SrcReg[0] = inst->I.SrcReg[1];
+
+			reset_srcreg(&inst->I.SrcReg[1]);
+			inst->I.SrcReg[1].File = PROGRAM_TEMPORARY;
+			inst->I.SrcReg[1].Index = tmpreg;
+		}
+	}
+
+	return GL_TRUE;	/* returned unconditionally, whether or not a MOV was inserted */
+}
+
+static void addArtificialOutputs(struct r300_vertex_program_compiler * compiler)
+{	/* Add "MOV output[i], constant[0]" for every required output that the program does not write. */
+	int i;
+
+	for(i = 0; i < 32; ++i) {	/* one iteration per bit of the output masks */
+		if ((compiler->RequiredOutputs & (1U << i)) &&	/* 1U: shifting a signed 1 left by 31 is undefined behaviour */
+		    !(compiler->Base.Program.OutputsWritten & (1U << i))) {
+			struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);	/* anchored at the current last instruction */
+			inst->I.Opcode = OPCODE_MOV;
+
+			inst->I.DstReg.File = PROGRAM_OUTPUT;
+			inst->I.DstReg.Index = i;
+			inst->I.DstReg.WriteMask = WRITEMASK_XYZW;
+
+			inst->I.SrcReg[0].File = PROGRAM_CONSTANT;
+			inst->I.SrcReg[0].Index = 0;
+			inst->I.SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+			compiler->Base.Program.OutputsWritten |= 1U << i;	/* the output now counts as written */
+		}
+	}
+}
+
+static void nqssadceInit(struct nqssadce_state* s)
+{	/* Init callback for the NQSSADCE pass: mark every required output as read in all four components. */
+	struct r300_vertex_program_compiler * compiler = s->UserData;	/* the compiler is passed as user data by r3xx_compile_vertex_program() */
+	int i;
+
+	for(i = 0; i < VERT_RESULT_MAX; ++i) {
+		if (compiler->RequiredOutputs & (1 << i))
+			s->Outputs[i].Sourced = WRITEMASK_XYZW;
+	}
+}
+
+static GLboolean swizzleIsNative(GLuint opcode, struct prog_src_register reg)
+{	/* IsNativeSwizzle callback for the NQSSADCE pass: accepts every opcode/operand combination. */
+	(void) opcode;	/* deliberately unused */
+	(void) reg;	/* deliberately unused */
+
+	return GL_TRUE;
+}
+
+
+
+void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
+{	/* Compile compiler->Base.Program into hardware code stored in compiler->code; the passes run in the order below. */
+	addArtificialOutputs(compiler);	/* give every required-but-unwritten output a MOV */
+
+	{
+		struct radeon_program_transformation transformations[] = {
+			{ &r300_transform_vertex_alu, 0 },
+		};
+		radeonLocalTransform(&compiler->Base, 1, transformations);
+	}
+
+	if (compiler->Base.Debug) {
+		fprintf(stderr, "Vertex program after native rewrite:\n");
+		rc_print_program(&compiler->Base.Program);
+		fflush(stderr);
+	}
+
+	{
+		/* Note: This pass has to be done separately from ALU rewrite,
+		 * otherwise non-native ALU instructions with source conflicts
+		 * will not be treated properly.
+		 */
+		struct radeon_program_transformation transformations[] = {
+			{ &transform_source_conflicts, 0 },
+		};
+		radeonLocalTransform(&compiler->Base, 1, transformations);
+	}
+
+	if (compiler->Base.Debug) {
+		fprintf(stderr, "Vertex program after source conflict resolve:\n");
+		rc_print_program(&compiler->Base.Program);
+		fflush(stderr);
+	}
+
+	{
+		struct radeon_nqssadce_descr nqssadce = {
+			.Init = &nqssadceInit,
+			.IsNativeSwizzle = &swizzleIsNative,
+			.BuildSwizzle = NULL	/* no swizzle builder is supplied; swizzleIsNative accepts everything */
+		};
+		radeonNqssaDce(&compiler->Base, &nqssadce, compiler);	/* compiler becomes s->UserData in nqssadceInit() */
+
+		/* We need this step for reusing temporary registers */
+		allocate_temporary_registers(compiler);
+
+		if (compiler->Base.Debug) {	/* dumped after register allocation, so temporaries carry their hardware numbers */
+			fprintf(stderr, "Vertex program after NQSSADCE:\n");
+			rc_print_program(&compiler->Base.Program);
+			fflush(stderr);
+		}
+	}
+
+	translate_vertex_program(compiler);	/* emit the hardware dwords */
+
+	rc_constants_copy(&compiler->code->constants, &compiler->Base.Program.Constants);
+
+	compiler->code->InputsRead = compiler->Base.Program.InputsRead;
+	compiler->code->OutputsWritten = compiler->Base.Program.OutputsWritten;	/* includes the outputs added by addArtificialOutputs() */
+
+	if (compiler->Base.Debug) {
+		fprintf(stderr, "Final vertex program code:\n");
+		r300_vertex_program_dump(compiler->code);
+	}
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c
new file mode 100644
index 0000000000..980ef3eaea
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "radeon_code.h"
+
+#include <stdio.h>
+
+static char* r300_vs_ve_ops[] = {	/* vector opcode names, right-aligned; r300_vs_op_dump() indexes this with (op & 0x1f) */
+	/* R300 vector ops */
+	"                 VE_NO_OP",
+	"           VE_DOT_PRODUCT",
+	"              VE_MULTIPLY",
+	"                   VE_ADD",
+	"          VE_MULTIPLY_ADD",
+	"       VE_DISTANCE_FACTOR",
+	"              VE_FRACTION",
+	"               VE_MAXIMUM",
+	"               VE_MINIMUM",
+	"VE_SET_GREATER_THAN_EQUAL",
+	"         VE_SET_LESS_THAN",
+	"        VE_MULTIPLYX2_ADD",
+	"        VE_MULTIPLY_CLAMP",
+	"            VE_FLT2FIX_DX",
+	"        VE_FLT2FIX_DX_RND",
+	/* R500 vector ops */
+	"      VE_PRED_SET_EQ_PUSH",
+	"      VE_PRED_SET_GT_PUSH",
+	"     VE_PRED_SET_GTE_PUSH",
+	"     VE_PRED_SET_NEQ_PUSH",
+	"         VE_COND_WRITE_EQ",
+	"         VE_COND_WRITE_GT",
+	"        VE_COND_WRITE_GTE",
+	"        VE_COND_WRITE_NEQ",
+	"      VE_SET_GREATER_THAN",
+	"             VE_SET_EQUAL",
+	"         VE_SET_NOT_EQUAL",
+	"               (reserved)", "               (reserved)",	/* 26, 27 */
+	"               (reserved)", "               (reserved)",	/* 28, 29 */
+	"               (reserved)", "               (reserved)",	/* 30, 31: table padded to 32 entries so every 5-bit index is in bounds */
+};
+
+static char* r300_vs_me_ops[] = {	/* math opcode names, right-aligned; 32 entries, indexed with (op & 0x1f) by r300_vs_op_dump() */
+	/* R300 math ops */
+	"                 ME_NO_OP",
+	"          ME_EXP_BASE2_DX",
+	"          ME_LOG_BASE2_DX",
+	"          ME_EXP_BASEE_FF",
+	"        ME_LIGHT_COEFF_DX",
+	"         ME_POWER_FUNC_FF",
+	"              ME_RECIP_DX",
+	"              ME_RECIP_FF",
+	"         ME_RECIP_SQRT_DX",
+	"         ME_RECIP_SQRT_FF",
+	"              ME_MULTIPLY",
+	"     ME_EXP_BASE2_FULL_DX",
+	"     ME_LOG_BASE2_FULL_DX",
+	" ME_POWER_FUNC_FF_CLAMP_B",
+	"ME_POWER_FUNC_FF_CLAMP_B1",
+	"ME_POWER_FUNC_FF_CLAMP_01",
+	"                   ME_SIN",
+	"                   ME_COS",
+	/* R500 math ops */
+	"        ME_LOG_BASE2_IEEE",
+	"            ME_RECIP_IEEE",
+	"       ME_RECIP_SQRT_IEEE",
+	"           ME_PRED_SET_EQ",
+	"           ME_PRED_SET_GT",
+	"          ME_PRED_SET_GTE",
+	"          ME_PRED_SET_NEQ",
+	"          ME_PRED_SET_CLR",
+	"          ME_PRED_SET_INV",
+	"          ME_PRED_SET_POP",
+	"      ME_PRED_SET_RESTORE",
+	"               (reserved)",
+	"               (reserved)",
+	"               (reserved)",
+};
+
+/* XXX refactor to avoid clashing symbols */
+static char* r300_vs_src_debug[] = {	/* suffix printed after a source register number; indexed with (src & 0x3) in r300_vs_src_dump() */
+	"t",	/* NOTE(review): presumably "temporary" -- confirm against the register-class encoding */
+	"i",	/* NOTE(review): presumably "input" */
+	"c",	/* NOTE(review): presumably "constant" */
+	"a",	/* NOTE(review): meaning not evident from this file */
+};
+
+static char* r300_vs_dst_debug[] = {	/* suffix printed after a destination register number; indexed with ((op >> 8) & 0x7) in r300_vs_op_dump() */
+	"t",
+	"a0",
+	"o",
+	"ox",
+	"a",
+	"i",
+	"u",	/* NOTE(review): the two "u" entries presumably stand for unused/unknown encodings -- confirm */
+	"u",
+};
+
+static char* r300_vs_swiz_debug[] = {	/* name of a 3-bit swizzle selector; indexed with a value masked by 0x7 in r300_vs_src_dump() */
+	"X",
+	"Y",
+	"Z",
+	"W",
+	"0",
+	"1",
+	"U",	/* NOTE(review): the two "U" entries presumably stand for unused/unknown selectors -- confirm */
+	"U",
+};
+
+
+static void r300_vs_op_dump(uint32_t op)
+{	/* Print the destination and opcode name decoded from the first dword of an instruction to stderr. */
+	fprintf(stderr, " dst: %d%s op: ",
+			(op >> 13) & 0x7f, r300_vs_dst_debug[(op >> 8) & 0x7]);	/* bits 13..19: register number, bits 8..10: destination class */
+	if (op & 0x80) {	/* bit 7 set: printed as a macro op, bit 0 picks which one */
+		if (op & 0x1) {
+			fprintf(stderr, "PVS_MACRO_OP_2CLK_M2X_ADD\n");
+		} else {
+			fprintf(stderr, "   PVS_MACRO_OP_2CLK_MADD\n");
+		}
+	} else if (op & 0x40) {	/* bit 6 set: name comes from the math-op table */
+		fprintf(stderr, "%s\n", r300_vs_me_ops[op & 0x1f]);
+	} else {	/* otherwise the name comes from the vector-op table */
+		fprintf(stderr, "%s\n", r300_vs_ve_ops[op & 0x1f]);
+	}
+}
+
+static void r300_vs_src_dump(uint32_t src)
+{	/* Print one source operand dword to stderr: register, class suffix and four swizzle selectors with sign. */
+	fprintf(stderr, " reg: %d%s swiz: %s%s/%s%s/%s%s/%s%s\n",
+			(src >> 5) & 0x7f, r300_vs_src_debug[src & 0x3],	/* bits 5..11: register number, bits 0..1: class */
+			src & (1 << 25) ? "-" : " ",	/* bits 25..28 select the "-" prefix for the four selectors in turn */
+			r300_vs_swiz_debug[(src >> 13) & 0x7],	/* selectors are 3 bits each, starting at bits 13, 16, 19 and 22 */
+			src & (1 << 26) ? "-" : " ",
+			r300_vs_swiz_debug[(src >> 16) & 0x7],
+			src & (1 << 27) ? "-" : " ",
+			r300_vs_swiz_debug[(src >> 19) & 0x7],
+			src & (1 << 28) ? "-" : " ",
+			r300_vs_swiz_debug[(src >> 22) & 0x7]);
+}
+
+void r300_vertex_program_dump(struct r300_vertex_program_code * vs)
+{	/* Print every instruction of vs to stderr: the raw dwords plus their decoded opcode and sources. */
+	unsigned instrcount = vs->length / 4;	/* vs->length is in dwords, four per instruction */
+	unsigned i;
+
+	for(i = 0; i < instrcount; i++) {
+		unsigned offset = i*4;	/* index of this instruction's first dword in vs->body.d */
+		unsigned src;
+
+		fprintf(stderr, "%d: op: 0x%08x", i, vs->body.d[offset]);
+		r300_vs_op_dump(vs->body.d[offset]);
+
+		for(src = 0; src < 3; ++src) {	/* dwords 1..3 hold the three source operands */
+			fprintf(stderr, " src%i: 0x%08x", src, vs->body.d[offset+1+src]);
+			r300_vs_src_dump(vs->body.d[offset+1+src]);
+		}
+	}
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
new file mode 100644
index 0000000000..7e2faed690
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
@@ -0,0 +1,449 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "r500_fragprog.h"
+
+#include "../r300_reg.h"
+
+static struct prog_src_register shadow_ambient(struct radeon_compiler * c, int tmu)
+{	/* Build a source operand that refers to the RC_STATE_SHADOW_AMBIENT constant of texture unit tmu. */
+	struct prog_src_register reg = { 0, };
+
+	reg.File = PROGRAM_STATE_VAR;
+	reg.Index = rc_constants_add_state(&c->Program.Constants, RC_STATE_SHADOW_AMBIENT, tmu);	/* index comes from the program's constant list */
+	reg.Swizzle = SWIZZLE_WWWW;	/* the W component is replicated into all four channels */
+	return reg;
+}
+
+/**
+ * Transform TEX, TXP, TXB, and KIL instructions in the following way:
+ *  - implement texture compare (shadow extensions)
+ *  - extract non-native source / destination operands
+ */
+GLboolean r500_transform_TEX(
+	struct radeon_compiler * c,
+	struct rc_instruction * inst,
+	void* data)	/* must point to the struct r300_fragment_program_compiler (cast below) */
+{
+	struct r300_fragment_program_compiler *compiler =
+		(struct r300_fragment_program_compiler*)data;
+
+	if (inst->I.Opcode != OPCODE_TEX &&
+	    inst->I.Opcode != OPCODE_TXB &&
+	    inst->I.Opcode != OPCODE_TXP &&
+	    inst->I.Opcode != OPCODE_KIL)
+		return GL_FALSE;	/* not a texture/KIL instruction: left untouched */
+
+	/* ARB_shadow & EXT_shadow_funcs */
+	if (inst->I.Opcode != OPCODE_KIL &&
+	    c->Program.ShadowSamplers & (1 << inst->I.TexSrcUnit)) {
+		GLuint comparefunc = GL_NEVER + compiler->state.unit[inst->I.TexSrcUnit].texture_compare_func;	/* stored value is an offset from GL_NEVER */
+
+		if (comparefunc == GL_NEVER || comparefunc == GL_ALWAYS) {	/* constant result: the lookup is replaced by a MOV */
+			inst->I.Opcode = OPCODE_MOV;
+
+			if (comparefunc == GL_ALWAYS) {
+				inst->I.SrcReg[0].File = PROGRAM_BUILTIN;
+				inst->I.SrcReg[0].Swizzle = SWIZZLE_1111;
+			} else {
+				inst->I.SrcReg[0] = shadow_ambient(c, inst->I.TexSrcUnit);
+			}
+
+			return GL_TRUE;
+		} else {
+			GLuint comparefunc = GL_NEVER + compiler->state.unit[inst->I.TexSrcUnit].texture_compare_func;	/* same value as the outer comparefunc (shadows it) */
+			GLuint depthmode = compiler->state.unit[inst->I.TexSrcUnit].depth_texture_mode;
+			struct rc_instruction * inst_rcp = rc_insert_new_instruction(c, inst);	/* NOTE(review): presumably each call inserts after its second argument, giving TEX, RCP, MAD, CMP -- confirm */
+			struct rc_instruction * inst_mad = rc_insert_new_instruction(c, inst_rcp);
+			struct rc_instruction * inst_cmp = rc_insert_new_instruction(c, inst_mad);
+			int pass, fail;	/* CMP source slots (1 or 2) that receive the pass and fail values */
+
+			inst_rcp->I.Opcode = OPCODE_RCP;
+			inst_rcp->I.DstReg.File = PROGRAM_TEMPORARY;
+			inst_rcp->I.DstReg.Index = rc_find_free_temporary(c);
+			inst_rcp->I.DstReg.WriteMask = WRITEMASK_W;
+			inst_rcp->I.SrcReg[0] = inst->I.SrcReg[0];
+			inst_rcp->I.SrcReg[0].Swizzle = SWIZZLE_WWWW;	/* reciprocal of the W channel of the texture coordinate operand */
+
+			inst_cmp->I.DstReg = inst->I.DstReg;	/* CMP writes the original destination; TEX is redirected to a temporary */
+			inst->I.DstReg.File = PROGRAM_TEMPORARY;
+			inst->I.DstReg.Index = rc_find_free_temporary(c);
+			inst->I.DstReg.WriteMask = WRITEMASK_XYZW;
+
+			inst_mad->I.Opcode = OPCODE_MAD;
+			inst_mad->I.DstReg.File = PROGRAM_TEMPORARY;
+			inst_mad->I.DstReg.Index = rc_find_free_temporary(c);
+			inst_mad->I.SrcReg[0] = inst->I.SrcReg[0];
+			inst_mad->I.SrcReg[0].Swizzle = SWIZZLE_ZZZZ;
+			inst_mad->I.SrcReg[1].File = PROGRAM_TEMPORARY;
+			inst_mad->I.SrcReg[1].Index = inst_rcp->I.DstReg.Index;
+			inst_mad->I.SrcReg[1].Swizzle = SWIZZLE_WWWW;
+			inst_mad->I.SrcReg[2].File = PROGRAM_TEMPORARY;
+			inst_mad->I.SrcReg[2].Index = inst->I.DstReg.Index;	/* the texture lookup result */
+			if (depthmode == 0) /* GL_LUMINANCE */
+				inst_mad->I.SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z);
+			else if (depthmode == 2) /* GL_ALPHA */
+				inst_mad->I.SrcReg[2].Swizzle = SWIZZLE_WWWW;
+
+			/* Recall that SrcReg[0] is tex, SrcReg[2] is r and:
+			 *   r  < tex  <=>      -tex+r < 0
+			 *   r >= tex  <=> not (-tex+r < 0) */
+			if (comparefunc == GL_LESS || comparefunc == GL_GEQUAL)
+				inst_mad->I.SrcReg[2].Negate = inst_mad->I.SrcReg[2].Negate ^ NEGATE_XYZW;
+			else
+				inst_mad->I.SrcReg[0].Negate = inst_mad->I.SrcReg[0].Negate ^ NEGATE_XYZW;
+
+			inst_cmp->I.Opcode = OPCODE_CMP;
+			/* DstReg has been filled out above */
+			inst_cmp->I.SrcReg[0].File = PROGRAM_TEMPORARY;
+			inst_cmp->I.SrcReg[0].Index = inst_mad->I.DstReg.Index;	/* CMP selects on the MAD result */
+
+			if (comparefunc == GL_LESS || comparefunc == GL_GREATER) {
+				pass = 1;
+				fail = 2;
+			} else {
+				pass = 2;
+				fail = 1;
+			}
+
+			inst_cmp->I.SrcReg[pass].File = PROGRAM_BUILTIN;
+			inst_cmp->I.SrcReg[pass].Swizzle = SWIZZLE_1111;	/* passing fragments get 1 */
+			inst_cmp->I.SrcReg[fail] = shadow_ambient(c, inst->I.TexSrcUnit);	/* failing fragments get the shadow ambient constant */
+		}
+	}
+
+	/* Cannot write texture to output registers */
+	if (inst->I.Opcode != OPCODE_KIL && inst->I.DstReg.File != PROGRAM_TEMPORARY) {
+		struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst);
+
+		inst_mov->I.Opcode = OPCODE_MOV;
+		inst_mov->I.DstReg = inst->I.DstReg;	/* the MOV takes over the original destination */
+		inst_mov->I.SrcReg[0].File = PROGRAM_TEMPORARY;
+		inst_mov->I.SrcReg[0].Index = rc_find_free_temporary(c);
+
+		inst->I.DstReg.File = PROGRAM_TEMPORARY;
+		inst->I.DstReg.Index = inst_mov->I.SrcReg[0].Index;
+		inst->I.DstReg.WriteMask = WRITEMASK_XYZW;
+	}
+
+	/* Cannot read texture coordinate from constants file */
+	if (inst->I.SrcReg[0].File != PROGRAM_TEMPORARY && inst->I.SrcReg[0].File != PROGRAM_INPUT) {
+		struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
+
+		inst_mov->I.Opcode = OPCODE_MOV;
+		inst_mov->I.DstReg.File = PROGRAM_TEMPORARY;
+		inst_mov->I.DstReg.Index = rc_find_free_temporary(c);
+		inst_mov->I.SrcReg[0] = inst->I.SrcReg[0];	/* the MOV carries the complete original operand */
+
+		reset_srcreg(&inst->I.SrcReg[0]);
+		inst->I.SrcReg[0].File = PROGRAM_TEMPORARY;
+		inst->I.SrcReg[0].Index = inst_mov->I.DstReg.Index;
+	}
+
+	return GL_TRUE;
+}
+
+GLboolean r500FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg)
+{	/* Return GL_TRUE when operand reg (swizzle, Abs, Negate) is accepted as-is for opcode, GL_FALSE otherwise. */
+	GLuint relevant;
+	int i;
+
+	if (opcode == OPCODE_TEX ||
+	    opcode == OPCODE_TXB ||
+	    opcode == OPCODE_TXP ||
+	    opcode == OPCODE_KIL) {
+		if (reg.Abs)
+			return GL_FALSE;
+
+		if (opcode == OPCODE_KIL && (reg.Swizzle != SWIZZLE_NOOP || reg.Negate != NEGATE_NONE))
+			return GL_FALSE;	/* KIL accepts only the identity swizzle without negation */
+
+		if (reg.Negate)	/* reg is a by-value copy; after the flip, set bits mark components that are NOT negated */
+			reg.Negate ^= NEGATE_XYZW;
+
+		for(i = 0; i < 4; ++i) {
+			GLuint swz = GET_SWZ(reg.Swizzle, i);
+			if (swz == SWIZZLE_NIL) {
+				reg.Negate &= ~(1 << i);	/* NIL components do not take part in the negate check */
+				continue;
+			}
+			if (swz >= 4)	/* selectors beyond the four register components are rejected */
+				return GL_FALSE;
+		}
+
+		if (reg.Negate)	/* negation applied to only some of the used components */
+			return GL_FALSE;
+
+		return GL_TRUE;
+	} else if (opcode == OPCODE_DDX || opcode == OPCODE_DDY) {
+		/* DDX/MDH and DDY/MDV explicitly ignore incoming swizzles;
+		 * if it doesn't fit perfectly into a .xyzw case... */
+		if (reg.Swizzle == SWIZZLE_NOOP && !reg.Abs && !reg.Negate)
+			return GL_TRUE;
+
+		return GL_FALSE;
+	} else {
+		/* ALU instructions support almost everything */
+		if (reg.Abs)
+			return GL_TRUE;
+
+		relevant = 0;
+		for(i = 0; i < 3; ++i) {	/* only the first three components are examined here */
+			GLuint swz = GET_SWZ(reg.Swizzle, i);
+			if (swz != SWIZZLE_NIL && swz != SWIZZLE_ZERO)
+				relevant |= 1 << i;
+		}
+		if ((reg.Negate & relevant) && ((reg.Negate & relevant) != relevant))	/* mixed negation across the relevant components */
+			return GL_FALSE;
+
+		return GL_TRUE;
+	}
+}
+
+/**
+ * Implement a MOV with a potentially non-native swizzle.
+ *
+ * The only thing we *cannot* do in an ALU instruction is per-component
+ * negation. Therefore, we split the MOV into two instructions when necessary.
+ */
+void r500FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst, struct prog_src_register src)
+{
+	GLuint negatebase[2] = { 0, 0 };	/* [0]: write mask of non-negated components, [1]: of negated components */
+	int i;
+
+	for(i = 0; i < 4; ++i) {
+		GLuint swz = GET_SWZ(src.Swizzle, i);
+		if (swz == SWIZZLE_NIL)	/* NIL components are written by neither MOV */
+			continue;
+		negatebase[GET_BIT(src.Negate, i)] |= 1 << i;
+	}
+
+	for(i = 0; i <= 1; ++i) {	/* at most two MOVs: one per non-empty group */
+		if (!negatebase[i])
+			continue;
+
+		struct rc_instruction *inst = rc_insert_new_instruction(s->Compiler, s->IP->Prev);
+		inst->I.Opcode = OPCODE_MOV;
+		inst->I.DstReg = dst;
+		inst->I.DstReg.WriteMask = negatebase[i];	/* each MOV writes only its own group of components */
+		inst->I.SrcReg[0] = src;
+		inst->I.SrcReg[0].Negate = (i == 0) ? NEGATE_NONE : NEGATE_XYZW;	/* whole-operand negation stands in for the per-component bits */
+	}
+}
+
+
+static char *toswiz(int swiz_val) {	/* Name of a swizzle selector for the dump output; NULL for values outside 0..7. */
+  switch(swiz_val) {
+  case 0: return "R";
+  case 1: return "G";
+  case 2: return "B";
+  case 3: return "A";
+  case 4: return "0";
+  case 5: return "1/2";
+  case 6: return "1";
+  case 7: return "U";
+  }
+  return NULL;	/* not reached from r500FragmentProgramDump(), which masks every argument to at most 3 bits */
+}
+
+static char *toop(int op_val)	/* Name of the opcode printed on the RGBA_INST line of the dump; never returns NULL. */
+{
+  char *str = "(unknown)";	/* placeholder for values without a case below; the caller feeds the result to a %s conversion */
+  switch (op_val) {
+  case 0: str = "MAD"; break;
+  case 1: str = "DP3"; break;
+  case 2: str = "DP4"; break;
+  case 3: str = "D2A"; break;
+  case 4: str = "MIN"; break;
+  case 5: str = "MAX"; break;
+  case 6: str = "Reserved"; break;
+  case 7: str = "CND"; break;
+  case 8: str = "CMP"; break;
+  case 9: str = "FRC"; break;
+  case 10: str = "SOP"; break;
+  case 11: str = "MDH"; break;
+  case 12: str = "MDV"; break;
+  }
+  return str;
+}
+
+static char *to_alpha_op(int op_val)	/* Name of the opcode printed on the ALPHA_INST line of the dump; NULL for values outside 0..15. */
+{
+  char *str = NULL;
+  switch (op_val) {
+  case 0: str = "MAD"; break;
+  case 1: str = "DP"; break;
+  case 2: str = "MIN"; break;
+  case 3: str = "MAX"; break;
+  case 4: str = "Reserved"; break;
+  case 5: str = "CND"; break;
+  case 6: str = "CMP"; break;
+  case 7: str = "FRC"; break;
+  case 8: str = "EX2"; break;
+  case 9: str = "LN2"; break;
+  case 10: str = "RCP"; break;
+  case 11: str = "RSQ"; break;
+  case 12: str = "SIN"; break;
+  case 13: str = "COS"; break;
+  case 14: str = "MDH"; break;
+  case 15: str = "MDV"; break;
+  }
+  return str;	/* the only caller passes (inst & 0xf), so a case always matches */
+}
+
+static char *to_mask(int val)	/* Text for a 4-bit write mask (bit 0 = R, 1 = G, 2 = B, 3 = A); NULL for values outside 0..15. */
+{
+  char *str = NULL;
+  switch(val) {
+  case 0: str = "NONE"; break;
+  case 1: str = "R"; break;
+  case 2: str = "G"; break;
+  case 3: str = "RG"; break;
+  case 4: str = "B"; break;
+  case 5: str = "RB"; break;
+  case 6: str = "GB"; break;
+  case 7: str = "RGB"; break;
+  case 8: str = "A"; break;
+  case 9: str = "AR"; break;
+  case 10: str = "AG"; break;
+  case 11: str = "ARG"; break;
+  case 12: str = "AB"; break;
+  case 13: str = "ARB"; break;
+  case 14: str = "AGB"; break;
+  case 15: str = "ARGB"; break;
+  }
+  return str;	/* callers mask the argument with 0xf, so a case always matches */
+}
+
+static char *to_texop(int val)	/* Name of the texture operation printed on the TEX_INST line of the dump; never returns NULL. */
+{
+  switch(val) {
+  case 0: return "NOP";
+  case 1: return "LD";
+  case 2: return "TEXKILL";
+  case 3: return "PROJ";
+  case 4: return "LODBIAS";
+  case 5: return "LOD";
+  case 6: return "DXDY";
+  }
+  return "(unknown)";	/* the caller passes a 3-bit field, so 7 can occur, and the result is printed with %s */
+}
+
+void r500FragmentProgramDump(struct rX00_fragment_program_code *c)
+{	/* Print a decoded listing of the R500 fragment program stored in c->code.r500 to stderr. */
+  struct r500_fragment_program_code *code = &c->code.r500;
+  fprintf(stderr, "R500 Fragment Program:\n--------\n");
+
+  int n;
+  uint32_t inst;	/* dword currently being decoded */
+  uint32_t inst0;	/* copy of the first dword, kept to select the per-type decoding below */
+  char *str = NULL;
+
+  for (n = 0; n < code->inst_end+1; n++) {	/* inst_end is the index of the last instruction, hence the +1 */
+    inst0 = inst = code->inst[n].inst0;
+    fprintf(stderr,"%d\t0:CMN_INST   0x%08x:", n, inst);
+    switch(inst & 0x3) {	/* low two bits select the instruction type */
+    case R500_INST_TYPE_ALU: str = "ALU"; break;
+    case R500_INST_TYPE_OUT: str = "OUT"; break;
+    case R500_INST_TYPE_FC: str = "FC"; break;
+    case R500_INST_TYPE_TEX: str = "TEX"; break;
+    };
+    fprintf(stderr,"%s %s %s %s %s ", str,
+	    inst & R500_INST_TEX_SEM_WAIT ? "TEX_WAIT" : "",
+	    inst & R500_INST_LAST ? "LAST" : "",
+	    inst & R500_INST_NOP ? "NOP" : "",
+	    inst & R500_INST_ALU_WAIT ? "ALU WAIT" : "");
+    fprintf(stderr,"wmask: %s omask: %s\n", to_mask((inst >> 11) & 0xf),
+	    to_mask((inst >> 15) & 0xf));
+
+    switch(inst0 & 0x3) {
+    case 0:
+    case 1:	/* types 0 and 1 share the five-dword ALU layout decoded below */
+      fprintf(stderr,"\t1:RGB_ADDR   0x%08x:", code->inst[n].inst1);
+      inst = code->inst[n].inst1;
+
+      fprintf(stderr,"Addr0: %d%c, Addr1: %d%c, Addr2: %d%c, srcp:%d\n",
+	      inst & 0xff, (inst & (1<<8)) ? 'c' : 't',
+	      (inst >> 10) & 0xff, (inst & (1<<18)) ? 'c' : 't',
+	      (inst >> 20) & 0xff, (inst & (1<<28)) ? 'c' : 't',
+	      (inst >> 30));
+
+      fprintf(stderr,"\t2:ALPHA_ADDR 0x%08x:", code->inst[n].inst2);
+      inst = code->inst[n].inst2;
+      fprintf(stderr,"Addr0: %d%c, Addr1: %d%c, Addr2: %d%c, srcp:%d\n",
+	      inst & 0xff, (inst & (1<<8)) ? 'c' : 't',
+	      (inst >> 10) & 0xff, (inst & (1<<18)) ? 'c' : 't',
+	      (inst >> 20) & 0xff, (inst & (1<<28)) ? 'c' : 't',
+	      (inst >> 30));
+      fprintf(stderr,"\t3 RGB_INST:  0x%08x:", code->inst[n].inst3);
+      inst = code->inst[n].inst3;
+      fprintf(stderr,"rgb_A_src:%d %s/%s/%s %d rgb_B_src:%d %s/%s/%s %d\n",
+	      (inst) & 0x3, toswiz((inst >> 2) & 0x7), toswiz((inst >> 5) & 0x7), toswiz((inst >> 8) & 0x7),
+	      (inst >> 11) & 0x3,
+	      (inst >> 13) & 0x3, toswiz((inst >> 15) & 0x7), toswiz((inst >> 18) & 0x7), toswiz((inst >> 21) & 0x7),
+	      (inst >> 24) & 0x3);
+
+
+      fprintf(stderr,"\t4 ALPHA_INST:0x%08x:", code->inst[n].inst4);
+      inst = code->inst[n].inst4;
+      fprintf(stderr,"%s dest:%d%s alp_A_src:%d %s %d alp_B_src:%d %s %d w:%d\n", to_alpha_op(inst & 0xf),
+	      (inst >> 4) & 0x7f, inst & (1<<11) ? "(rel)":"",
+	      (inst >> 12) & 0x3, toswiz((inst >> 14) & 0x7), (inst >> 17) & 0x3,
+	      (inst >> 19) & 0x3, toswiz((inst >> 21) & 0x7), (inst >> 24) & 0x3,
+	      (inst >> 31) & 0x1);
+
+      fprintf(stderr,"\t5 RGBA_INST: 0x%08x:", code->inst[n].inst5);
+      inst = code->inst[n].inst5;
+      fprintf(stderr,"%s dest:%d%s rgb_C_src:%d %s/%s/%s %d alp_C_src:%d %s %d\n", toop(inst & 0xf),
+	      (inst >> 4) & 0x7f, inst & (1<<11) ? "(rel)":"",
+	      (inst >> 12) & 0x3, toswiz((inst >> 14) & 0x7), toswiz((inst >> 17) & 0x7), toswiz((inst >> 20) & 0x7),
+	      (inst >> 23) & 0x3,
+	      (inst >> 25) & 0x3, toswiz((inst >> 27) & 0x7), (inst >> 30) & 0x3);
+      break;
+    case 2:	/* nothing beyond the common dword is printed for this type */
+      break;
+    case 3:	/* texture instruction: dwords 1..3 are decoded */
+      inst = code->inst[n].inst1;
+      fprintf(stderr,"\t1:TEX_INST:  0x%08x: id: %d op:%s, %s, %s %s\n", inst, (inst >> 16) & 0xf,
+	      to_texop((inst >> 22) & 0x7), (inst & (1<<25)) ? "ACQ" : "",
+	      (inst & (1<<26)) ? "IGNUNC" : "", (inst & (1<<27)) ? "UNSCALED" : "SCALED");
+      inst = code->inst[n].inst2;
+      fprintf(stderr,"\t2:TEX_ADDR:  0x%08x: src: %d%s %s/%s/%s/%s dst: %d%s %s/%s/%s/%s\n", inst,
+	      inst & 127, inst & (1<<7) ? "(rel)" : "",
+	      toswiz((inst >> 8) & 0x3), toswiz((inst >> 10) & 0x3),
+	      toswiz((inst >> 12) & 0x3), toswiz((inst >> 14) & 0x3),
+	      (inst >> 16) & 127, inst & (1<<23) ? "(rel)" : "",
+	      toswiz((inst >> 24) & 0x3), toswiz((inst >> 26) & 0x3),
+	      toswiz((inst >> 28) & 0x3), toswiz((inst >> 30) & 0x3));
+
+      fprintf(stderr,"\t3:TEX_DXDY:  0x%08x\n", code->inst[n].inst3);
+      break;
+    }
+    fprintf(stderr,"\n");
+  }
+
+}
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.h b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
index 1e45538f80..9091f65cd2 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog.h
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
@@ -33,30 +33,23 @@
 #ifndef __R500_FRAGPROG_H_
 #define __R500_FRAGPROG_H_
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
 #include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/program.h"
 #include "shader/prog_instruction.h"
 
-#include "r300_context.h"
-#include "r300_state.h"
-#include "radeon_program.h"
+#include "radeon_compiler.h"
+#include "radeon_nqssadce.h"
 
-struct r500_fragment_program;
+extern void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler);
 
-extern void r500TranslateFragmentShader(r300ContextPtr r300,
-					struct r500_fragment_program *fp);
+extern void r500FragmentProgramDump(struct rX00_fragment_program_code *c);
 
-struct r500_fragment_program_compiler {
-	r300ContextPtr r300;
-	struct r500_fragment_program *fp;
-	struct r500_fragment_program_code *code;
-	struct gl_program *program;
-};
+extern GLboolean r500FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg);
 
-extern GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler);
+extern void r500FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst, struct prog_src_register src);
+
+extern GLboolean r500_transform_TEX(
+	struct radeon_compiler * c,
+	struct rc_instruction * inst,
+	void* data);
 
 #endif
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
index 4631235f0d..d694725c9b 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
@@ -45,47 +45,22 @@
 
 #include "r500_fragprog.h"
 
+#include "../r300_reg.h"
+
 #include "radeon_program_pair.h"
 
 
 #define PROG_CODE \
-	struct r500_fragment_program_compiler *c = (struct r500_fragment_program_compiler*)data; \
-	struct r500_fragment_program_code *code = c->code
+	struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)data; \
+	struct r500_fragment_program_code *code = &c->code->code.r500
 
 #define error(fmt, args...) do {			\
-		fprintf(stderr, "%s::%s(): " fmt "\n",	\
+		rc_error(&c->Base, "%s::%s(): " fmt "\n",	\
 			__FILE__, __FUNCTION__, ##args);	\
 	} while(0)
 
 
-/**
- * Callback to register hardware constants.
- */
-static GLboolean emit_const(void *data, GLuint file, GLuint idx, GLuint *hwindex)
-{
-	PROG_CODE;
-
-	for (*hwindex = 0; *hwindex < code->const_nr; ++*hwindex) {
-		if (code->constant[*hwindex].File == file &&
-		    code->constant[*hwindex].Index == idx)
-			break;
-	}
-
-	if (*hwindex >= code->const_nr) {
-		if (*hwindex >= PFS_NUM_CONST_REGS) {
-			error("Out of hw constants!\n");
-			return GL_FALSE;
-		}
-
-		code->const_nr++;
-		code->constant[*hwindex].File = file;
-		code->constant[*hwindex].Index = idx;
-	}
-
-	return GL_TRUE;
-}
-
-static GLuint translate_rgb_op(GLuint opcode)
+static GLuint translate_rgb_op(struct r300_fragment_program_compiler *c, GLuint opcode)
 {
 	switch(opcode) {
 	case OPCODE_CMP: return R500_ALU_RGBA_OP_CMP;
@@ -106,7 +81,7 @@ static GLuint translate_rgb_op(GLuint opcode)
 	}
 }
 
-static GLuint translate_alpha_op(GLuint opcode)
+static GLuint translate_alpha_op(struct r300_fragment_program_compiler *c, GLuint opcode)
 {
 	switch(opcode) {
 	case OPCODE_CMP: return R500_ALPHA_OP_CMP;
@@ -189,8 +164,8 @@ static GLboolean emit_paired(void *data, struct radeon_pair_instruction *inst)
 
 	int ip = ++code->inst_end;
 
-	code->inst[ip].inst5 = translate_rgb_op(inst->RGB.Opcode);
-	code->inst[ip].inst4 = translate_alpha_op(inst->Alpha.Opcode);
+	code->inst[ip].inst5 = translate_rgb_op(c, inst->RGB.Opcode);
+	code->inst[ip].inst4 = translate_alpha_op(c, inst->Alpha.Opcode);
 
 	if (inst->RGB.OutputWriteMask || inst->Alpha.OutputWriteMask || inst->Alpha.DepthWriteMask)
 		code->inst[ip].inst0 = R500_INST_TYPE_OUT;
@@ -202,7 +177,7 @@ static GLboolean emit_paired(void *data, struct radeon_pair_instruction *inst)
 	code->inst[ip].inst0 |= (inst->RGB.OutputWriteMask << 15) | (inst->Alpha.OutputWriteMask << 18);
 	if (inst->Alpha.DepthWriteMask) {
 		code->inst[ip].inst4 |= R500_ALPHA_W_OMASK;
-		c->fp->writes_depth = GL_TRUE;
+		c->code->writes_depth = GL_TRUE;
 	}
 
 	code->inst[ip].inst4 |= R500_ALPHA_ADDRD(inst->Alpha.DestIndex);
@@ -234,19 +209,19 @@ static GLboolean emit_paired(void *data, struct radeon_pair_instruction *inst)
 	return GL_TRUE;
 }
 
-static GLuint translate_strq_swizzle(struct prog_src_register src)
+static GLuint translate_strq_swizzle(GLuint swizzle)
 {
 	GLuint swiz = 0;
 	int i;
 	for (i = 0; i < 4; i++)
-		swiz |= (GET_SWZ(src.Swizzle, i) & 0x3) << i*2;
+		swiz |= (GET_SWZ(swizzle, i) & 0x3) << i*2;
 	return swiz;
 }
 
 /**
  * Emit a single TEX instruction
  */
-static GLboolean emit_tex(void *data, struct prog_instruction *inst)
+static GLboolean emit_tex(void *data, struct radeon_pair_texture_instruction *inst)
 {
 	PROG_CODE;
 
@@ -258,7 +233,7 @@ static GLboolean emit_tex(void *data, struct prog_instruction *inst)
 	int ip = ++code->inst_end;
 
 	code->inst[ip].inst0 = R500_INST_TYPE_TEX
-		| (inst->DstReg.WriteMask << 11)
+		| (inst->WriteMask << 11)
 		| R500_INST_TEX_SEM_WAIT;
 	code->inst[ip].inst1 = R500_TEX_ID(inst->TexSrcUnit)
 		| R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED;
@@ -267,25 +242,25 @@ static GLboolean emit_tex(void *data, struct prog_instruction *inst)
 	        code->inst[ip].inst1 |= R500_TEX_UNSCALED;
 
 	switch (inst->Opcode) {
-	case OPCODE_KIL:
+	case RADEON_OPCODE_KIL:
 		code->inst[ip].inst1 |= R500_TEX_INST_TEXKILL;
 		break;
-	case OPCODE_TEX:
+	case RADEON_OPCODE_TEX:
 		code->inst[ip].inst1 |= R500_TEX_INST_LD;
 		break;
-	case OPCODE_TXB:
+	case RADEON_OPCODE_TXB:
 		code->inst[ip].inst1 |= R500_TEX_INST_LODBIAS;
 		break;
-	case OPCODE_TXP:
+	case RADEON_OPCODE_TXP:
 		code->inst[ip].inst1 |= R500_TEX_INST_PROJ;
 		break;
 	default:
 		error("emit_tex can't handle opcode %x\n", inst->Opcode);
 	}
 
-	code->inst[ip].inst2 = R500_TEX_SRC_ADDR(inst->SrcReg[0].Index)
-		| (translate_strq_swizzle(inst->SrcReg[0]) << 8)
-		| R500_TEX_DST_ADDR(inst->DstReg.Index)
+	code->inst[ip].inst2 = R500_TEX_SRC_ADDR(inst->SrcIndex)
+		| (translate_strq_swizzle(inst->SrcSwizzle) << 8)
+		| R500_TEX_DST_ADDR(inst->DestIndex)
 		| R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G
 		| R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A;
 
@@ -293,35 +268,32 @@ static GLboolean emit_tex(void *data, struct prog_instruction *inst)
 }
 
 static const struct radeon_pair_handler pair_handler = {
-	.EmitConst = emit_const,
 	.EmitPaired = emit_paired,
 	.EmitTex = emit_tex,
 	.MaxHwTemps = 128
 };
 
-GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler)
+void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler)
 {
-	struct r500_fragment_program_code *code = compiler->code;
+	struct r500_fragment_program_code *code = &compiler->code->code.r500;
 
 	_mesa_bzero(code, sizeof(*code));
 	code->max_temp_idx = 1;
-	code->inst_offset = 0;
 	code->inst_end = -1;
 
-	if (!radeonPairProgram(compiler->r300->radeon.glCtx, compiler->program, &pair_handler, compiler))
-		return GL_FALSE;
+	radeonPairProgram(compiler, &pair_handler, compiler);
+	if (compiler->Base.Error)
+		return;
 
 	if ((code->inst[code->inst_end].inst0 & R500_INST_TYPE_MASK) != R500_INST_TYPE_OUT) {
 		/* This may happen when dead-code elimination is disabled or
 		 * when most of the fragment program logic is leading to a KIL */
 		if (code->inst_end >= 511) {
-			error("Introducing fake OUT: Too many instructions");
-			return GL_FALSE;
+			rc_error(&compiler->Base, "Introducing fake OUT: Too many instructions");
+			return;
 		}
 
 		int ip = ++code->inst_end;
 		code->inst[ip].inst0 = R500_INST_TYPE_OUT | R500_INST_TEX_SEM_WAIT;
 	}
-
-	return GL_TRUE;
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.c b/src/mesa/drivers/dri/r300/compiler/radeon_code.c
new file mode 100644
index 0000000000..c7923004df
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright (C) 2009 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "main/mtypes.h"
+#include "shader/prog_instruction.h"
+
+#include "radeon_code.h"
+
+void rc_constants_init(struct rc_constant_list * c)
+{
+	memset(c, 0, sizeof(*c)); /* zero every field: no array, Count == _Reserved == 0 */
+}
+
+/* Copy src into dst (treated as uninitialized); aborts if allocation fails. */
+void rc_constants_copy(struct rc_constant_list * dst, struct rc_constant_list * src)
+{
+	size_t bytes = sizeof(struct rc_constant) * src->Count;
+	dst->Constants = bytes ? malloc(bytes) : NULL; /* empty source: no storage */
+	if (bytes && !dst->Constants)
+		abort(); /* void return: no way to report the failure */
+	if (bytes) /* src->Constants may be NULL when src is empty */
+		memcpy(dst->Constants, src->Constants, bytes);
+	dst->Count = dst->_Reserved = src->Count;
+}
+
+void rc_constants_destroy(struct rc_constant_list * c)
+{
+	free(c->Constants); /* only the array; the list struct itself is not freed */
+	memset(c, 0, sizeof(*c)); /* same all-zero state rc_constants_init() produces */
+}
+
+/**
+ * Append a copy of *constant and return its index. Capacity starts at 16
+ * entries and doubles whenever the list is full.
+ */
+unsigned rc_constants_add(struct rc_constant_list * c, struct rc_constant * constant)
+{
+	unsigned index = c->Count;
+
+	if (c->Count >= c->_Reserved) {
+		unsigned reserved = c->_Reserved ? c->_Reserved * 2 : 16;
+		/* realloc() accepts the NULL array of an empty list; memcpy() must
+		 * not be handed a NULL source, even for a zero-byte copy. */
+		struct rc_constant * newlist = realloc(c->Constants, sizeof(*newlist) * reserved);
+		if (!newlist)
+			abort(); /* no error channel; never write through NULL */
+		c->Constants = newlist;
+		c->_Reserved = reserved;
+	}
+
+	c->Constants[index] = *constant;
+	c->Count++;
+	return index;
+}
+
+
+/**
+ * Return the index of the state constant (state0, state1), appending a
+ * new 4-component RC_CONSTANT_STATE entry only if none exists yet.
+ */
+unsigned rc_constants_add_state(struct rc_constant_list * c, unsigned state0, unsigned state1)
+{
+	struct rc_constant fresh;
+	unsigned i;
+
+	for(i = 0; i < c->Count; ++i) {
+		const struct rc_constant * cur = &c->Constants[i];
+		if (cur->Type != RC_CONSTANT_STATE)
+			continue;
+		if (cur->u.State[0] == state0 && cur->u.State[1] == state1)
+			return i;
+	}
+
+	memset(&fresh, 0, sizeof(fresh));
+	fresh.Type = RC_CONSTANT_STATE;
+	fresh.Size = 4;
+	fresh.u.State[0] = state0;
+	fresh.u.State[1] = state1;
+	return rc_constants_add(c, &fresh);
+}
+
+
+/* Return the index of an immediate whose four components equal data[0..3]
+ * bit for bit, appending a new RC_CONSTANT_IMMEDIATE entry if none exists. */
+unsigned rc_constants_add_immediate_vec4(struct rc_constant_list * c, const float * data)
+{
+	struct rc_constant constant;
+	unsigned index;
+
+	for(index = 0; index < c->Count; ++index) {
+		struct rc_constant * cur = &c->Constants[index];
+		if (cur->Type != RC_CONSTANT_IMMEDIATE ||
+		    memcmp(cur->u.Immediate, data, sizeof(float)*4))
+			continue;
+		/* All four components are now relied upon: pin the entry so that
+		 * rc_constants_add_immediate_scalar() cannot pack into its tail. */
+		cur->Size = 4;
+		return index;
+	}
+	memset(&constant, 0, sizeof(constant));
+	constant.Type = RC_CONSTANT_IMMEDIATE;
+	constant.Size = 4;
+	memcpy(constant.u.Immediate, data, sizeof(float) * 4);
+	return rc_constants_add(c, &constant);
+}
+
+
+/**
+ * Find or create a single float immediate. Up to four scalars share one
+ * constant; *swizzle is set to replicate the component holding data.
+ */
+unsigned rc_constants_add_immediate_scalar(struct rc_constant_list * c, float data, unsigned * swizzle)
+{
+	struct rc_constant fresh;
+	int spare = -1;
+	unsigned i, slot;
+
+	for(i = 0; i < c->Count; ++i) {
+		struct rc_constant * cur = &c->Constants[i];
+		unsigned comp;
+		if (cur->Type != RC_CONSTANT_IMMEDIATE)
+			continue;
+		for(comp = 0; comp < cur->Size; ++comp) {
+			if (cur->u.Immediate[comp] != data)
+				continue;
+			*swizzle = MAKE_SWIZZLE4(comp, comp, comp, comp);
+			return i;
+		}
+		/* Remember the last immediate that still has an unused component. */
+		if (cur->Size < 4)
+			spare = i;
+	}
+	if (spare < 0) {
+		/* Nothing to pack into: start a new one-component immediate. */
+		memset(&fresh, 0, sizeof(fresh));
+		fresh.Type = RC_CONSTANT_IMMEDIATE;
+		fresh.Size = 1;
+		fresh.u.Immediate[0] = data;
+		*swizzle = SWIZZLE_XXXX;
+		return rc_constants_add(c, &fresh);
+	}
+	slot = c->Constants[spare].Size++;
+	c->Constants[spare].u.Immediate[slot] = data;
+	*swizzle = MAKE_SWIZZLE4(slot, slot, slot, slot);
+	return spare;
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.h b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
new file mode 100644
index 0000000000..3e88554ba1
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
@@ -0,0 +1,207 @@
+/*
+ * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef RADEON_CODE_H
+#define RADEON_CODE_H
+
+#include <stdint.h>
+
+#define R300_PFS_MAX_ALU_INST     64
+#define R300_PFS_MAX_TEX_INST     32
+#define R300_PFS_MAX_TEX_INDIRECT 4
+#define R300_PFS_NUM_TEMP_REGS    32
+#define R300_PFS_NUM_CONST_REGS   32
+
+#define R500_PFS_MAX_INST         512
+#define R500_PFS_NUM_TEMP_REGS    128
+#define R500_PFS_NUM_CONST_REGS   256
+
+
+#define STATE_R300_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0)
+
+enum {
+	/**
+	 * External constants are constants whose meaning is unknown to this
+	 * compiler. For example, a Mesa gl_program's constants are turned
+	 * into external constants.
+	 */
+	RC_CONSTANT_EXTERNAL = 0,
+	/** Literal float data held in the constant itself (u.Immediate), up to 4 components. */
+	RC_CONSTANT_IMMEDIATE,
+
+	/**
+	 * Constant referring to state that is known by this compiler,
+	 * see RC_STATE_xxx, i.e. *not* arbitrary Mesa (or other) state.
+	 */
+	RC_CONSTANT_STATE
+};
+
+enum { /* RC_STATE_xxx: passed as state0 to rc_constants_add_state(), stored in u.State[0] */
+	RC_STATE_SHADOW_AMBIENT = 0,
+
+	RC_STATE_R300_WINDOW_DIMENSION,
+	RC_STATE_R300_TEXRECT_FACTOR
+};
+
+struct rc_constant {
+	unsigned Type:2; /**< RC_CONSTANT_xxx */
+	unsigned Size:3; /**< components in use; the rc_constants_add_* helpers store 1..4 */
+	/* Payload; IMMEDIATE and STATE entries use the members noted below. */
+	union {
+		unsigned External; /* presumably for RC_CONSTANT_EXTERNAL; its use is not visible here */
+		float Immediate[4]; /**< RC_CONSTANT_IMMEDIATE: component values */
+		unsigned State[2]; /**< RC_CONSTANT_STATE: the (state0, state1) pair */
+	} u;
+};
+
+struct rc_constant_list {
+	struct rc_constant * Constants; /**< heap array, released by rc_constants_destroy() */
+	unsigned Count; /**< number of entries in use */
+	/* Allocated capacity of Constants in entries; rc_constants_add() grows it when Count reaches it. */
+	unsigned _Reserved;
+};
+
+void rc_constants_init(struct rc_constant_list * c); /* make c an empty list */
+void rc_constants_copy(struct rc_constant_list * dst, struct rc_constant_list * src); /* dst need not be initialized */
+void rc_constants_destroy(struct rc_constant_list * c); /* frees the array, not c */
+unsigned rc_constants_add(struct rc_constant_list * c, struct rc_constant * constant); /* returns the new index */
+unsigned rc_constants_add_state(struct rc_constant_list * c, unsigned state0, unsigned state1); /* find-or-add */
+unsigned rc_constants_add_immediate_vec4(struct rc_constant_list * c, const float * data); /* find-or-add, reads data[0..3] */
+unsigned rc_constants_add_immediate_scalar(struct rc_constant_list * c, float data, unsigned * swizzle); /* swizzle is an output */
+
+/**
+ * Stores state that influences the compilation of a fragment program.
+ */
+struct r300_fragment_program_external_state {
+	struct {
+		/**
+		 * If the sampler is used as a shadow sampler,
+		 * this field is:
+		 *  0 - GL_LUMINANCE
+		 *  1 - GL_INTENSITY
+		 *  2 - GL_ALPHA
+		 * depending on the depth texture mode.
+		 */
+		unsigned depth_texture_mode : 2;
+
+		/**
+		 * If the sampler is used as a shadow sampler,
+		 * this field is (texture_compare_func - GL_NEVER).
+		 * [e.g. if compare function is GL_LEQUAL, this field is 3]
+		 *
+		 * Otherwise, this field is 0.
+		 */
+		unsigned texture_compare_func : 3;
+	} unit[16];
+};
+
+
+
+struct r300_fragment_program_node {
+	int tex_offset; /**< first tex instruction */
+	int tex_end; /**< last tex instruction, relative to tex_offset */
+	int alu_offset; /**< first ALU instruction */
+	int alu_end; /**< last ALU instruction, relative to alu_offset */
+	int flags;
+};
+
+/**
+ * Stores an R300 fragment program in its compiled-to-hardware form.
+ */
+struct r300_fragment_program_code {
+	struct {
+		int length; /**< total # of texture instructions used */
+		uint32_t inst[R300_PFS_MAX_TEX_INST];
+	} tex;
+
+	struct {
+		int length; /**< total # of ALU instructions used */
+		struct {
+			uint32_t rgb_inst;
+			uint32_t rgb_addr;
+			uint32_t alpha_inst;
+			uint32_t alpha_addr;
+		} inst[R300_PFS_MAX_ALU_INST];
+	} alu;
+
+	uint32_t config; /* US_CONFIG */
+	uint32_t pixsize; /* US_PIXSIZE */
+	uint32_t code_offset; /* US_CODE_OFFSET */
+	uint32_t code_addr[4]; /* US_CODE_ADDR */
+};
+
+
+struct r500_fragment_program_code {
+	struct {
+		uint32_t inst0;
+		uint32_t inst1;
+		uint32_t inst2;
+		uint32_t inst3;
+		uint32_t inst4;
+		uint32_t inst5;
+	} inst[R500_PFS_MAX_INST]; /* six 32-bit words per instruction slot */
+	/* r500BuildFragmentProgramHwCode() starts inst_end at -1; the emitters pre-increment it. */
+	int inst_end; /* Number of instructions - 1; also, last instruction to be executed */
+
+	int max_temp_idx; /* set to 1 by r500BuildFragmentProgramHwCode() before emission */
+};
+
+struct rX00_fragment_program_code {
+	union {
+		struct r300_fragment_program_code r300;
+		struct r500_fragment_program_code r500; /* written by the r500 emitter */
+	} code;
+	/* Set by the r500 emit_paired() when an instruction carries a depth write mask. */
+	unsigned writes_depth:1;
+	/* NOTE(review): presumably the constants referenced by the emitted code - confirm with callers. */
+	struct rc_constant_list constants;
+};
+
+
+#define VSF_MAX_FRAGMENT_LENGTH (255*4)
+#define VSF_MAX_FRAGMENT_TEMPS (14)
+
+#define VSF_MAX_INPUTS 32
+#define VSF_MAX_OUTPUTS 32
+
+struct r300_vertex_program_code {
+	int length;
+	union {
+		uint32_t d[VSF_MAX_FRAGMENT_LENGTH];
+		float f[VSF_MAX_FRAGMENT_LENGTH];
+	} body;
+
+	int pos_end;
+	int num_temporaries;	/* Number of temp vars used by program */
+	int inputs[VSF_MAX_INPUTS];
+	int outputs[VSF_MAX_OUTPUTS];
+
+	struct rc_constant_list constants;
+
+	uint32_t InputsRead;
+	uint32_t OutputsWritten;
+};
+
+void r300_vertex_program_dump(struct r300_vertex_program_code * vs);
+
+#endif /* RADEON_CODE_H */
+
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
new file mode 100644
index 0000000000..da950d5289
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
@@ -0,0 +1,262 @@
+/*
+ * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#include "radeon_compiler.h"
+
+#include <stdarg.h>
+
+#include "radeon_program.h"
+
+/* Reset *c and make the instruction list empty: the sentinel node links to itself. */
+void rc_init(struct radeon_compiler * c)
+{
+	struct rc_instruction * sentinel = &c->Program.Instructions;
+
+	memset(c, 0, sizeof(*c));
+	memory_pool_init(&c->Pool);
+	sentinel->Next = sentinel->Prev = sentinel;
+	sentinel->I.Opcode = OPCODE_END;
+}
+
+void rc_destroy(struct radeon_compiler * c)
+{
+	rc_constants_destroy(&c->Program.Constants);
+	memory_pool_destroy(&c->Pool);
+	free(c->ErrorMsg); /* heap string stored by rc_error(); c itself is not freed here */
+}
+
+void rc_debug(struct radeon_compiler * c, const char * fmt, ...)
+{
+	/* printf-style trace to stderr, emitted only when c->Debug is set */
+	if (c->Debug) {
+		va_list args;
+
+		va_start(args, fmt);
+		vfprintf(stderr, fmt, args);
+		va_end(args);
+	}
+}
+
+/**
+ * Mark the compilation as failed and record a printf-style message.
+ * Only the first message is kept in c->ErrorMsg (heap string, freed by
+ * rc_destroy); every message is echoed to stderr when c->Debug is set.
+ */
+void rc_error(struct radeon_compiler * c, const char * fmt, ...)
+{
+	va_list ap;
+
+	c->Error = GL_TRUE;
+
+	if (!c->ErrorMsg) {
+		char buf[1024];
+		int written;
+
+		va_start(ap, fmt);
+		written = vsnprintf(buf, sizeof(buf), fmt, ap);
+		va_end(ap);
+		if (written >= 0 && (size_t)written < sizeof(buf)) {
+			c->ErrorMsg = strdup(buf);
+		} else if (written >= 0 && (c->ErrorMsg = malloc((size_t)written + 1))) {
+			/* Truncated: format again into a buffer of the exact size. */
+			va_start(ap, fmt);
+			vsnprintf(c->ErrorMsg, (size_t)written + 1, fmt, ap);
+			va_end(ap);
+		} /* else: vsnprintf failed or out of memory; only Error is set */
+	}
+	if (c->Debug) {
+		fprintf(stderr, "r300compiler error: ");
+		va_start(ap, fmt);
+		vfprintf(stderr, fmt, ap);
+		va_end(ap);
+	}
+}
+
+/**
+ * Rewrite the program so that every source operand reading the given input
+ * register reads new_input instead; swizzle, negate and abs are composed.
+ */
+void rc_move_input(struct radeon_compiler * c, unsigned input, struct prog_src_register new_input)
+{
+	struct rc_instruction * inst;
+
+	c->Program.InputsRead &= ~(1u << input); /* 1u: index 31 overflows a signed shift */
+	for(inst = c->Program.Instructions.Next; inst != &c->Program.Instructions; inst = inst->Next) {
+		const unsigned numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
+		unsigned i;
+		for(i = 0; i < numsrcs; ++i) {
+			struct prog_src_register * src = &inst->I.SrcReg[i];
+			if (src->File != PROGRAM_INPUT || src->Index != input)
+				continue;
+			src->File = new_input.File;
+			src->Index = new_input.Index;
+			src->Swizzle = combine_swizzles(new_input.Swizzle, src->Swizzle);
+			if (!src->Abs) {
+				/* With Abs already set, new_input's sign cannot matter. */
+				src->Negate ^= new_input.Negate;
+				src->Abs = new_input.Abs;
+			}
+			/* Marked unconditionally, whatever file new_input lives in. */
+			c->Program.InputsRead |= 1u << new_input.Index;
+		}
+	}
+}
+
+
+/**
+ * Rewrite the program so that every write to the given output register
+ * goes to new_output instead, restricted to the components in writemask.
+ */
+void rc_move_output(struct radeon_compiler * c, unsigned output, unsigned new_output, unsigned writemask)
+{
+	struct rc_instruction * inst;
+
+	c->Program.OutputsWritten &= ~(1u << output);
+
+	for(inst = c->Program.Instructions.Next; inst != &c->Program.Instructions; inst = inst->Next) {
+		struct prog_dst_register * dst = &inst->I.DstReg;
+
+		/* Skip opcodes without a destination and writes to other registers. */
+		if (!_mesa_num_inst_dst_regs(inst->I.Opcode) ||
+		    dst->File != PROGRAM_OUTPUT || dst->Index != output)
+			continue;
+
+		dst->Index = new_output;
+		dst->WriteMask &= writemask;
+		/* unsigned shift: output 31 would overflow a signed 1 << n */
+		c->Program.OutputsWritten |= 1u << new_output;
+	}
+}
+
+
+/**
+ * Rewrite the program such that a given output is duplicated: every write
+ * to output is redirected to a free temporary, and two MOVs added through
+ * rc_insert_new_instruction() at the list tail copy it to both outputs.
+ */
+void rc_copy_output(struct radeon_compiler * c, unsigned output, unsigned dup_output)
+{
+	unsigned tempreg = rc_find_free_temporary(c);
+	const unsigned targets[2] = { output, dup_output };
+	struct rc_instruction * inst;
+	unsigned i;
+
+	for(inst = c->Program.Instructions.Next; inst != &c->Program.Instructions; inst = inst->Next) {
+		struct prog_dst_register * dst = &inst->I.DstReg;
+
+		/* Only opcodes that have a destination register are candidates. */
+		if (!_mesa_num_inst_dst_regs(inst->I.Opcode))
+			continue;
+		if (dst->File == PROGRAM_OUTPUT && dst->Index == output) {
+			dst->File = PROGRAM_TEMPORARY;
+			dst->Index = tempreg;
+		}
+	}
+
+	/* Same MOV twice, differing only in the destination output. */
+	for(i = 0; i < 2; ++i) {
+		inst = rc_insert_new_instruction(c, c->Program.Instructions.Prev);
+		inst->I.Opcode = OPCODE_MOV;
+		inst->I.DstReg.File = PROGRAM_OUTPUT;
+		inst->I.DstReg.Index = targets[i];
+
+		inst->I.SrcReg[0].File = PROGRAM_TEMPORARY;
+		inst->I.SrcReg[0].Index = tempreg;
+		inst->I.SrcReg[0].Swizzle = SWIZZLE_XYZW;
+	}
+
+	/* 1u keeps the shift defined when dup_output is 31. */
+	c->Program.OutputsWritten |= 1u << dup_output;
+}
+
+
+/**
+ * Insert RCP, MUL and MAD at the start of the program to compute a window
+ * position from new_input into a temporary; later reads of wpos use it. */
+void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsigned new_input)
+{
+	unsigned tempregi = rc_find_free_temporary(c);
+
+	c->Program.InputsRead &= ~(1u << wpos); /* 1u: keeps index 31 well defined */
+	c->Program.InputsRead |= 1u << new_input;
+
+	/* perspective divide */
+	struct rc_instruction * inst_rcp = rc_insert_new_instruction(c, &c->Program.Instructions);
+	inst_rcp->I.Opcode = OPCODE_RCP;
+	/* tmp.w = 1 / new_input.w */
+	inst_rcp->I.DstReg.File = PROGRAM_TEMPORARY;
+	inst_rcp->I.DstReg.Index = tempregi;
+	inst_rcp->I.DstReg.WriteMask = WRITEMASK_W;
+
+	inst_rcp->I.SrcReg[0].File = PROGRAM_INPUT;
+	inst_rcp->I.SrcReg[0].Index = new_input;
+	inst_rcp->I.SrcReg[0].Swizzle = SWIZZLE_WWWW;
+
+	struct rc_instruction * inst_mul = rc_insert_new_instruction(c, inst_rcp);
+	inst_mul->I.Opcode = OPCODE_MUL;
+	/* tmp.xyz = new_input * tmp.w; SrcReg[0].Swizzle stays as rc_insert_new_instruction() left it */
+	inst_mul->I.DstReg.File = PROGRAM_TEMPORARY;
+	inst_mul->I.DstReg.Index = tempregi;
+	inst_mul->I.DstReg.WriteMask = WRITEMASK_XYZ;
+
+	inst_mul->I.SrcReg[0].File = PROGRAM_INPUT;
+	inst_mul->I.SrcReg[0].Index = new_input;
+
+	inst_mul->I.SrcReg[1].File = PROGRAM_TEMPORARY;
+	inst_mul->I.SrcReg[1].Index = tempregi;
+	inst_mul->I.SrcReg[1].Swizzle = SWIZZLE_WWWW;
+
+	/* viewport transformation */
+	struct rc_instruction * inst_mad = rc_insert_new_instruction(c, inst_mul);
+	inst_mad->I.Opcode = OPCODE_MAD;
+	/* tmp.xyz = tmp.xyz * dim + dim, where dim is the RC_STATE_R300_WINDOW_DIMENSION constant */
+	inst_mad->I.DstReg.File = PROGRAM_TEMPORARY;
+	inst_mad->I.DstReg.Index = tempregi;
+	inst_mad->I.DstReg.WriteMask = WRITEMASK_XYZ;
+
+	inst_mad->I.SrcReg[0].File = PROGRAM_TEMPORARY;
+	inst_mad->I.SrcReg[0].Index = tempregi;
+	inst_mad->I.SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+	inst_mad->I.SrcReg[1].File = PROGRAM_STATE_VAR;
+	inst_mad->I.SrcReg[1].Index = rc_constants_add_state(&c->Program.Constants, RC_STATE_R300_WINDOW_DIMENSION, 0);
+	inst_mad->I.SrcReg[1].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+	inst_mad->I.SrcReg[2].File = PROGRAM_STATE_VAR;
+	inst_mad->I.SrcReg[2].Index = inst_mad->I.SrcReg[1].Index;
+	inst_mad->I.SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+	/* Only instructions after the MAD are patched to read the temporary. */
+	struct rc_instruction * inst;
+	for (inst = inst_mad->Next; inst != &c->Program.Instructions; inst = inst->Next) {
+		const unsigned numsrcs = _mesa_num_inst_src_regs(inst->I.Opcode);
+		unsigned i;
+
+		for(i = 0; i < numsrcs; i++) {
+			if (inst->I.SrcReg[i].File == PROGRAM_INPUT &&
+			    inst->I.SrcReg[i].Index == wpos) {
+				inst->I.SrcReg[i].File = PROGRAM_TEMPORARY;
+				inst->I.SrcReg[i].Index = tempregi;
+			}
+		}
+	}
+}
+
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
new file mode 100644
index 0000000000..e63ab8840a
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+#ifndef RADEON_COMPILER_H
+#define RADEON_COMPILER_H
+
+#include "main/mtypes.h"
+#include "shader/prog_instruction.h"
+
+#include "memory_pool.h"
+#include "radeon_code.h"
+
+
+struct rc_instruction {
+	struct rc_instruction * Prev; /* list links; traversals stop when they reach */
+	struct rc_instruction * Next; /* the rc_program.Instructions sentinel node */
+	struct prog_instruction I; /* the Mesa instruction carried by this node */
+};
+
+struct rc_program {
+	/**
+	 * Instructions.Next points to the first instruction,
+	 * Instructions.Prev points to the last instruction.
+	 */
+	struct rc_instruction Instructions; /* sentinel: rc_init() links it to itself, Opcode = OPCODE_END */
+
+	/* Long term, we should probably remove InputsRead & OutputsWritten,
+	 * since updating dependent state can be fragile, and they aren't
+	 * actually used very often. */
+	uint32_t InputsRead; /* bit n set: input n is read (maintained by rc_move_input etc.) */
+	uint32_t OutputsWritten; /* bit n set: output n is written */
+	uint32_t ShadowSamplers; /**< Texture units used for shadow sampling. */
+
+	struct rc_constant_list Constants; /* released by rc_destroy() */
+};
+
+struct radeon_compiler {
+	struct memory_pool Pool; /* set up by rc_init(), torn down by rc_destroy() */
+	struct rc_program Program;
+	unsigned Debug:1; /* when set, rc_debug() and rc_error() print to stderr */
+	unsigned Error:1; /* set by rc_error(); never cleared in this file */
+	char * ErrorMsg; /* first error message (heap string) or NULL; freed by rc_destroy() */
+};
+
+void rc_init(struct radeon_compiler * c);
+void rc_destroy(struct radeon_compiler * c);
+
+void rc_debug(struct radeon_compiler * c, const char * fmt, ...);
+void rc_error(struct radeon_compiler * c, const char * fmt, ...);
+
+void rc_mesa_to_rc_program(struct radeon_compiler * c, struct gl_program * program);
+
+void rc_calculate_inputs_outputs(struct radeon_compiler * c);
+
+void rc_move_input(struct radeon_compiler * c, unsigned input, struct prog_src_register new_input);
+void rc_move_output(struct radeon_compiler * c, unsigned output, unsigned new_output, unsigned writemask);
+void rc_copy_output(struct radeon_compiler * c, unsigned output, unsigned dup_output);
+void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsigned new_input);
+
+struct r300_fragment_program_compiler {
+	struct radeon_compiler Base; /* shared compiler state; &Base is what rc_error() takes */
+	struct rX00_fragment_program_code *code; /* output; the r500 emitter fills code->code.r500 */
+	struct r300_fragment_program_external_state state;
+	unsigned is_r500;
+	unsigned OutputDepth;
+	unsigned OutputColor;
+	/* NOTE(review): UserData/AllocateHwInputs look like a caller-supplied hook; contract not visible here. */
+	void * UserData;
+	void (*AllocateHwInputs)(
+		struct r300_fragment_program_compiler * c,
+		void (*allocate)(void * data, unsigned input, unsigned hwreg),
+		void * mydata);
+};
+
+void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c);
+
+
+struct r300_vertex_program_compiler {
+	struct radeon_compiler Base;
+	struct r300_vertex_program_code *code;
+	GLbitfield RequiredOutputs;
+
+	void * UserData;
+	void (*SetHwInputOutput)(struct r300_vertex_program_compiler * c);
+};
+
+void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* c);
+
+#endif /* RADEON_COMPILER_H */
diff --git a/src/mesa/drivers/dri/r300/radeon_nqssadce.c b/src/mesa/drivers/dri/r300/compiler/radeon_nqssadce.c
index 97ce016c99..aaaa50ad1f 100644
--- a/src/mesa/drivers/dri/r300/radeon_nqssadce.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_nqssadce.c
@@ -36,6 +36,8 @@
 
 #include "radeon_nqssadce.h"
 
+#include "radeon_compiler.h"
+
 
 /**
  * Return the @ref register_state for the given register (or 0 for untracked
@@ -46,6 +48,7 @@ static struct register_state *get_reg_state(struct nqssadce_state* s, GLuint fil
 	switch(file) {
 	case PROGRAM_TEMPORARY: return &s->Temps[index];
 	case PROGRAM_OUTPUT: return &s->Outputs[index];
+	case PROGRAM_ADDRESS: return &s->Address;
 	default: return 0;
 	}
 }
@@ -56,17 +59,17 @@ static struct register_state *get_reg_state(struct nqssadce_state* s, GLuint fil
  *
  * @note Works correctly only for X, Y, Z, W swizzles, not for constant swizzles.
  */
-static struct prog_src_register lmul_swizzle(GLuint swizzle, struct prog_src_register srcreg)
+struct prog_src_register lmul_swizzle(GLuint swizzle, struct prog_src_register srcreg)
 {
 	struct prog_src_register tmp = srcreg;
 	int i;
 	tmp.Swizzle = 0;
-	tmp.NegateBase = 0;
+	tmp.Negate = NEGATE_NONE;
 	for(i = 0; i < 4; ++i) {
 		GLuint swz = GET_SWZ(swizzle, i);
 		if (swz < 4) {
 			tmp.Swizzle |= GET_SWZ(srcreg.Swizzle, swz) << (i*3);
-			tmp.NegateBase |= GET_BIT(srcreg.NegateBase, swz) << i;
+			tmp.Negate |= GET_BIT(srcreg.Negate, swz) << i;
 		} else {
 			tmp.Swizzle |= swz << (i*3);
 		}
@@ -75,9 +78,10 @@ static struct prog_src_register lmul_swizzle(GLuint swizzle, struct prog_src_reg
 }
 
 
-static struct prog_instruction* track_used_srcreg(struct nqssadce_state* s,
-	struct prog_instruction *inst, GLint src, GLuint sourced)
+static void track_used_srcreg(struct nqssadce_state* s,
+	GLint src, GLuint sourced)
 {
+	struct prog_instruction * inst = &s->IP->I;
 	int i;
 	GLuint deswz_source = 0;
 
@@ -94,18 +98,16 @@ static struct prog_instruction* track_used_srcreg(struct nqssadce_state* s,
 	if (!s->Descr->IsNativeSwizzle(inst->Opcode, inst->SrcReg[src])) {
 		struct prog_dst_register dstreg = inst->DstReg;
 		dstreg.File = PROGRAM_TEMPORARY;
-		dstreg.Index = _mesa_find_free_register(s->Program, PROGRAM_TEMPORARY);
+		dstreg.Index = rc_find_free_temporary(s->Compiler);
 		dstreg.WriteMask = sourced;
 
 		s->Descr->BuildSwizzle(s, dstreg, inst->SrcReg[src]);
 
-		inst = s->Program->Instructions + s->IP;
 		inst->SrcReg[src].File = PROGRAM_TEMPORARY;
 		inst->SrcReg[src].Index = dstreg.Index;
 		inst->SrcReg[src].Swizzle = 0;
-		inst->SrcReg[src].NegateBase = 0;
+		inst->SrcReg[src].Negate = NEGATE_NONE;
 		inst->SrcReg[src].Abs = 0;
-		inst->SrcReg[src].NegateAbs = 0;
 		for(i = 0; i < 4; ++i) {
 			if (GET_BIT(sourced, i))
 				inst->SrcReg[src].Swizzle |= i << (3*i);
@@ -115,67 +117,38 @@ static struct prog_instruction* track_used_srcreg(struct nqssadce_state* s,
 		deswz_source = sourced;
 	}
 
-	struct register_state *regstate = get_reg_state(s, inst->SrcReg[src].File, inst->SrcReg[src].Index);
-	if (regstate)
-		regstate->Sourced |= deswz_source & 0xf;
-
-	return inst;
-}
+	struct register_state *regstate;
 
-
-static void rewrite_depth_out(struct prog_instruction *inst)
-{
-	if (inst->DstReg.WriteMask & WRITEMASK_Z) {
-		inst->DstReg.WriteMask = WRITEMASK_W;
+	if (inst->SrcReg[src].RelAddr) {
+		regstate = get_reg_state(s, PROGRAM_ADDRESS, 0);
+		if (regstate)
+			regstate->Sourced |= WRITEMASK_X;
 	} else {
-		inst->DstReg.WriteMask = 0;
-		return;
-	}
-
-	switch (inst->Opcode) {
-	case OPCODE_FRC:
-	case OPCODE_MOV:
-		inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
-		break;
-	case OPCODE_ADD:
-	case OPCODE_MAX:
-	case OPCODE_MIN:
-	case OPCODE_MUL:
-		inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
-		inst->SrcReg[1] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[1]);
-		break;
-	case OPCODE_CMP:
-	case OPCODE_MAD:
-		inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
-		inst->SrcReg[1] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[1]);
-		inst->SrcReg[2] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[2]);
-		break;
-	default:
-		// Scalar instructions needn't be reswizzled
-		break;
+		regstate = get_reg_state(s, inst->SrcReg[src].File, inst->SrcReg[src].Index);
+		if (regstate)
+			regstate->Sourced |= deswz_source & 0xf;
 	}
 }
 
-static void unalias_srcregs(struct prog_instruction *inst, GLuint oldindex, GLuint newindex)
+static void unalias_srcregs(struct rc_instruction *inst, GLuint oldindex, GLuint newindex)
 {
-	int nsrc = _mesa_num_inst_src_regs(inst->Opcode);
+	int nsrc = _mesa_num_inst_src_regs(inst->I.Opcode);
 	int i;
 	for(i = 0; i < nsrc; ++i)
-		if (inst->SrcReg[i].File == PROGRAM_TEMPORARY && inst->SrcReg[i].Index == oldindex)
-			inst->SrcReg[i].Index = newindex;
+		if (inst->I.SrcReg[i].File == PROGRAM_TEMPORARY && inst->I.SrcReg[i].Index == oldindex)
+			inst->I.SrcReg[i].Index = newindex;
 }
 
 static void unalias_temporary(struct nqssadce_state* s, GLuint oldindex)
 {
-	GLuint newindex = _mesa_find_free_register(s->Program, PROGRAM_TEMPORARY);
-	int ip;
-	for(ip = 0; ip < s->IP; ++ip) {
-		struct prog_instruction* inst = s->Program->Instructions + ip;
-		if (inst->DstReg.File == PROGRAM_TEMPORARY && inst->DstReg.Index == oldindex)
-			inst->DstReg.Index = newindex;
+	GLuint newindex = rc_find_free_temporary(s->Compiler);
+	struct rc_instruction * inst;
+	for(inst = s->Compiler->Program.Instructions.Next; inst != s->IP; inst = inst->Next) {
+		if (inst->I.DstReg.File == PROGRAM_TEMPORARY && inst->I.DstReg.Index == oldindex)
+			inst->I.DstReg.Index = newindex;
 		unalias_srcregs(inst, oldindex, newindex);
 	}
-	unalias_srcregs(s->Program->Instructions + s->IP, oldindex, newindex);
+	unalias_srcregs(s->IP, oldindex, newindex);
 }
 
 
@@ -184,20 +157,16 @@ static void unalias_temporary(struct nqssadce_state* s, GLuint oldindex)
  */
 static void process_instruction(struct nqssadce_state* s)
 {
-	struct prog_instruction *inst = s->Program->Instructions + s->IP;
+	struct prog_instruction *inst = &s->IP->I;
+	GLuint WriteMask;
 
 	if (inst->Opcode == OPCODE_END)
 		return;
 
 	if (inst->Opcode != OPCODE_KIL) {
-		if (s->Descr->RewriteDepthOut) {
-			if (inst->DstReg.File == PROGRAM_OUTPUT && inst->DstReg.Index == FRAG_RESULT_DEPR)
-				rewrite_depth_out(inst);
-		}
-
 		struct register_state *regstate = get_reg_state(s, inst->DstReg.File, inst->DstReg.Index);
 		if (!regstate) {
-			_mesa_problem(s->Ctx, "NqssaDce: bad destination register (%i[%i])\n",
+			rc_error(s->Compiler, "NqssaDce: bad destination register (%i[%i])\n",
 				inst->DstReg.File, inst->DstReg.Index);
 			return;
 		}
@@ -206,7 +175,9 @@ static void process_instruction(struct nqssadce_state* s)
 		regstate->Sourced &= ~inst->DstReg.WriteMask;
 
 		if (inst->DstReg.WriteMask == 0) {
-			_mesa_delete_instructions(s->Program, s->IP, 1);
+			struct rc_instruction * inst_remove = s->IP;
+			s->IP = s->IP->Prev;
+			rc_remove_instruction(inst_remove);
 			return;
 		}
 
@@ -214,28 +185,30 @@ static void process_instruction(struct nqssadce_state* s)
 			unalias_temporary(s, inst->DstReg.Index);
 	}
 
-	/* Attention: Due to swizzle emulation code, the following
-	 * might change the instruction stream under us, so we have
-	 * to be careful with the inst pointer. */
+	WriteMask = inst->DstReg.WriteMask;
+
 	switch (inst->Opcode) {
+	case OPCODE_ARL:
 	case OPCODE_DDX:
 	case OPCODE_DDY:
 	case OPCODE_FRC:
 	case OPCODE_MOV:
-		inst = track_used_srcreg(s, inst, 0, inst->DstReg.WriteMask);
+		track_used_srcreg(s, 0, WriteMask);
 		break;
 	case OPCODE_ADD:
 	case OPCODE_MAX:
 	case OPCODE_MIN:
 	case OPCODE_MUL:
-		inst = track_used_srcreg(s, inst, 0, inst->DstReg.WriteMask);
-		inst = track_used_srcreg(s, inst, 1, inst->DstReg.WriteMask);
+	case OPCODE_SGE:
+	case OPCODE_SLT:
+		track_used_srcreg(s, 0, WriteMask);
+		track_used_srcreg(s, 1, WriteMask);
 		break;
 	case OPCODE_CMP:
 	case OPCODE_MAD:
-		inst = track_used_srcreg(s, inst, 0, inst->DstReg.WriteMask);
-		inst = track_used_srcreg(s, inst, 1, inst->DstReg.WriteMask);
-		inst = track_used_srcreg(s, inst, 2, inst->DstReg.WriteMask);
+		track_used_srcreg(s, 0, WriteMask);
+		track_used_srcreg(s, 1, WriteMask);
+		track_used_srcreg(s, 2, WriteMask);
 		break;
 	case OPCODE_COS:
 	case OPCODE_EX2:
@@ -243,42 +216,79 @@ static void process_instruction(struct nqssadce_state* s)
 	case OPCODE_RCP:
 	case OPCODE_RSQ:
 	case OPCODE_SIN:
-		inst = track_used_srcreg(s, inst, 0, 0x1);
+		track_used_srcreg(s, 0, 0x1);
 		break;
 	case OPCODE_DP3:
-		inst = track_used_srcreg(s, inst, 0, 0x7);
-		inst = track_used_srcreg(s, inst, 1, 0x7);
+		track_used_srcreg(s, 0, 0x7);
+		track_used_srcreg(s, 1, 0x7);
 		break;
 	case OPCODE_DP4:
-		inst = track_used_srcreg(s, inst, 0, 0xf);
-		inst = track_used_srcreg(s, inst, 1, 0xf);
+		track_used_srcreg(s, 0, 0xf);
+		track_used_srcreg(s, 1, 0xf);
 		break;
 	case OPCODE_KIL:
 	case OPCODE_TEX:
 	case OPCODE_TXB:
 	case OPCODE_TXP:
-		inst = track_used_srcreg(s, inst, 0, 0xf);
+		track_used_srcreg(s, 0, 0xf);
+		break;
+	case OPCODE_DST:
+		track_used_srcreg(s, 0, 0x6);
+		track_used_srcreg(s, 1, 0xa);
+		break;
+	case OPCODE_EXP:
+	case OPCODE_LOG:
+	case OPCODE_POW:
+		track_used_srcreg(s, 0, 0x3);
+		break;
+	case OPCODE_LIT:
+		track_used_srcreg(s, 0, 0xb);
 		break;
 	default:
-		_mesa_problem(s->Ctx, "NqssaDce: Unknown opcode %d\n", inst->Opcode);
+		rc_error(s->Compiler, "NqssaDce: Unknown opcode %d\n", inst->Opcode);
 		return;
 	}
+
+	s->IP = s->IP->Prev;
 }
 
+void rc_calculate_inputs_outputs(struct radeon_compiler * c)
+{
+	struct rc_instruction *inst;
+
+	c->Program.InputsRead = 0;
+	c->Program.OutputsWritten = 0;
+
+	for(inst = c->Program.Instructions.Next; inst != &c->Program.Instructions; inst = inst->Next)
+	{
+		int i;
+		int num_src_regs = _mesa_num_inst_src_regs(inst->I.Opcode);
+
+		for (i = 0; i < num_src_regs; ++i) {
+			if (inst->I.SrcReg[i].File == PROGRAM_INPUT)
+				c->Program.InputsRead |= 1 << inst->I.SrcReg[i].Index;
+		}
+
+		if (_mesa_num_inst_dst_regs(inst->I.Opcode)) {
+			if (inst->I.DstReg.File == PROGRAM_OUTPUT)
+				c->Program.OutputsWritten |= 1 << inst->I.DstReg.Index;
+		}
+	}
+}
 
-void radeonNqssaDce(GLcontext *ctx, struct gl_program *p, struct radeon_nqssadce_descr* descr)
+void radeonNqssaDce(struct radeon_compiler * c, struct radeon_nqssadce_descr* descr, void * data)
 {
 	struct nqssadce_state s;
 
 	_mesa_bzero(&s, sizeof(s));
-	s.Ctx = ctx;
-	s.Program = p;
+	s.Compiler = c;
 	s.Descr = descr;
+	s.UserData = data;
 	s.Descr->Init(&s);
-	s.IP = p->NumInstructions;
+	s.IP = c->Program.Instructions.Prev;
 
-	while(s.IP > 0) {
-		s.IP--;
+	while(s.IP != &c->Program.Instructions && !c->Error)
 		process_instruction(&s);
-	}
+
+	rc_calculate_inputs_outputs(c);
 }
diff --git a/src/mesa/drivers/dri/r300/radeon_nqssadce.h b/src/mesa/drivers/dri/r300/compiler/radeon_nqssadce.h
index a4f94abcb6..b3fc77a35a 100644
--- a/src/mesa/drivers/dri/r300/radeon_nqssadce.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_nqssadce.h
@@ -30,7 +30,6 @@
 
 #include "radeon_program.h"
 
-
 struct register_state {
 	/**
 	 * Bitmask indicating which components of the register are sourced
@@ -44,20 +43,22 @@ struct register_state {
  * read from, etc.
  */
 struct nqssadce_state {
-	GLcontext *Ctx;
-	struct gl_program *Program;
+	struct radeon_compiler *Compiler;
 	struct radeon_nqssadce_descr *Descr;
 
 	/**
 	 * All instructions after this instruction pointer have been dealt with.
 	 */
-	int IP;
+	struct rc_instruction * IP;
 
 	/**
 	 * Which registers are read by subsequent instructions?
 	 */
 	struct register_state Temps[MAX_PROGRAM_TEMPS];
 	struct register_state Outputs[VERT_RESULT_MAX];
+	struct register_state Address;
+
+	void * UserData;
 };
 
 
@@ -82,15 +83,9 @@ struct radeon_nqssadce_descr {
 	 * The transformation will work recursively on the emitted instruction(s).
 	 */
 	void (*BuildSwizzle)(struct nqssadce_state*, struct prog_dst_register dst, struct prog_src_register src);
-
-	/**
-	 * Rewrite instructions that write to DEPR.z to write to DEPR.w
-	 * instead (rewriting is done *before* the WriteMask test).
-	 */
-	GLboolean RewriteDepthOut;
-	void *Data;
 };
 
-void radeonNqssaDce(GLcontext *ctx, struct gl_program *p, struct radeon_nqssadce_descr* descr);
+void radeonNqssaDce(struct radeon_compiler * c, struct radeon_nqssadce_descr* descr, void * data);
+struct prog_src_register lmul_swizzle(GLuint swizzle, struct prog_src_register srcreg);
 
 #endif /* __RADEON_PROGRAM_NQSSADCE_H_ */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program.c b/src/mesa/drivers/dri/r300/compiler/radeon_program.c
new file mode 100644
index 0000000000..bbbf0dd776
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program.c
@@ -0,0 +1,187 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_program.h"
+
+#include "radeon_compiler.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+
+
+/**
+ * Transform the given program in the following way:
+ *  1. Walk the compiler's instruction list in place.
+ *  2. For every instruction in the original program, try the given
+ *     transformations in order.
+ *  3. If one of the transformations returns GL_TRUE, assume that it
+ *     has rewritten or replaced the instruction as needed and stop;
+ *     otherwise, the instruction is left unchanged.
+ *
+ * \note The transformation is currently not recursive; in other words,
+ * instructions emitted by transformations are not transformed.
+ *
+ * \note The transform is called 'local' because it can only look at
+ * one instruction at a time.
+ */
+void radeonLocalTransform(
+	struct radeon_compiler * c,
+	int num_transformations,
+	struct radeon_program_transformation* transformations)
+{
+	struct rc_instruction * inst = c->Program.Instructions.Next;
+
+	while(inst != &c->Program.Instructions) {
+		struct rc_instruction * current = inst;
+		int i;
+
+		inst = inst->Next;
+
+		for(i = 0; i < num_transformations; ++i) {
+			struct radeon_program_transformation* t = transformations + i;
+
+			if (t->function(c, current, t->userData))
+				break;
+		}
+	}
+}
+
+
+GLint rc_find_free_temporary(struct radeon_compiler * c)
+{
+	GLboolean used[MAX_PROGRAM_TEMPS];
+	GLuint i;
+
+	memset(used, 0, sizeof(used));
+
+	for (struct rc_instruction * rcinst = c->Program.Instructions.Next; rcinst != &c->Program.Instructions; rcinst = rcinst->Next) {
+		const struct prog_instruction *inst = &rcinst->I;
+		const GLuint nsrc = _mesa_num_inst_src_regs(inst->Opcode);
+		const GLuint ndst = _mesa_num_inst_dst_regs(inst->Opcode);
+		GLuint k;
+
+		for (k = 0; k < nsrc; k++) {
+			if (inst->SrcReg[k].File == PROGRAM_TEMPORARY)
+				used[inst->SrcReg[k].Index] = GL_TRUE;
+		}
+
+		if (ndst) {
+			if (inst->DstReg.File == PROGRAM_TEMPORARY)
+				used[inst->DstReg.Index] = GL_TRUE;
+		}
+	}
+
+	for (i = 0; i < MAX_PROGRAM_TEMPS; i++) {
+		if (!used[i])
+			return i;
+	}
+
+	return -1;
+}
+
+
+struct rc_instruction *rc_alloc_instruction(struct radeon_compiler * c)
+{
+	struct rc_instruction * inst = memory_pool_malloc(&c->Pool, sizeof(struct rc_instruction));
+
+	inst->Prev = 0;
+	inst->Next = 0;
+
+	_mesa_init_instructions(&inst->I, 1);
+
+	return inst;
+}
+
+
+struct rc_instruction *rc_insert_new_instruction(struct radeon_compiler * c, struct rc_instruction * after)
+{
+	struct rc_instruction * inst = rc_alloc_instruction(c);
+
+	inst->Prev = after;
+	inst->Next = after->Next;
+
+	inst->Prev->Next = inst;
+	inst->Next->Prev = inst;
+
+	return inst;
+}
+
+void rc_remove_instruction(struct rc_instruction * inst)
+{
+	inst->Prev->Next = inst->Next;
+	inst->Next->Prev = inst->Prev;
+}
+
+
+void rc_mesa_to_rc_program(struct radeon_compiler * c, struct gl_program * program)
+{
+	struct prog_instruction *source;
+	unsigned int i;
+
+	for(source = program->Instructions; source->Opcode != OPCODE_END; ++source) {
+		struct rc_instruction * dest = rc_insert_new_instruction(c, c->Program.Instructions.Prev);
+		dest->I = *source;
+	}
+
+	c->Program.ShadowSamplers = program->ShadowSamplers;
+	c->Program.InputsRead = program->InputsRead;
+	c->Program.OutputsWritten = program->OutputsWritten;
+
+	for(i = 0; i < program->Parameters->NumParameters; ++i) {
+		struct rc_constant constant;
+
+		constant.Type = RC_CONSTANT_EXTERNAL;
+		constant.Size = 4;
+		constant.u.External = i;
+
+		rc_constants_add(&c->Program.Constants, &constant);
+	}
+}
+
+
+/**
+ * Print program to stderr, using the debug print mode (PROG_PRINT_DEBUG).
+ */
+void rc_print_program(const struct rc_program *prog)
+{
+	GLuint indent = 0;
+	GLuint linenum = 1;
+	struct rc_instruction *inst;
+
+	fprintf(stderr, "# Radeon Compiler Program\n");
+
+	for(inst = prog->Instructions.Next; inst != &prog->Instructions; inst = inst->Next) {
+		fprintf(stderr, "%3d: ", linenum);
+
+		/* Massive hack: We rely on the fact that the printers do not actually
+		 * use the gl_program argument (last argument) in debug mode */
+		indent = _mesa_fprint_instruction_opt(
+				stderr, &inst->I,
+				indent, PROG_PRINT_DEBUG, 0);
+
+		linenum++;
+	}
+}
diff --git a/src/mesa/drivers/dri/r300/radeon_program.h b/src/mesa/drivers/dri/r300/compiler/radeon_program.h
index b411f69bc8..561958608c 100644
--- a/src/mesa/drivers/dri/r300/radeon_program.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program.h
@@ -34,12 +34,9 @@
 #include "shader/program.h"
 #include "shader/prog_instruction.h"
 
-
-enum {
-	CLAUSE_MIXED = 0,
-	CLAUSE_ALU,
-	CLAUSE_TEX
-};
+struct radeon_compiler;
+struct rc_instruction;
+struct rc_program;
 
 enum {
 	PROGRAM_BUILTIN = PROGRAM_FILE_MAX /**< not a real register, but a special swizzle constant */
@@ -52,18 +49,43 @@ enum {
 #define SWIZZLE_0000 MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO)
 #define SWIZZLE_1111 MAKE_SWIZZLE4(SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE)
 
-/**
- * Transformation context that is passed to local transformations.
- *
- * Care must be taken with some operations during transformation,
- * e.g. finding new temporary registers must use @ref radeonFindFreeTemporary
- */
-struct radeon_transform_context {
-	GLcontext *Ctx;
-	struct gl_program *Program;
-	struct prog_instruction *OldInstructions;
-	GLuint OldNumInstructions;
-};
+static inline GLuint get_swz(GLuint swz, GLuint idx)
+{
+	if (idx & 0x4)
+		return idx;
+	return GET_SWZ(swz, idx);
+}
+
+static inline GLuint combine_swizzles4(GLuint src, GLuint swz_x, GLuint swz_y, GLuint swz_z, GLuint swz_w)
+{
+	GLuint ret = 0;
+
+	ret |= get_swz(src, swz_x);
+	ret |= get_swz(src, swz_y) << 3;
+	ret |= get_swz(src, swz_z) << 6;
+	ret |= get_swz(src, swz_w) << 9;
+
+	return ret;
+}
+
+static inline GLuint combine_swizzles(GLuint src, GLuint swz)
+{
+	GLuint ret = 0;
+
+	ret |= get_swz(src, GET_SWZ(swz, SWIZZLE_X));
+	ret |= get_swz(src, GET_SWZ(swz, SWIZZLE_Y)) << 3;
+	ret |= get_swz(src, GET_SWZ(swz, SWIZZLE_Z)) << 6;
+	ret |= get_swz(src, GET_SWZ(swz, SWIZZLE_W)) << 9;
+
+	return ret;
+}
+
+static INLINE void reset_srcreg(struct prog_src_register* reg)
+{
+	_mesa_bzero(reg, sizeof(*reg));
+	reg->Swizzle = SWIZZLE_NOOP;
+}
+
 
 /**
  * A transformation that can be passed to \ref radeonLocalTransform.
@@ -77,23 +99,23 @@ struct radeon_transform_context {
  */
 struct radeon_program_transformation {
 	GLboolean (*function)(
-		struct radeon_transform_context*,
-		struct prog_instruction*,
+		struct radeon_compiler*,
+		struct rc_instruction*,
 		void*);
 	void *userData;
 };
 
 void radeonLocalTransform(
-	GLcontext* ctx,
-	struct gl_program *program,
+	struct radeon_compiler *c,
 	int num_transformations,
 	struct radeon_program_transformation* transformations);
 
-/**
- * Find a usable free temporary register during program transformation
- */
-GLint radeonFindFreeTemporary(struct radeon_transform_context *ctx);
+GLint rc_find_free_temporary(struct radeon_compiler * c);
+
+struct rc_instruction *rc_alloc_instruction(struct radeon_compiler * c);
+struct rc_instruction *rc_insert_new_instruction(struct radeon_compiler * c, struct rc_instruction * after);
+void rc_remove_instruction(struct rc_instruction * inst);
 
-struct prog_instruction *radeonAppendInstructions(struct gl_program *program, int count);
+void rc_print_program(const struct rc_program *prog);
 
 #endif
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
new file mode 100644
index 0000000000..8071899eaa
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
@@ -0,0 +1,695 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * @file
+ *
+ * Shareable transformations that transform "special" ALU instructions
+ * into ALU instructions that are supported by hardware.
+ *
+ */
+
+#include "radeon_program_alu.h"
+
+#include "radeon_compiler.h"
+
+
+static struct rc_instruction *emit1(
+	struct radeon_compiler * c, struct rc_instruction * after,
+	gl_inst_opcode Opcode, GLuint Saturate, struct prog_dst_register DstReg,
+	struct prog_src_register SrcReg)
+{
+	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
+
+	fpi->I.Opcode = Opcode;
+	fpi->I.SaturateMode = Saturate;
+	fpi->I.DstReg = DstReg;
+	fpi->I.SrcReg[0] = SrcReg;
+	return fpi;
+}
+
+static struct rc_instruction *emit2(
+	struct radeon_compiler * c, struct rc_instruction * after,
+	gl_inst_opcode Opcode, GLuint Saturate, struct prog_dst_register DstReg,
+	struct prog_src_register SrcReg0, struct prog_src_register SrcReg1)
+{
+	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
+
+	fpi->I.Opcode = Opcode;
+	fpi->I.SaturateMode = Saturate;
+	fpi->I.DstReg = DstReg;
+	fpi->I.SrcReg[0] = SrcReg0;
+	fpi->I.SrcReg[1] = SrcReg1;
+	return fpi;
+}
+
+static struct rc_instruction *emit3(
+	struct radeon_compiler * c, struct rc_instruction * after,
+	gl_inst_opcode Opcode, GLuint Saturate, struct prog_dst_register DstReg,
+	struct prog_src_register SrcReg0, struct prog_src_register SrcReg1,
+	struct prog_src_register SrcReg2)
+{
+	struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
+
+	fpi->I.Opcode = Opcode;
+	fpi->I.SaturateMode = Saturate;
+	fpi->I.DstReg = DstReg;
+	fpi->I.SrcReg[0] = SrcReg0;
+	fpi->I.SrcReg[1] = SrcReg1;
+	fpi->I.SrcReg[2] = SrcReg2;
+	return fpi;
+}
+
+static struct prog_dst_register dstreg(int file, int index)
+{
+	struct prog_dst_register dst;
+	dst.File = file;
+	dst.Index = index;
+	dst.WriteMask = WRITEMASK_XYZW;
+	dst.CondMask = COND_TR;
+	dst.RelAddr = 0;
+	dst.CondSwizzle = SWIZZLE_NOOP;
+	dst.CondSrc = 0;
+	dst.pad = 0;
+	return dst;
+}
+
+static struct prog_dst_register dstregtmpmask(int index, int mask)
+{
+	struct prog_dst_register dst = {0};
+	dst.File = PROGRAM_TEMPORARY;
+	dst.Index = index;
+	dst.WriteMask = mask;
+	dst.RelAddr = 0;
+	dst.CondMask = COND_TR;
+	dst.CondSwizzle = SWIZZLE_NOOP;
+	dst.CondSrc = 0;
+	dst.pad = 0;
+	return dst;
+}
+
+static const struct prog_src_register builtin_zero = {
+	.File = PROGRAM_BUILTIN,
+	.Index = 0,
+	.Swizzle = SWIZZLE_0000
+};
+static const struct prog_src_register builtin_one = {
+	.File = PROGRAM_BUILTIN,
+	.Index = 0,
+	.Swizzle = SWIZZLE_1111
+};
+static const struct prog_src_register srcreg_undefined = {
+	.File = PROGRAM_UNDEFINED,
+	.Index = 0,
+	.Swizzle = SWIZZLE_NOOP
+};
+
+static struct prog_src_register srcreg(int file, int index)
+{
+	struct prog_src_register src = srcreg_undefined;
+	src.File = file;
+	src.Index = index;
+	return src;
+}
+
+static struct prog_src_register srcregswz(int file, int index, int swz)
+{
+	struct prog_src_register src = srcreg_undefined;
+	src.File = file;
+	src.Index = index;
+	src.Swizzle = swz;
+	return src;
+}
+
+static struct prog_src_register absolute(struct prog_src_register reg)
+{
+	struct prog_src_register newreg = reg;
+	newreg.Abs = 1;
+	newreg.Negate = NEGATE_NONE;
+	return newreg;
+}
+
+static struct prog_src_register negate(struct prog_src_register reg)
+{
+	struct prog_src_register newreg = reg;
+	newreg.Negate = newreg.Negate ^ NEGATE_XYZW;
+	return newreg;
+}
+
+static struct prog_src_register swizzle(struct prog_src_register reg, GLuint x, GLuint y, GLuint z, GLuint w)
+{
+	struct prog_src_register swizzled = reg;
+	swizzled.Swizzle = MAKE_SWIZZLE4(
+		x >= 4 ? x : GET_SWZ(reg.Swizzle, x),
+		y >= 4 ? y : GET_SWZ(reg.Swizzle, y),
+		z >= 4 ? z : GET_SWZ(reg.Swizzle, z),
+		w >= 4 ? w : GET_SWZ(reg.Swizzle, w));
+	return swizzled;
+}
+
+static struct prog_src_register scalar(struct prog_src_register reg)
+{
+	return swizzle(reg, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X);
+}
+
+static void transform_ABS(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	struct prog_src_register src = inst->I.SrcReg[0];
+	src.Abs = 1;
+	src.Negate = NEGATE_NONE;
+	emit1(c, inst->Prev, OPCODE_MOV, inst->I.SaturateMode, inst->I.DstReg, src);
+	rc_remove_instruction(inst);
+}
+
+static void transform_DP3(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	struct prog_src_register src0 = inst->I.SrcReg[0];
+	struct prog_src_register src1 = inst->I.SrcReg[1];
+	src0.Negate &= ~NEGATE_W;
+	src0.Swizzle &= ~(7 << (3 * 3));
+	src0.Swizzle |= SWIZZLE_ZERO << (3 * 3);
+	src1.Negate &= ~NEGATE_W;
+	src1.Swizzle &= ~(7 << (3 * 3));
+	src1.Swizzle |= SWIZZLE_ZERO << (3 * 3);
+	emit2(c, inst->Prev, OPCODE_DP4, inst->I.SaturateMode, inst->I.DstReg, src0, src1);
+	rc_remove_instruction(inst);
+}
+
+static void transform_DPH(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	struct prog_src_register src0 = inst->I.SrcReg[0];
+	src0.Negate &= ~NEGATE_W;
+	src0.Swizzle &= ~(7 << (3 * 3));
+	src0.Swizzle |= SWIZZLE_ONE << (3 * 3);
+	emit2(c, inst->Prev, OPCODE_DP4, inst->I.SaturateMode, inst->I.DstReg, src0, inst->I.SrcReg[1]);
+	rc_remove_instruction(inst);
+}
+
+/**
+ * [1, src0.y*src1.y, src0.z, src1.w]
+ * So basically MUL with lotsa swizzling.
+ */
+static void transform_DST(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	emit2(c, inst->Prev, OPCODE_MUL, inst->I.SaturateMode, inst->I.DstReg,
+		swizzle(inst->I.SrcReg[0], SWIZZLE_ONE, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE),
+		swizzle(inst->I.SrcReg[1], SWIZZLE_ONE, SWIZZLE_Y, SWIZZLE_ONE, SWIZZLE_W));
+	rc_remove_instruction(inst);
+}
+
+static void transform_FLR(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	int tempreg = rc_find_free_temporary(c);
+	emit1(c, inst->Prev, OPCODE_FRC, 0, dstreg(PROGRAM_TEMPORARY, tempreg), inst->I.SrcReg[0]);
+	emit2(c, inst->Prev, OPCODE_ADD, inst->I.SaturateMode, inst->I.DstReg,
+		inst->I.SrcReg[0], negate(srcreg(PROGRAM_TEMPORARY, tempreg)));
+	rc_remove_instruction(inst);
+}
+
+/**
+ * Definition of LIT (from ARB_fragment_program):
+ *
+ *  tmp = VectorLoad(op0);
+ *  if (tmp.x < 0) tmp.x = 0;
+ *  if (tmp.y < 0) tmp.y = 0;
+ *  if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
+ *  else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
+ *  result.x = 1.0;
+ *  result.y = tmp.x;
+ *  result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
+ *  result.w = 1.0;
+ *
+ * The longest path of computation is the one leading to result.z,
+ * consisting of 5 operations. This implementation of LIT takes
+ * 5 slots, if the subsequent optimization passes are clever enough
+ * to pair instructions correctly.
+ */
+static void transform_LIT(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	GLuint constant;
+	GLuint constant_swizzle;
+	GLuint temp;
+	struct prog_src_register srctemp;
+
+	constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);
+
+	if (inst->I.DstReg.WriteMask != WRITEMASK_XYZW || inst->I.DstReg.File != PROGRAM_TEMPORARY) {
+		struct rc_instruction * inst_mov;
+
+		inst_mov = emit1(c, inst,
+			OPCODE_MOV, 0, inst->I.DstReg,
+			srcreg(PROGRAM_TEMPORARY, rc_find_free_temporary(c)));
+
+		inst->I.DstReg.File = PROGRAM_TEMPORARY;
+		inst->I.DstReg.Index = inst_mov->I.SrcReg[0].Index;
+		inst->I.DstReg.WriteMask = WRITEMASK_XYZW;
+	}
+
+	temp = inst->I.DstReg.Index;
+	srctemp = srcreg(PROGRAM_TEMPORARY, temp);
+
+	// tmp.x = max(0.0, Src.x);
+	// tmp.y = max(0.0, Src.y);
+	// tmp.z = clamp(Src.w, -128+eps, 128-eps);
+	emit2(c, inst->Prev, OPCODE_MAX, 0,
+		dstregtmpmask(temp, WRITEMASK_XYW),
+		inst->I.SrcReg[0],
+		swizzle(srcreg(PROGRAM_CONSTANT, constant),
+			SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, constant_swizzle&3));
+	emit2(c, inst->Prev, OPCODE_MIN, 0,
+		dstregtmpmask(temp, WRITEMASK_Z),
+		swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+		negate(srcregswz(PROGRAM_CONSTANT, constant, constant_swizzle)));
+
+	// tmp.w = Pow(tmp.y, tmp.z)
+	emit1(c, inst->Prev, OPCODE_LG2, 0,
+		dstregtmpmask(temp, WRITEMASK_W),
+		swizzle(srctemp, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y));
+	emit2(c, inst->Prev, OPCODE_MUL, 0,
+		dstregtmpmask(temp, WRITEMASK_W),
+		swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+		swizzle(srctemp, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z));
+	emit1(c, inst->Prev, OPCODE_EX2, 0,
+		dstregtmpmask(temp, WRITEMASK_W),
+		swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W));
+
+	// tmp.z = (tmp.x > 0) ? tmp.w : 0.0
+	emit3(c, inst->Prev, OPCODE_CMP, inst->I.SaturateMode,
+		dstregtmpmask(temp, WRITEMASK_Z),
+		negate(swizzle(srctemp, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)),
+		swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+		builtin_zero);
+
+	// tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0
+	emit1(c, inst->Prev, OPCODE_MOV, inst->I.SaturateMode,
+		dstregtmpmask(temp, WRITEMASK_XYW),
+		swizzle(srctemp, SWIZZLE_ONE, SWIZZLE_X, SWIZZLE_ONE, SWIZZLE_ONE));
+
+	rc_remove_instruction(inst);
+}
+
+static void transform_LRP(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	int tempreg = rc_find_free_temporary(c);
+
+	emit2(c, inst->Prev, OPCODE_ADD, 0,
+		dstreg(PROGRAM_TEMPORARY, tempreg),
+		inst->I.SrcReg[1], negate(inst->I.SrcReg[2]));
+	emit3(c, inst->Prev, OPCODE_MAD, inst->I.SaturateMode,
+		inst->I.DstReg,
+		inst->I.SrcReg[0], srcreg(PROGRAM_TEMPORARY, tempreg), inst->I.SrcReg[2]);
+
+	rc_remove_instruction(inst);
+}
+
+static void transform_POW(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	int tempreg = rc_find_free_temporary(c);
+	struct prog_dst_register tempdst = dstreg(PROGRAM_TEMPORARY, tempreg);
+	struct prog_src_register tempsrc = srcreg(PROGRAM_TEMPORARY, tempreg);
+	tempdst.WriteMask = WRITEMASK_W;
+	tempsrc.Swizzle = SWIZZLE_WWWW;
+
+	emit1(c, inst->Prev, OPCODE_LG2, 0, tempdst, scalar(inst->I.SrcReg[0]));
+	emit2(c, inst->Prev, OPCODE_MUL, 0, tempdst, tempsrc, scalar(inst->I.SrcReg[1]));
+	emit1(c, inst->Prev, OPCODE_EX2, inst->I.SaturateMode, inst->I.DstReg, tempsrc);
+
+	rc_remove_instruction(inst);
+}
+
+static void transform_RSQ(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	inst->I.SrcReg[0] = absolute(inst->I.SrcReg[0]);
+}
+
+static void transform_SGE(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	int tempreg = rc_find_free_temporary(c);
+
+	emit2(c, inst->Prev, OPCODE_ADD, 0, dstreg(PROGRAM_TEMPORARY, tempreg), inst->I.SrcReg[0], negate(inst->I.SrcReg[1]));
+	emit3(c, inst->Prev, OPCODE_CMP, inst->I.SaturateMode, inst->I.DstReg,
+		srcreg(PROGRAM_TEMPORARY, tempreg), builtin_zero, builtin_one);
+
+	rc_remove_instruction(inst);
+}
+
+static void transform_SLT(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	int tempreg = rc_find_free_temporary(c);
+
+	emit2(c, inst->Prev, OPCODE_ADD, 0, dstreg(PROGRAM_TEMPORARY, tempreg), inst->I.SrcReg[0], negate(inst->I.SrcReg[1]));
+	emit3(c, inst->Prev, OPCODE_CMP, inst->I.SaturateMode, inst->I.DstReg,
+		srcreg(PROGRAM_TEMPORARY, tempreg), builtin_one, builtin_zero);
+
+	rc_remove_instruction(inst);
+}
+
+static void transform_SUB(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	inst->I.Opcode = OPCODE_ADD;
+	inst->I.SrcReg[1] = negate(inst->I.SrcReg[1]);
+}
+
+static void transform_SWZ(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	inst->I.Opcode = OPCODE_MOV;
+}
+
+/* XPD dst, a, b  =>  MUL tmp, a.zxyw, b.yzxw;  MAD dst, a.yzxw, b.zxyw, -tmp;  the XPD is then removed. */
+static void transform_XPD(struct radeon_compiler* c, struct rc_instruction* inst)
+{
+	int prod = rc_find_free_temporary(c);
+
+	emit2(c, inst->Prev, OPCODE_MUL, 0, dstreg(PROGRAM_TEMPORARY, prod),
+		swizzle(inst->I.SrcReg[0], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W),
+		swizzle(inst->I.SrcReg[1], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W));
+	emit3(c, inst->Prev, OPCODE_MAD, inst->I.SaturateMode, inst->I.DstReg,
+		swizzle(inst->I.SrcReg[0], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W),
+		swizzle(inst->I.SrcReg[1], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W),
+		negate(srcreg(PROGRAM_TEMPORARY, prod)));
+
+	rc_remove_instruction(inst);
+}
+
+
+/**
+ * Local transformation callback (see @ref radeonClauseLocalTransform);
+ * the userData pointer is ignored.
+ *
+ * Rewrites these ALU opcodes through the matching transform_XXX() helper:
+ *  ABS, DPH, DST, FLR, LIT, LRP, POW, SGE, SLT, SUB, SWZ, XPD
+ * so that only the following remain in their place:
+ *  MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
+ * RSQ is kept, but its operand is forced to an absolute value to match
+ * Radeon's native RSQ.
+ *
+ * @return GL_TRUE when inst was handled here, GL_FALSE for any other opcode.
+ * @note should be applicable to R300 and R500 fragment programs.
+ */
+GLboolean radeonTransformALU(
+	struct radeon_compiler * c,
+	struct rc_instruction* inst,
+	void* unused)
+{
+	switch(inst->I.Opcode) {
+	case OPCODE_ABS: transform_ABS(c, inst); break;
+	case OPCODE_DPH: transform_DPH(c, inst); break;
+	case OPCODE_DST: transform_DST(c, inst); break;
+	case OPCODE_FLR: transform_FLR(c, inst); break;
+	case OPCODE_LIT: transform_LIT(c, inst); break;
+	case OPCODE_LRP: transform_LRP(c, inst); break;
+	case OPCODE_POW: transform_POW(c, inst); break;
+	case OPCODE_RSQ: transform_RSQ(c, inst); break;
+	case OPCODE_SGE: transform_SGE(c, inst); break;
+	case OPCODE_SLT: transform_SLT(c, inst); break;
+	case OPCODE_SUB: transform_SUB(c, inst); break;
+	case OPCODE_SWZ: transform_SWZ(c, inst); break;
+	case OPCODE_XPD: transform_XPD(c, inst); break;
+	default: return GL_FALSE;
+	}
+	return GL_TRUE;
+}
+
+
+/* ABS dst, a is rewritten in place as MAX dst, a, -a (src1 is a copy of src0 with all negate bits flipped). */
+static void transform_r300_vertex_ABS(struct radeon_compiler* c, struct rc_instruction* inst)
+{
+	/* Note: r500 can take absolute values, but r300 cannot. */
+	inst->I.SrcReg[1] = inst->I.SrcReg[0];
+	inst->I.SrcReg[1].Negate ^= NEGATE_XYZW;
+	inst->I.Opcode = OPCODE_MAX;
+}
+
+/**
+ * radeonLocalTransform callback for the r300 up to r500 vertex engine: hands non-native
+ * ALU opcodes to their transform helper; GL_TRUE if handled, GL_FALSE otherwise.
+ */
+GLboolean r300_transform_vertex_alu(
+	struct radeon_compiler * c,
+	struct rc_instruction* inst,
+	void* unused)
+{
+	switch(inst->I.Opcode) {
+	case OPCODE_ABS: transform_r300_vertex_ABS(c, inst); break;
+	case OPCODE_DP3: transform_DP3(c, inst); break;
+	case OPCODE_DPH: transform_DPH(c, inst); break;
+	case OPCODE_FLR: transform_FLR(c, inst); break;
+	case OPCODE_LRP: transform_LRP(c, inst); break;
+	case OPCODE_SUB: transform_SUB(c, inst); break;
+	case OPCODE_SWZ: transform_SWZ(c, inst); break;
+	case OPCODE_XPD: transform_XPD(c, inst); break;
+	default: return GL_FALSE;
+	}
+	return GL_TRUE;
+}
+
+/* Adds the two vec4 immediates used by the SIN/COS expansions; constants[0..1] receive the values returned for them. */
+static void sincos_constants(struct radeon_compiler* c, GLuint *constants)
+{
+	static const GLfloat SinCosConsts[2][4] = {
+		{
+			1.273239545,		// 4/PI
+			-0.405284735,		// -4/(PI*PI)
+			3.141592654,		// PI
+			0.2225			// weight
+		},
+		{
+			0.75,
+			0.5,
+			0.159154943,		// 1/(2*PI)
+			6.283185307		// 2*PI
+		}
+	};
+
+	constants[0] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[0]);
+	constants[1] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[1]);
+}
+
+/**
+ * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
+ * NOTE(review): callers in this file pass FRC(..) * 2*PI - PI as src -- confirm the range stated above.
+ * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
+ * MAD tmp.x, tmp.y, |src|, tmp.x
+ * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
+ * MAD dest, tmp.y, weight, tmp.x
+ */
+static void sin_approx(
+	struct radeon_compiler* c, struct rc_instruction * after,
+	struct prog_dst_register dst, struct prog_src_register src, const GLuint* constants)
+{
+	GLuint tempreg = rc_find_free_temporary(c); /* scratch temporary; only its .x and .y are written below */
+	/* NOTE(review): every emit below anchors on after->Prev, re-read each time -- confirm callers pass the intended instruction. */
+	emit2(c, after->Prev, OPCODE_MUL, 0, dstregtmpmask(tempreg, WRITEMASK_XY),
+		swizzle(src, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+		srcreg(PROGRAM_CONSTANT, constants[0])); /* MUL tmp.xy, src, constants[0] */
+	emit3(c, after->Prev, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_X),
+		swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
+		absolute(swizzle(src, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)),
+		swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)); /* MAD tmp.x, tmp.y, |src|, tmp.x */
+	emit3(c, after->Prev, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_Y),
+		swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+		absolute(swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)),
+		negate(swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X))); /* MAD tmp.y, tmp.x, |tmp.x|, -tmp.x */
+	emit3(c, after->Prev, OPCODE_MAD, 0, dst,
+		swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
+		swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+		swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)); /* MAD dst, tmp.y, constants[0].w, tmp.x */
+}
+
+/**
+ * Translate the trigonometric functions COS, SIN, and SCS using only the basic
+ * instructions MOV, ADD, MUL, MAD, FRC. The operand is first wrapped with a MAD/FRC/MAD
+ * sequence (x/(2*PI) + bias, fraction, * 2*PI - PI), then fed to sin_approx(). GL_FALSE for other opcodes.
+ */
+GLboolean radeonTransformTrigSimple(struct radeon_compiler* c,
+	struct rc_instruction* inst,
+	void* unused)
+{
+	if (inst->I.Opcode != OPCODE_COS &&
+	    inst->I.Opcode != OPCODE_SIN &&
+	    inst->I.Opcode != OPCODE_SCS)
+		return GL_FALSE;
+
+	GLuint constants[2];
+	GLuint tempreg = rc_find_free_temporary(c);
+
+	sincos_constants(c, constants);
+
+	if (inst->I.Opcode == OPCODE_COS) {
+		// MAD tmp.w, src, 1/(2*PI), 0.75
+		// FRC tmp.w, tmp.w
+		// MAD tmp.w, tmp.w, 2*PI, -PI
+		emit3(c, inst->Prev, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W),
+			swizzle(inst->I.SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z),
+			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X));
+		emit1(c, inst->Prev, OPCODE_FRC, 0, dstregtmpmask(tempreg, WRITEMASK_W),
+			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W));
+		emit3(c, inst->Prev, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W),
+			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+			negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z)));
+		// Pass inst (not inst->Prev): sin_approx() anchors its emits on after->Prev, so they must land after the MAD above.
+		sin_approx(c, inst, inst->I.DstReg,
+			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+			constants);
+	} else if (inst->I.Opcode == OPCODE_SIN) { // same range reduction as COS, with bias 0.5 instead of 0.75
+		emit3(c, inst->Prev, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W),
+			swizzle(inst->I.SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z),
+			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y));
+		emit1(c, inst->Prev, OPCODE_FRC, 0, dstregtmpmask(tempreg, WRITEMASK_W),
+			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W));
+		emit3(c, inst->Prev, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W),
+			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+			negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z)));
+
+		sin_approx(c, inst, inst->I.DstReg,
+			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+			constants);
+	} else { // SCS: tmp.x (bias 0.75) is reduced for the X result, tmp.y (bias 0.5) for the Y result
+		emit3(c, inst->Prev, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_XY),
+			swizzle(inst->I.SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z),
+			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W));
+		emit1(c, inst->Prev, OPCODE_FRC, 0, dstregtmpmask(tempreg, WRITEMASK_XY),
+			srcreg(PROGRAM_TEMPORARY, tempreg));
+		emit3(c, inst->Prev, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_XY),
+			srcreg(PROGRAM_TEMPORARY, tempreg),
+			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+			negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z)));
+
+		struct prog_dst_register dst = inst->I.DstReg;
+
+		dst.WriteMask = inst->I.DstReg.WriteMask & WRITEMASK_X;
+		sin_approx(c, inst, dst,
+			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+			constants);
+
+		dst.WriteMask = inst->I.DstReg.WriteMask & WRITEMASK_Y;
+		sin_approx(c, inst, dst,
+			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
+			constants);
+	}
+
+	rc_remove_instruction(inst);
+
+	return GL_TRUE;
+}
+
+
+/**
+ * Prepare COS, SIN, and SCS for hardware whose trigonometric units expect
+ * a pre-scaled operand: the source is multiplied by 1/(2*PI) and reduced to
+ * its fractional part, so that the input to COS and SIN is always in the range [0,1).
+ * SCS is replaced by one COS and one SIN instruction.
+ *
+ * @warning This transformation implicitly changes the semantics of SIN and COS!
+ */
+GLboolean radeonTransformTrigScale(struct radeon_compiler* c,
+	struct rc_instruction* inst,
+	void* unused)
+{
+	if (!(inst->I.Opcode == OPCODE_COS ||
+	      inst->I.Opcode == OPCODE_SIN ||
+	      inst->I.Opcode == OPCODE_SCS))
+		return GL_FALSE;
+
+	static const GLfloat RCP_2PI = 0.15915494309189535;
+	GLuint scaled;
+	GLuint rcp_index;
+	GLuint rcp_swizzle;
+
+	scaled = rc_find_free_temporary(c);
+	rcp_index = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &rcp_swizzle);
+
+	/* scaled.w = FRC(src.x * RCP_2PI) */
+	emit2(c, inst->Prev, OPCODE_MUL, 0, dstregtmpmask(scaled, WRITEMASK_W),
+		swizzle(inst->I.SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+		srcregswz(PROGRAM_CONSTANT, rcp_index, rcp_swizzle));
+	emit1(c, inst->Prev, OPCODE_FRC, 0, dstregtmpmask(scaled, WRITEMASK_W),
+		srcreg(PROGRAM_TEMPORARY, scaled));
+
+	if (inst->I.Opcode == OPCODE_SCS) {
+		/* One COS for the X channel and one SIN for the Y channel, each only if that channel is written. */
+		struct prog_dst_register moddst = inst->I.DstReg;
+
+		if (inst->I.DstReg.WriteMask & WRITEMASK_X) {
+			moddst.WriteMask = WRITEMASK_X;
+			emit1(c, inst->Prev, OPCODE_COS, inst->I.SaturateMode, moddst,
+				srcregswz(PROGRAM_TEMPORARY, scaled, SWIZZLE_WWWW));
+		}
+		if (inst->I.DstReg.WriteMask & WRITEMASK_Y) {
+			moddst.WriteMask = WRITEMASK_Y;
+			emit1(c, inst->Prev, OPCODE_SIN, inst->I.SaturateMode, moddst,
+				srcregswz(PROGRAM_TEMPORARY, scaled, SWIZZLE_WWWW));
+		}
+	} else {
+		/* COS or SIN: same opcode, saturate mode and destination; only the operand becomes scaled.w */
+		emit1(c, inst->Prev, inst->I.Opcode, inst->I.SaturateMode, inst->I.DstReg,
+			srcregswz(PROGRAM_TEMPORARY, scaled, SWIZZLE_WWWW));
+	}
+
+	rc_remove_instruction(inst);
+
+	return GL_TRUE;
+}
+
+/**
+ * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
+ * The r5xx MDH/MDV instruction provides per-quad partial derivatives and
+ * has the form A*B+C: src0 supplies A and C, and B should be -1, so src1 is
+ * set here to the constant-one swizzle with all four channels negated.
+ * @return GL_TRUE for DDX/DDY, GL_FALSE (inst untouched) for anything else.
+ * @warning This explicitly changes the form of DDX and DDY!
+ */
+GLboolean radeonTransformDeriv(struct radeon_compiler* c,
+	struct rc_instruction* inst,
+	void* unused)
+{
+	if (!(inst->I.Opcode == OPCODE_DDX || inst->I.Opcode == OPCODE_DDY))
+		return GL_FALSE;
+
+	inst->I.SrcReg[1].Negate = NEGATE_XYZW;
+	inst->I.SrcReg[1].Swizzle = MAKE_SWIZZLE4(SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE);
+
+	return GL_TRUE;
+}
diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.h b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.h
index b45958115c..147efec6fc 100644
--- a/src/mesa/drivers/dri/r300/radeon_program_alu.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.h
@@ -31,23 +31,28 @@
 #include "radeon_program.h"
 
 GLboolean radeonTransformALU(
-	struct radeon_transform_context *t,
-	struct prog_instruction*,
+	struct radeon_compiler * c,
+	struct rc_instruction * inst,
+	void*);
+
+GLboolean r300_transform_vertex_alu(
+	struct radeon_compiler * c,
+	struct rc_instruction * inst,
 	void*);
 
 GLboolean radeonTransformTrigSimple(
-	struct radeon_transform_context *t,
-	struct prog_instruction*,
+	struct radeon_compiler * c,
+	struct rc_instruction * inst,
 	void*);
 
 GLboolean radeonTransformTrigScale(
-	struct radeon_transform_context *t,
-	struct prog_instruction*,
+	struct radeon_compiler * c,
+	struct rc_instruction * inst,
 	void*);
 
 GLboolean radeonTransformDeriv(
-	struct radeon_transform_context *t,
-	struct prog_instruction*,
+	struct radeon_compiler * c,
+	struct rc_instruction * inst,
 	void*);
 
 #endif /* __RADEON_PROGRAM_ALU_H_ */
diff --git a/src/mesa/drivers/dri/r300/radeon_program_pair.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.c
index 58bc0d5843..4c26db5d24 100644
--- a/src/mesa/drivers/dri/r300/radeon_program_pair.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.c
@@ -35,17 +35,19 @@
 
 #include "radeon_program_pair.h"
 
-#include "radeon_context.h"
-
+#include "memory_pool.h"
+#include "radeon_compiler.h"
 #include "shader/prog_print.h"
 
 #define error(fmt, args...) do { \
-	_mesa_problem(s->Ctx, "%s::%s(): " fmt "\n",	\
+	rc_error(&s->Compiler->Base, "%s::%s(): " fmt "\n",	\
 		__FILE__, __FUNCTION__, ##args);	\
-	s->Error = GL_TRUE;				\
 } while(0)
 
 struct pair_state_instruction {
+	struct prog_instruction Instruction;
+	GLuint IP; /**< Position of this instruction in original program */
+
 	GLuint IsTex:1; /**< Is a texture instruction */
 	GLuint NeedRGB:1; /**< Needs the RGB ALU */
 	GLuint NeedAlpha:1; /**< Needs the Alpha ALU */
@@ -73,7 +75,7 @@ struct pair_state_instruction {
  * Used to keep track of which instructions read a value.
  */
 struct reg_value_reader {
-	GLuint IP; /**< IP of the instruction that performs this access */
+	struct pair_state_instruction *Reader;
 	struct reg_value_reader *Next;
 };
 
@@ -82,7 +84,7 @@ struct reg_value_reader {
  * PROGRAM_TEMPORARY.
  */
 struct reg_value {
-	GLuint IP; /**< IP of the instruction that writes this value */
+	struct pair_state_instruction *Writer;
 	struct reg_value *Next; /**< Pointer to the next value to be written to the same PROGRAM_TEMPORARY component */
 
 	/**
@@ -116,11 +118,8 @@ struct pair_register_translation {
 };
 
 struct pair_state {
-	GLcontext *Ctx;
-	struct gl_program *Program;
+	struct r300_fragment_program_compiler * Compiler;
 	const struct radeon_pair_handler *Handler;
-	GLboolean Error;
-	GLboolean Debug;
 	GLboolean Verbose;
 	void *UserData;
 
@@ -130,11 +129,6 @@ struct pair_state {
 	struct pair_register_translation Inputs[FRAG_ATTRIB_MAX];
 	struct pair_register_translation Temps[MAX_PROGRAM_TEMPS];
 
-	/**
-	 * Derived information about program instructions.
-	 */
-	struct pair_state_instruction *Instructions;
-
 	struct {
 		GLuint RefCount; /**< # of times this occurs in an unscheduled SrcReg or DstReg */
 	} HwTemps[128];
@@ -147,14 +141,6 @@ struct pair_state {
 	struct pair_state_instruction *ReadyRGB;
 	struct pair_state_instruction *ReadyAlpha;
 	struct pair_state_instruction *ReadyTEX;
-
-	/**
-	 * Pool of @ref reg_value structures for fast allocation.
-	 */
-	struct reg_value *ValuePool;
-	GLuint ValuePoolUsed;
-	struct reg_value_reader *ReaderPool;
-	GLuint ReaderPoolUsed;
 };
 
 
@@ -183,7 +169,7 @@ static GLuint get_hw_reg(struct pair_state *s, GLuint file, GLuint index)
 
 	struct pair_register_translation *t = get_register(s, file, index);
 	if (!t) {
-		_mesa_problem(s->Ctx, "get_hw_reg: %i[%i]\n", file, index);
+		error("get_hw_reg: %i[%i]\n", file, index);
 		return 0;
 	}
 
@@ -221,15 +207,13 @@ static void add_pairinst_to_list(struct pair_state_instruction **list, struct pa
 }
 
 /**
- * The instruction at the given IP has become ready. Link it into the ready
+ * The given instruction has become ready. Link it into the ready
  * instructions.
  */
-static void instruction_ready(struct pair_state *s, int ip)
+static void instruction_ready(struct pair_state *s, struct pair_state_instruction *pairinst)
 {
-	struct pair_state_instruction *pairinst = s->Instructions + ip;
-
 	if (s->Verbose)
-		_mesa_printf("instruction_ready(%i)\n", ip);
+		_mesa_printf("instruction_ready(%i)\n", pairinst->IP);
 
 	if (pairinst->IsTex)
 		add_pairinst_to_list(&s->ReadyTEX, pairinst);
@@ -255,8 +239,7 @@ static void final_rewrite(struct pair_state *s, struct prog_instruction *inst)
 		inst->SrcReg[2] = inst->SrcReg[1];
 		inst->SrcReg[1].File = PROGRAM_BUILTIN;
 		inst->SrcReg[1].Swizzle = SWIZZLE_1111;
-		inst->SrcReg[1].NegateBase = 0;
-		inst->SrcReg[1].NegateAbs = 0;
+		inst->SrcReg[1].Negate = NEGATE_NONE;
 		inst->Opcode = OPCODE_MAD;
 		break;
 	case OPCODE_CMP:
@@ -297,12 +280,12 @@ static void final_rewrite(struct pair_state *s, struct prog_instruction *inst)
  * Classify an instruction according to which ALUs etc. it needs
  */
 static void classify_instruction(struct pair_state *s,
-	struct prog_instruction *inst, struct pair_state_instruction *pairinst)
+	struct pair_state_instruction *psi)
 {
-	pairinst->NeedRGB = (inst->DstReg.WriteMask & WRITEMASK_XYZ) ? 1 : 0;
-	pairinst->NeedAlpha = (inst->DstReg.WriteMask & WRITEMASK_W) ? 1 : 0;
+	psi->NeedRGB = (psi->Instruction.DstReg.WriteMask & WRITEMASK_XYZ) ? 1 : 0;
+	psi->NeedAlpha = (psi->Instruction.DstReg.WriteMask & WRITEMASK_W) ? 1 : 0;
 
-	switch(inst->Opcode) {
+	switch(psi->Instruction.Opcode) {
 	case OPCODE_ADD:
 	case OPCODE_CMP:
 	case OPCODE_DDX:
@@ -320,24 +303,24 @@ static void classify_instruction(struct pair_state *s,
 	case OPCODE_RCP:
 	case OPCODE_RSQ:
 	case OPCODE_SIN:
-		pairinst->IsTranscendent = 1;
-		pairinst->NeedAlpha = 1;
+		psi->IsTranscendent = 1;
+		psi->NeedAlpha = 1;
 		break;
 	case OPCODE_DP4:
-		pairinst->NeedAlpha = 1;
+		psi->NeedAlpha = 1;
 		/* fall through */
 	case OPCODE_DP3:
-		pairinst->NeedRGB = 1;
+		psi->NeedRGB = 1;
 		break;
 	case OPCODE_KIL:
 	case OPCODE_TEX:
 	case OPCODE_TXB:
 	case OPCODE_TXP:
 	case OPCODE_END:
-		pairinst->IsTex = 1;
+		psi->IsTex = 1;
 		break;
 	default:
-		error("Unknown opcode %d\n", inst->Opcode);
+		error("Unknown opcode %d\n", psi->Instruction.Opcode);
 		break;
 	}
 }
@@ -349,30 +332,34 @@ static void classify_instruction(struct pair_state *s,
  */
 static void scan_instructions(struct pair_state *s)
 {
-	struct prog_instruction *inst;
-	struct pair_state_instruction *pairinst;
+	struct rc_instruction *source;
 	GLuint ip;
 
-	for(inst = s->Program->Instructions, pairinst = s->Instructions, ip = 0;
-	    inst->Opcode != OPCODE_END;
-	    ++inst, ++pairinst, ++ip) {
-		final_rewrite(s, inst);
-		classify_instruction(s, inst, pairinst);
+	for(source = s->Compiler->Base.Program.Instructions.Next, ip = 0;
+	    source != &s->Compiler->Base.Program.Instructions;
+	    source = source->Next, ++ip) {
+		struct pair_state_instruction *pairinst = memory_pool_malloc(&s->Compiler->Base.Pool, sizeof(*pairinst));
+		memset(pairinst, 0, sizeof(struct pair_state_instruction));
 
-		int nsrc = _mesa_num_inst_src_regs(inst->Opcode);
+		pairinst->Instruction = source->I;
+		pairinst->IP = ip;
+		final_rewrite(s, &pairinst->Instruction);
+		classify_instruction(s, pairinst);
+
+		int nsrc = _mesa_num_inst_src_regs(pairinst->Instruction.Opcode);
 		int j;
 		for(j = 0; j < nsrc; j++) {
 			struct pair_register_translation *t =
-				get_register(s, inst->SrcReg[j].File, inst->SrcReg[j].Index);
+				get_register(s, pairinst->Instruction.SrcReg[j].File, pairinst->Instruction.SrcReg[j].Index);
 			if (!t)
 				continue;
 
 			t->RefCount++;
 
-			if (inst->SrcReg[j].File == PROGRAM_TEMPORARY) {
+			if (pairinst->Instruction.SrcReg[j].File == PROGRAM_TEMPORARY) {
 				int i;
 				for(i = 0; i < 4; ++i) {
-					GLuint swz = GET_SWZ(inst->SrcReg[j].Swizzle, i);
+					GLuint swz = GET_SWZ(pairinst->Instruction.SrcReg[j].Swizzle, i);
 					if (swz >= 4)
 						continue; /* constant or NIL swizzle */
 					if (!t->Value[swz])
@@ -382,36 +369,37 @@ static void scan_instructions(struct pair_state *s)
 					 * also rewrites the value. The code below adds
 					 * a dependency for the DstReg, which is a superset
 					 * of the SrcReg dependency. */
-					if (inst->DstReg.File == PROGRAM_TEMPORARY &&
-					    inst->DstReg.Index == inst->SrcReg[j].Index &&
-					    GET_BIT(inst->DstReg.WriteMask, swz))
+					if (pairinst->Instruction.DstReg.File == PROGRAM_TEMPORARY &&
+					    pairinst->Instruction.DstReg.Index == pairinst->Instruction.SrcReg[j].Index &&
+					    GET_BIT(pairinst->Instruction.DstReg.WriteMask, swz))
 						continue;
 
-					struct reg_value_reader* r = &s->ReaderPool[s->ReaderPoolUsed++];
+					struct reg_value_reader* r = memory_pool_malloc(&s->Compiler->Base.Pool, sizeof(*r));
 					pairinst->NumDependencies++;
 					t->Value[swz]->NumReaders++;
-					r->IP = ip;
+					r->Reader = pairinst;
 					r->Next = t->Value[swz]->Readers;
 					t->Value[swz]->Readers = r;
 				}
 			}
 		}
 
-		int ndst = _mesa_num_inst_dst_regs(inst->Opcode);
+		int ndst = _mesa_num_inst_dst_regs(pairinst->Instruction.Opcode);
 		if (ndst) {
 			struct pair_register_translation *t =
-				get_register(s, inst->DstReg.File, inst->DstReg.Index);
+				get_register(s, pairinst->Instruction.DstReg.File, pairinst->Instruction.DstReg.Index);
 			if (t) {
 				t->RefCount++;
 
-				if (inst->DstReg.File == PROGRAM_TEMPORARY) {
+				if (pairinst->Instruction.DstReg.File == PROGRAM_TEMPORARY) {
 					int j;
 					for(j = 0; j < 4; ++j) {
-						if (!GET_BIT(inst->DstReg.WriteMask, j))
+						if (!GET_BIT(pairinst->Instruction.DstReg.WriteMask, j))
 							continue;
 
-						struct reg_value* v = &s->ValuePool[s->ValuePoolUsed++];
-						v->IP = ip;
+						struct reg_value* v = memory_pool_malloc(&s->Compiler->Base.Pool, sizeof(*v));
+						memset(v, 0, sizeof(struct reg_value));
+						v->Writer = pairinst;
 						if (t->Value[j]) {
 							pairinst->NumDependencies++;
 							t->Value[j]->Next = v;
@@ -427,7 +415,7 @@ static void scan_instructions(struct pair_state *s)
 			_mesa_printf("scan(%i): NumDeps = %i\n", ip, pairinst->NumDependencies);
 
 		if (!pairinst->NumDependencies)
-			instruction_ready(s, ip);
+			instruction_ready(s, pairinst);
 	}
 
 	/* Clear the PROGRAM_TEMPORARY state */
@@ -439,70 +427,23 @@ static void scan_instructions(struct pair_state *s)
 }
 
 
-/**
- * Reserve hardware temporary registers for the program inputs.
- *
- * @note This allocation is performed explicitly, because the order of inputs
- * is determined by the RS hardware.
- */
-static void allocate_input_registers(struct pair_state *s)
+static void decrement_dependencies(struct pair_state *s, struct pair_state_instruction *pairinst)
 {
-	GLuint InputsRead = s->Program->InputsRead;
-	int i;
-	GLuint hwindex = 0;
-
-	/* Texcoords come first */
-	for (i = 0; i < s->Ctx->Const.MaxTextureUnits; i++) {
-		if (InputsRead & (FRAG_BIT_TEX0 << i))
-			alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_TEX0+i, hwindex++);
-	}
-	InputsRead &= ~FRAG_BITS_TEX_ANY;
-
-	/* fragment position treated as a texcoord */
-	if (InputsRead & FRAG_BIT_WPOS)
-		alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_WPOS, hwindex++);
-	InputsRead &= ~FRAG_BIT_WPOS;
-
-	/* Then primary colour */
-	if (InputsRead & FRAG_BIT_COL0)
-		alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_COL0, hwindex++);
-	InputsRead &= ~FRAG_BIT_COL0;
-
-	/* Secondary color */
-	if (InputsRead & FRAG_BIT_COL1)
-		alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_COL1, hwindex++);
-	InputsRead &= ~FRAG_BIT_COL1;
-
-	/* Fog coordinate */
-	if (InputsRead & FRAG_BIT_FOGC)
-		alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_FOGC, hwindex++);
-	InputsRead &= ~FRAG_BIT_FOGC;
-
-	/* Anything else */
-	if (InputsRead)
-		error("Don't know how to handle inputs 0x%x\n", InputsRead);
-}
-
-
-static void decrement_dependencies(struct pair_state *s, int ip)
-{
-	struct pair_state_instruction *pairinst = s->Instructions + ip;
 	ASSERT(pairinst->NumDependencies > 0);
 	if (!--pairinst->NumDependencies)
-		instruction_ready(s, ip);
+		instruction_ready(s, pairinst);
 }
 
 /**
  * Update the dependency tracking state based on what the instruction
  * at the given IP does.
  */
-static void commit_instruction(struct pair_state *s, int ip)
+static void commit_instruction(struct pair_state *s, struct pair_state_instruction *pairinst)
 {
-	struct prog_instruction *inst = s->Program->Instructions + ip;
-	struct pair_state_instruction *pairinst = s->Instructions + ip;
+	struct prog_instruction *inst = &pairinst->Instruction;
 
 	if (s->Verbose)
-		_mesa_printf("commit_instruction(%i)\n", ip);
+		_mesa_printf("commit_instruction(%i)\n", pairinst->IP);
 
 	if (inst->DstReg.File == PROGRAM_TEMPORARY) {
 		struct pair_register_translation *t = &s->Temps[inst->DstReg.Index];
@@ -517,11 +458,11 @@ static void commit_instruction(struct pair_state *s, int ip)
 			if (t->Value[i]->NumReaders) {
 				struct reg_value_reader *r;
 				for(r = pairinst->Values[i]->Readers; r; r = r->Next)
-					decrement_dependencies(s, r->IP);
+					decrement_dependencies(s, r->Reader);
 			} else if (t->Value[i]->Next) {
 				/* This happens when the only reader writes
 				 * the register at the same time */
-				decrement_dependencies(s, t->Value[i]->Next->IP);
+				decrement_dependencies(s, t->Value[i]->Next->Writer);
 			}
 		}
 	}
@@ -555,7 +496,7 @@ static void commit_instruction(struct pair_state *s, int ip)
 
 			if (!--t->Value[swz]->NumReaders) {
 				if (t->Value[swz]->Next)
-					decrement_dependencies(s, t->Value[swz]->Next->IP);
+					decrement_dependencies(s, t->Value[swz]->Next->Writer);
 			}
 		}
 	}
@@ -586,35 +527,52 @@ static void emit_all_tex(struct pair_state *s)
 
 	// Allocate destination hardware registers in one block to avoid conflicts.
 	for(pairinst = readytex; pairinst; pairinst = pairinst->NextReady) {
-		int ip = pairinst - s->Instructions;
-		struct prog_instruction *inst = s->Program->Instructions + ip;
+		struct prog_instruction *inst = &pairinst->Instruction;
 		if (inst->Opcode != OPCODE_KIL)
 			get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index);
 	}
 
-	if (s->Debug)
+	if (s->Compiler->Base.Debug)
 		_mesa_printf(" BEGIN_TEX\n");
 
 	if (s->Handler->BeginTexBlock)
-		s->Error = s->Error || !s->Handler->BeginTexBlock(s->UserData);
+		s->Compiler->Base.Error = s->Compiler->Base.Error || !s->Handler->BeginTexBlock(s->UserData);
 
 	for(pairinst = readytex; pairinst; pairinst = pairinst->NextReady) {
-		int ip = pairinst - s->Instructions;
-		struct prog_instruction *inst = s->Program->Instructions + ip;
-		commit_instruction(s, ip);
+		struct prog_instruction *inst = &pairinst->Instruction;
+		commit_instruction(s, pairinst);
 
 		if (inst->Opcode != OPCODE_KIL)
 			inst->DstReg.Index = get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index);
 		inst->SrcReg[0].Index = get_hw_reg(s, inst->SrcReg[0].File, inst->SrcReg[0].Index);
 
-		if (s->Debug) {
+		if (s->Compiler->Base.Debug) {
 			_mesa_printf("   ");
 			_mesa_print_instruction(inst);
+			fflush(stderr);
+		}
+
+		struct radeon_pair_texture_instruction rpti;
+
+		switch(inst->Opcode) {
+		case OPCODE_TEX: rpti.Opcode = RADEON_OPCODE_TEX; break;
+		case OPCODE_TXB: rpti.Opcode = RADEON_OPCODE_TXB; break;
+		case OPCODE_TXP: rpti.Opcode = RADEON_OPCODE_TXP; break;
+		default:
+		case OPCODE_KIL: rpti.Opcode = RADEON_OPCODE_KIL; break;
 		}
-		s->Error = s->Error || !s->Handler->EmitTex(s->UserData, inst);
+
+		rpti.DestIndex = inst->DstReg.Index;
+		rpti.WriteMask = inst->DstReg.WriteMask;
+		rpti.TexSrcUnit = inst->TexSrcUnit;
+		rpti.TexSrcTarget = inst->TexSrcTarget;
+		rpti.SrcIndex = inst->SrcReg[0].Index;
+		rpti.SrcSwizzle = inst->SrcReg[0].Swizzle;
+
+		s->Compiler->Base.Error = s->Compiler->Base.Error || !s->Handler->EmitTex(s->UserData, &rpti);
 	}
 
-	if (s->Debug)
+	if (s->Compiler->Base.Debug)
 		_mesa_printf(" END_TEX\n");
 }
 
@@ -637,7 +595,7 @@ static int alloc_pair_source(struct pair_state *s, struct radeon_pair_instructio
 		index = get_hw_reg(s, src.File, src.Index);
 	} else {
 		constant = 1;
-		s->Error |= !s->Handler->EmitConst(s->UserData, src.File, src.Index, &index);
+		index = src.Index;
 	}
 
 	for(i = 0; i < 3; ++i) {
@@ -684,10 +642,12 @@ static int alloc_pair_source(struct pair_state *s, struct radeon_pair_instructio
  * Fill the given ALU instruction's opcodes and source operands into the given pair,
  * if possible.
  */
-static GLboolean fill_instruction_into_pair(struct pair_state *s, struct radeon_pair_instruction *pair, int ip)
+static GLboolean fill_instruction_into_pair(
+	struct pair_state *s,
+	struct radeon_pair_instruction *pair,
+	struct pair_state_instruction *pairinst)
 {
-	struct pair_state_instruction *pairinst = s->Instructions + ip;
-	struct prog_instruction *inst = s->Program->Instructions + ip;
+	struct prog_instruction *inst = &pairinst->Instruction;
 
 	ASSERT(!pairinst->NeedRGB || pair->RGB.Opcode == OPCODE_NOP);
 	ASSERT(!pairinst->NeedAlpha || pair->Alpha.Opcode == OPCODE_NOP);
@@ -722,7 +682,6 @@ static GLboolean fill_instruction_into_pair(struct pair_state *s, struct radeon_
 		if (pairinst->NeedRGB && !pairinst->IsTranscendent) {
 			GLboolean srcrgb = GL_FALSE;
 			GLboolean srcalpha = GL_FALSE;
-			GLuint negatebase = 0;
 			int j;
 			for(j = 0; j < 3; ++j) {
 				GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, j);
@@ -730,8 +689,6 @@ static GLboolean fill_instruction_into_pair(struct pair_state *s, struct radeon_
 					srcrgb = GL_TRUE;
 				else if (swz < 4)
 					srcalpha = GL_TRUE;
-				if (swz != SWIZZLE_NIL && GET_BIT(inst->SrcReg[i].NegateBase, j))
-					negatebase = 1;
 			}
 			source = alloc_pair_source(s, pair, inst->SrcReg[i], srcrgb, srcalpha);
 			if (source < 0)
@@ -739,12 +696,11 @@ static GLboolean fill_instruction_into_pair(struct pair_state *s, struct radeon_
 			pair->RGB.Arg[i].Source = source;
 			pair->RGB.Arg[i].Swizzle = inst->SrcReg[i].Swizzle & 0x1ff;
 			pair->RGB.Arg[i].Abs = inst->SrcReg[i].Abs;
-			pair->RGB.Arg[i].Negate = (negatebase & ~pair->RGB.Arg[i].Abs) ^ inst->SrcReg[i].NegateAbs;
+			pair->RGB.Arg[i].Negate = !!(inst->SrcReg[i].Negate & (NEGATE_X | NEGATE_Y | NEGATE_Z));
 		}
 		if (pairinst->NeedAlpha) {
 			GLboolean srcrgb = GL_FALSE;
 			GLboolean srcalpha = GL_FALSE;
-			GLuint negatebase = GET_BIT(inst->SrcReg[i].NegateBase, pairinst->IsTranscendent ? 0 : 3);
 			GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, pairinst->IsTranscendent ? 0 : 3);
 			if (swz < 3)
 				srcrgb = GL_TRUE;
@@ -756,7 +712,7 @@ static GLboolean fill_instruction_into_pair(struct pair_state *s, struct radeon_
 			pair->Alpha.Arg[i].Source = source;
 			pair->Alpha.Arg[i].Swizzle = swz;
 			pair->Alpha.Arg[i].Abs = inst->SrcReg[i].Abs;
-			pair->Alpha.Arg[i].Negate = (negatebase & ~pair->RGB.Arg[i].Abs) ^ inst->SrcReg[i].NegateAbs;
+			pair->Alpha.Arg[i].Negate = !!(inst->SrcReg[i].Negate & NEGATE_W);
 		}
 	}
 
@@ -772,16 +728,18 @@ static GLboolean fill_instruction_into_pair(struct pair_state *s, struct radeon_
  * we are absolutely certain that we're going to emit a certain
  * instruction pairing.
  */
-static void fill_dest_into_pair(struct pair_state *s, struct radeon_pair_instruction *pair, int ip)
+static void fill_dest_into_pair(
+	struct pair_state *s,
+	struct radeon_pair_instruction *pair,
+	struct pair_state_instruction *pairinst)
 {
-	struct pair_state_instruction *pairinst = s->Instructions + ip;
-	struct prog_instruction *inst = s->Program->Instructions + ip;
+	struct prog_instruction *inst = &pairinst->Instruction;
 
 	if (inst->DstReg.File == PROGRAM_OUTPUT) {
-		if (inst->DstReg.Index == FRAG_RESULT_COLR) {
+		if (inst->DstReg.Index == s->Compiler->OutputColor) {
 			pair->RGB.OutputWriteMask |= inst->DstReg.WriteMask & WRITEMASK_XYZ;
 			pair->Alpha.OutputWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
-		} else if (inst->DstReg.Index == FRAG_RESULT_DEPR) {
+		} else if (inst->DstReg.Index == s->Compiler->OutputDepth) {
 			pair->Alpha.DepthWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
 		}
 	} else {
@@ -808,24 +766,24 @@ static void fill_dest_into_pair(struct pair_state *s, struct radeon_pair_instruc
 static void emit_alu(struct pair_state *s)
 {
 	struct radeon_pair_instruction pair;
+	struct pair_state_instruction *psi;
 
 	if (s->ReadyFullALU || !(s->ReadyRGB && s->ReadyAlpha)) {
-		int ip;
 		if (s->ReadyFullALU) {
-			ip = s->ReadyFullALU - s->Instructions;
+			psi = s->ReadyFullALU;
 			s->ReadyFullALU = s->ReadyFullALU->NextReady;
 		} else if (s->ReadyRGB) {
-			ip = s->ReadyRGB - s->Instructions;
+			psi = s->ReadyRGB;
 			s->ReadyRGB = s->ReadyRGB->NextReady;
 		} else {
-			ip = s->ReadyAlpha - s->Instructions;
+			psi = s->ReadyAlpha;
 			s->ReadyAlpha = s->ReadyAlpha->NextReady;
 		}
 
 		_mesa_bzero(&pair, sizeof(pair));
-		fill_instruction_into_pair(s, &pair, ip);
-		fill_dest_into_pair(s, &pair, ip);
-		commit_instruction(s, ip);
+		fill_instruction_into_pair(s, &pair, psi);
+		fill_dest_into_pair(s, &pair, psi);
+		commit_instruction(s, psi);
 	} else {
 		struct pair_state_instruction **prgb;
 		struct pair_state_instruction **palpha;
@@ -834,65 +792,65 @@ static void emit_alu(struct pair_state *s)
 		 * many source slots; try all possible pairings if necessary */
 		for(prgb = &s->ReadyRGB; *prgb; prgb = &(*prgb)->NextReady) {
 			for(palpha = &s->ReadyAlpha; *palpha; palpha = &(*palpha)->NextReady) {
-				int rgbip = *prgb - s->Instructions;
-				int alphaip = *palpha - s->Instructions;
+				struct pair_state_instruction * psirgb = *prgb;
+				struct pair_state_instruction * psialpha = *palpha;
 				_mesa_bzero(&pair, sizeof(pair));
-				fill_instruction_into_pair(s, &pair, rgbip);
-				if (!fill_instruction_into_pair(s, &pair, alphaip))
+				fill_instruction_into_pair(s, &pair, psirgb);
+				if (!fill_instruction_into_pair(s, &pair, psialpha))
 					continue;
 				*prgb = (*prgb)->NextReady;
 				*palpha = (*palpha)->NextReady;
-				fill_dest_into_pair(s, &pair, rgbip);
-				fill_dest_into_pair(s, &pair, alphaip);
-				commit_instruction(s, rgbip);
-				commit_instruction(s, alphaip);
+				fill_dest_into_pair(s, &pair, psirgb);
+				fill_dest_into_pair(s, &pair, psialpha);
+				commit_instruction(s, psirgb);
+				commit_instruction(s, psialpha);
 				goto success;
 			}
 		}
 
 		/* No success in pairing; just take the first RGB instruction */
-		int ip = s->ReadyRGB - s->Instructions;
+		psi = s->ReadyRGB;
 		s->ReadyRGB = s->ReadyRGB->NextReady;
+
 		_mesa_bzero(&pair, sizeof(pair));
-		fill_instruction_into_pair(s, &pair, ip);
-		fill_dest_into_pair(s, &pair, ip);
-		commit_instruction(s, ip);
+		fill_instruction_into_pair(s, &pair, psi);
+		fill_dest_into_pair(s, &pair, psi);
+		commit_instruction(s, psi);
 	success: ;
 	}
 
-	if (s->Debug)
+	if (s->Compiler->Base.Debug)
 		radeonPrintPairInstruction(&pair);
 
-	s->Error = s->Error || !s->Handler->EmitPaired(s->UserData, &pair);
+	s->Compiler->Base.Error = s->Compiler->Base.Error || !s->Handler->EmitPaired(s->UserData, &pair);
 }
 
+/* Callback function for assigning input registers to hardware registers */
+static void alloc_helper(void * data, unsigned input, unsigned hwreg)
+{
+	struct pair_state * s = data;
+	alloc_hw_reg(s, PROGRAM_INPUT, input, hwreg);
+}
 
-GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program,
+void radeonPairProgram(
+	struct r300_fragment_program_compiler * compiler,
 	const struct radeon_pair_handler* handler, void *userdata)
 {
 	struct pair_state s;
 
 	_mesa_bzero(&s, sizeof(s));
-	s.Ctx = ctx;
-	s.Program = program;
+	s.Compiler = compiler;
 	s.Handler = handler;
 	s.UserData = userdata;
-	s.Debug = (RADEON_DEBUG & DEBUG_PIXEL) ? GL_TRUE : GL_FALSE;
-	s.Verbose = GL_FALSE && s.Debug;
-
-	s.Instructions = (struct pair_state_instruction*)_mesa_calloc(
-		sizeof(struct pair_state_instruction)*s.Program->NumInstructions);
-	s.ValuePool = (struct reg_value*)_mesa_calloc(sizeof(struct reg_value)*s.Program->NumInstructions*4);
-	s.ReaderPool = (struct reg_value_reader*)_mesa_calloc(
-		sizeof(struct reg_value_reader)*s.Program->NumInstructions*12);
+	s.Verbose = GL_FALSE && s.Compiler->Base.Debug;
 
-	if (s.Debug)
+	if (s.Compiler->Base.Debug)
 		_mesa_printf("Emit paired program\n");
 
 	scan_instructions(&s);
-	allocate_input_registers(&s);
+	s.Compiler->AllocateHwInputs(s.Compiler, &alloc_helper, &s);
 
-	while(!s.Error &&
+	while(!s.Compiler->Base.Error &&
 	      (s.ReadyTEX || s.ReadyRGB || s.ReadyAlpha || s.ReadyFullALU)) {
 		if (s.ReadyTEX)
 			emit_all_tex(&s);
@@ -901,14 +859,8 @@ GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program,
 			emit_alu(&s);
 	}
 
-	if (s.Debug)
+	if (s.Compiler->Base.Debug)
 		_mesa_printf(" END\n");
-
-	_mesa_free(s.Instructions);
-	_mesa_free(s.ValuePool);
-	_mesa_free(s.ReaderPool);
-
-	return !s.Error;
 }
 
 
diff --git a/src/mesa/drivers/dri/r300/radeon_program_pair.h b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
index 4624a24629..ff76178551 100644
--- a/src/mesa/drivers/dri/r300/radeon_program_pair.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
@@ -30,6 +30,8 @@
 
 #include "radeon_program.h"
 
+struct r300_fragment_program_compiler;
+
 
 /**
  * Represents a paired instruction, as found in R300 and R500
@@ -82,18 +84,32 @@ struct radeon_pair_instruction {
 };
 
 
+enum {
+	RADEON_OPCODE_TEX = 0,
+	RADEON_OPCODE_TXB,
+	RADEON_OPCODE_TXP,
+	RADEON_OPCODE_KIL
+};
+
+struct radeon_pair_texture_instruction {
+	GLuint Opcode:2; /**< one of RADEON_OPCODE_xxx */
+
+	GLuint DestIndex:8;
+	GLuint WriteMask:4;
+
+	GLuint TexSrcUnit:5;
+	GLuint TexSrcTarget:3;
+
+	GLuint SrcIndex:8;
+	GLuint SrcSwizzle:12;
+};
+
+
 /**
  *
  */
 struct radeon_pair_handler {
 	/**
-	 * Fill in the proper hardware index for the given constant register.
-	 *
-	 * @return GL_FALSE on error.
-	 */
-	GLboolean (*EmitConst)(void*, GLuint file, GLuint index, GLuint *hwindex);
-
-	/**
 	 * Write a paired instruction to the hardware.
 	 *
 	 * @return GL_FALSE on error.
@@ -107,7 +123,7 @@ struct radeon_pair_handler {
 	 *
 	 * @return GL_FALSE on error.
 	 */
-	GLboolean (*EmitTex)(void*, struct prog_instruction*);
+	GLboolean (*EmitTex)(void*, struct radeon_pair_texture_instruction*);
 
 	/**
 	 * Called before a block of contiguous, independent texture
@@ -115,10 +131,11 @@ struct radeon_pair_handler {
 	 */
 	GLboolean (*BeginTexBlock)(void*);
 
-	GLuint MaxHwTemps;
+	unsigned MaxHwTemps;
 };
 
-GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program,
+void radeonPairProgram(
+	struct r300_fragment_program_compiler * compiler,
 	const struct radeon_pair_handler*, void *userdata);
 
 void radeonPrintPairInstruction(struct radeon_pair_instruction *inst);
diff --git a/src/mesa/drivers/dri/r300/r300_cmdbuf.c b/src/mesa/drivers/dri/r300/r300_cmdbuf.c
index c9e1dfe977..da5b7ba642 100644
--- a/src/mesa/drivers/dri/r300/r300_cmdbuf.c
+++ b/src/mesa/drivers/dri/r300/r300_cmdbuf.c
@@ -44,237 +44,425 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "drm.h"
 #include "radeon_drm.h"
 
-#include "radeon_ioctl.h"
 #include "r300_context.h"
 #include "r300_ioctl.h"
 #include "radeon_reg.h"
 #include "r300_reg.h"
 #include "r300_cmdbuf.h"
 #include "r300_emit.h"
+#include "radeon_bocs_wrapper.h"
+#include "radeon_mipmap_tree.h"
 #include "r300_state.h"
+#include "radeon_reg.h"
+#include "radeon_queryobj.h"
 
-// Set this to 1 for extremely verbose debugging of command buffers
-#define DEBUG_CMDBUF		0
-
-/**
- * Send the current command buffer via ioctl to the hardware.
+/** # of dwords reserved for additional instructions that may need to be written
+ * during flushing.
  */
-int r300FlushCmdBufLocked(r300ContextPtr r300, const char *caller)
-{
-	int ret;
-	int i;
-	drm_radeon_cmd_buffer_t cmd;
-	int start;
-
-	if (r300->radeon.lost_context) {
-		start = 0;
-		r300->radeon.lost_context = GL_FALSE;
-	} else
-		start = r300->cmdbuf.count_reemit;
-
-	if (RADEON_DEBUG & DEBUG_IOCTL) {
-		fprintf(stderr, "%s from %s - %i cliprects\n",
-			__FUNCTION__, caller, r300->radeon.numClipRects);
-
-		if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_VERBOSE)
-			for (i = start; i < r300->cmdbuf.count_used; ++i)
-				fprintf(stderr, "%d: %08x\n", i,
-					r300->cmdbuf.cmd_buf[i]);
-	}
+#define SPACE_FOR_FLUSHING	4
 
-	cmd.buf = (char *)(r300->cmdbuf.cmd_buf + start);
-	cmd.bufsz = (r300->cmdbuf.count_used - start) * 4;
+static unsigned packet0_count(r300ContextPtr r300, uint32_t *pkt)
+{
+    if (r300->radeon.radeonScreen->kernel_mm) {
+        return ((((*pkt) >> 16) & 0x3FFF) + 1);
+    } else {
+        drm_r300_cmd_header_t *t = (drm_r300_cmd_header_t*)pkt;
+        return t->packet0.count;
+    }
+}
 
-	if (r300->radeon.state.scissor.enabled) {
-		cmd.nbox = r300->radeon.state.scissor.numClipRects;
-		cmd.boxes =
-		    (drm_clip_rect_t *) r300->radeon.state.scissor.pClipRects;
-	} else {
-		cmd.nbox = r300->radeon.numClipRects;
-		cmd.boxes = (drm_clip_rect_t *) r300->radeon.pClipRects;
-	}
+#define vpu_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->vpu.count)
+#define r500fp_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->r500fp.count)
 
-	ret = drmCommandWrite(r300->radeon.dri.fd,
-			      DRM_RADEON_CMDBUF, &cmd, sizeof(cmd));
+int check_vpu(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	int cnt;
+	int extra = 1;
+	cnt = vpu_count(atom->cmd);
 
-	if (RADEON_DEBUG & DEBUG_SYNC) {
-		fprintf(stderr, "Syncing in %s (from %s)\n\n",
-			__FUNCTION__, caller);
-		radeonWaitForIdleLocked(&r300->radeon);
+	if (r300->radeon.radeonScreen->kernel_mm) {
+		extra = 5;
 	}
 
-	r300->dma.nr_released_bufs = 0;
-	r300->cmdbuf.count_used = 0;
-	r300->cmdbuf.count_reemit = 0;
-
-	return ret;
+	return cnt ? (cnt * 4) + extra : 0;
 }
 
-int r300FlushCmdBuf(r300ContextPtr r300, const char *caller)
-{
-	int ret;
-
-	LOCK_HARDWARE(&r300->radeon);
 
-	ret = r300FlushCmdBufLocked(r300, caller);
+void emit_vpu(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	drm_r300_cmd_header_t cmd;
+	uint32_t addr, ndw;
+
+	cmd.u = atom->cmd[0];
+	addr = (cmd.vpu.adrhi << 8) | cmd.vpu.adrlo;
+	ndw = atom->check(ctx, atom);
+
+	BEGIN_BATCH_NO_AUTOSTATE(ndw);
+
+	ndw -= 5;
+	OUT_BATCH_REGVAL(R300_VAP_PVS_VECTOR_INDX_REG, addr);
+	OUT_BATCH(CP_PACKET0(R300_VAP_PVS_UPLOAD_DATA, ndw-1) | RADEON_ONE_REG_WR);
+	OUT_BATCH_TABLE(&atom->cmd[1], ndw);
+	OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
+	END_BATCH();
+}
 
-	UNLOCK_HARDWARE(&r300->radeon);
+void emit_r500fp(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	drm_r300_cmd_header_t cmd;
+	uint32_t addr, ndw, sz;
+	int type, clamp;
+
+	ndw = atom->check(ctx, atom);
+
+	cmd.u = atom->cmd[0];
+	sz = cmd.r500fp.count;
+	addr = ((cmd.r500fp.adrhi_flags & 1) << 8) | cmd.r500fp.adrlo;
+	type = !!(cmd.r500fp.adrhi_flags & R500FP_CONSTANT_TYPE);
+	clamp = !!(cmd.r500fp.adrhi_flags & R500FP_CONSTANT_CLAMP);
+
+	addr |= (type << 16);
+	addr |= (clamp << 17);
+
+	BEGIN_BATCH_NO_AUTOSTATE(ndw);
+	OUT_BATCH(CP_PACKET0(R500_GA_US_VECTOR_INDEX, 0));
+	OUT_BATCH(addr);
+	ndw-=3;
+	OUT_BATCH(CP_PACKET0(R500_GA_US_VECTOR_DATA, ndw-1) | RADEON_ONE_REG_WR);
+	OUT_BATCH_TABLE(&atom->cmd[1], ndw);
+	END_BATCH();
+}
 
-	if (ret) {
-		fprintf(stderr, "drmRadeonCmdBuffer: %d\n", ret);
-		_mesa_exit(ret);
+static int check_tex_offsets(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	int numtmus = packet0_count(r300, r300->hw.tex.offset.cmd);
+	int dw = 0, i;
+	if (atom->cmd[0] == CP_PACKET2) {
+		return dw;
 	}
-
-	return ret;
+	for(i = 0; i < numtmus; ++i) {
+		radeonTexObj *t = r300->hw.textures[i];
+		if (!t && !r300->radeon.radeonScreen->kernel_mm) {
+			dw += 0;
+		} else if (t && t->image_override && !t->bo) {
+			if (!r300->radeon.radeonScreen->kernel_mm)
+				dw += 2;
+		} else
+			dw += 4;
+	}
+	return dw;
 }
 
-static void r300PrintStateAtom(r300ContextPtr r300, struct r300_state_atom *state)
+static void emit_tex_offsets(GLcontext *ctx, struct radeon_state_atom * atom)
 {
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	int numtmus = packet0_count(r300, r300->hw.tex.offset.cmd);
 	int i;
-	int dwords = (*state->check) (r300, state);
-
-	fprintf(stderr, "  emit %s %d/%d\n", state->name, dwords,
-		state->cmd_size);
 
-	if (RADEON_DEBUG & DEBUG_VERBOSE) {
-		for (i = 0; i < dwords; i++) {
-			fprintf(stderr, "      %s[%d]: %08x\n",
-				state->name, i, state->cmd[i]);
+	for(i = 0; i < numtmus; ++i) {
+		radeonTexObj *t = r300->hw.textures[i];
+		if (t && !t->image_override) {
+			BEGIN_BATCH_NO_AUTOSTATE(4);
+			OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
+			OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+					RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+			END_BATCH();
+		} else if (!t) {
+			/* No texture is bound to this texture unit.
+			 * Point it at the current color buffer as a placeholder so
+			 * that KIL works on KMS (otherwise the CS checker complains).
+			 */
+			if (r300->radeon.radeonScreen->kernel_mm) {
+				struct radeon_renderbuffer *rrb = radeon_get_colorbuffer(&r300->radeon);
+				if (rrb && rrb->bo) {
+					BEGIN_BATCH_NO_AUTOSTATE(4);
+					OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
+					OUT_BATCH_RELOC(0, rrb->bo, 0,
+							RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+					END_BATCH();
+				}
+			}
+		} else { /* override cases */
+			if (t->bo) {
+				BEGIN_BATCH_NO_AUTOSTATE(4);
+				OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
+				OUT_BATCH_RELOC(t->tile_bits, t->bo, 0,
+						RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+				END_BATCH();
+			} else if (!r300->radeon.radeonScreen->kernel_mm) {
+				BEGIN_BATCH_NO_AUTOSTATE(2);
+				OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
+				OUT_BATCH(t->override_offset);
+				END_BATCH();
+			} else {
+				/* Override texture has no BO and kernel_mm is set: nothing to emit */
+			}
 		}
 	}
 }
 
-/**
- * Emit all atoms with a dirty field equal to dirty.
- *
- * The caller must have ensured that there is enough space in the command
- * buffer.
- */
-static INLINE void r300EmitAtoms(r300ContextPtr r300, GLboolean dirty)
+void r300_emit_scissor(GLcontext *ctx)
 {
-	struct r300_state_atom *atom;
-	uint32_t *dest;
-	int dwords;
-
-	dest = r300->cmdbuf.cmd_buf + r300->cmdbuf.count_used;
-
-	/* Emit WAIT */
-	*dest = cmdwait(R300_WAIT_3D | R300_WAIT_3D_CLEAN);
-	dest++;
-	r300->cmdbuf.count_used++;
-
-	/* Emit cache flush */
-	*dest = cmdpacket0(R300_TX_INVALTAGS, 1);
-	dest++;
-	r300->cmdbuf.count_used++;
-
-	*dest = R300_TX_FLUSH;
-	dest++;
-	r300->cmdbuf.count_used++;
-
-	/* Emit END3D */
-	*dest = cmdpacify();
-	dest++;
-	r300->cmdbuf.count_used++;
-
-	/* Emit actual atoms */
-
-	foreach(atom, &r300->hw.atomlist) {
-		if ((atom->dirty || r300->hw.all_dirty) == dirty) {
-			dwords = (*atom->check) (r300, atom);
-			if (dwords) {
-				if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
-					r300PrintStateAtom(r300, atom);
-				}
-				memcpy(dest, atom->cmd, dwords * 4);
-				dest += dwords;
-				r300->cmdbuf.count_used += dwords;
-				atom->dirty = GL_FALSE;
-			} else {
-				if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
-					fprintf(stderr, "  skip state %s\n",
-						atom->name);
-				}
-			}
-		}
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+    unsigned x1, y1, x2, y2;
+	struct radeon_renderbuffer *rrb;
+
+    if (!r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+        return;
+    }
+	rrb = radeon_get_colorbuffer(&r300->radeon);
+	if (!rrb || !rrb->bo) {
+		fprintf(stderr, "no rrb\n");
+		return;
 	}
+    if (r300->radeon.state.scissor.enabled) {
+        x1 = r300->radeon.state.scissor.rect.x1;
+        y1 = r300->radeon.state.scissor.rect.y1;
+        x2 = r300->radeon.state.scissor.rect.x2;
+        y2 = r300->radeon.state.scissor.rect.y2;
+    } else {
+        x1 = 0;
+        y1 = 0;
+        x2 = rrb->base.Width - 1;
+        y2 = rrb->base.Height - 1;
+    }
+    if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
+        x1 += R300_SCISSORS_OFFSET;
+        y1 += R300_SCISSORS_OFFSET;
+        x2 += R300_SCISSORS_OFFSET;
+        y2 += R300_SCISSORS_OFFSET;
+    }
+    BEGIN_BATCH_NO_AUTOSTATE(3);
+    OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
+    OUT_BATCH((x1 << R300_SCISSORS_X_SHIFT)|(y1 << R300_SCISSORS_Y_SHIFT));
+    OUT_BATCH((x2 << R300_SCISSORS_X_SHIFT)|(y2 << R300_SCISSORS_Y_SHIFT));
+    END_BATCH();
+}
+static int check_cb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	uint32_t dw = 6 + 3 + 16;
+	if (r300->radeon.radeonScreen->kernel_mm)
+		dw += 2;
+	if (!r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+		dw -= 3 + 16;
+	}
+	return dw;
 }
 
-/**
- * Copy dirty hardware state atoms into the command buffer.
- *
- * We also copy out clean state if we're at the start of a buffer. That makes
- * it easy to recover from lost contexts.
- */
-void r300EmitState(r300ContextPtr r300)
+static void emit_cb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
 {
-	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_PRIMS))
-		fprintf(stderr, "%s\n", __FUNCTION__);
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	struct radeon_renderbuffer *rrb;
+	uint32_t cbpitch;
+	uint32_t offset = r300->radeon.state.color.draw_offset;
+	uint32_t dw = 6;
+	int i;
 
-	if (r300->cmdbuf.count_used && !r300->hw.is_dirty
-	    && !r300->hw.all_dirty)
+	rrb = radeon_get_colorbuffer(&r300->radeon);
+	if (!rrb || !rrb->bo) {
+		fprintf(stderr, "no rrb\n");
 		return;
+	}
 
-	/* To avoid going across the entire set of states multiple times, just check
-	 * for enough space for the case of emitting all state, and inline the
-	 * r300AllocCmdBuf code here without all the checks.
-	 */
-	r300EnsureCmdBufSpace(r300, r300->hw.max_state_size, __FUNCTION__);
+        if (RADEON_DEBUG & RADEON_STATE)
+           fprintf(stderr,"rrb is %p %d %dx%d\n", rrb, offset, rrb->base.Width, rrb->base.Height);
+	cbpitch = (rrb->pitch / rrb->cpp);
+	if (rrb->cpp == 4)
+		cbpitch |= R300_COLOR_FORMAT_ARGB8888;
+	else switch (rrb->base._ActualFormat) {
+	case GL_RGB5:
+		cbpitch |= R300_COLOR_FORMAT_RGB565;
+		break;
+	case GL_RGBA4:
+		cbpitch |= R300_COLOR_FORMAT_ARGB4444;
+		break;
+	case GL_RGB5_A1:
+		cbpitch |= R300_COLOR_FORMAT_ARGB1555;
+		break;
+	}
 
-	if (!r300->cmdbuf.count_used) {
-		if (RADEON_DEBUG & DEBUG_STATE)
-			fprintf(stderr, "Begin reemit state\n");
+	if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
+		cbpitch |= R300_COLOR_TILE_ENABLE;
+
+    	if (r300->radeon.radeonScreen->kernel_mm)
+		dw += 2;
+	BEGIN_BATCH_NO_AUTOSTATE(dw);
+	OUT_BATCH_REGSEQ(R300_RB3D_COLOROFFSET0, 1);
+	OUT_BATCH_RELOC(offset, rrb->bo, offset, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+	OUT_BATCH_REGSEQ(R300_RB3D_COLORPITCH0, 1);
+    	if (!r300->radeon.radeonScreen->kernel_mm)
+		OUT_BATCH(cbpitch);
+	else
+		OUT_BATCH_RELOC(cbpitch, rrb->bo, cbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+	END_BATCH();
+    if (r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+        if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+            BEGIN_BATCH_NO_AUTOSTATE(3);
+            OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
+            OUT_BATCH(0);
+            OUT_BATCH(((rrb->base.Width - 1) << R300_SCISSORS_X_SHIFT) |
+                    ((rrb->base.Height - 1) << R300_SCISSORS_Y_SHIFT));
+            END_BATCH();
+            BEGIN_BATCH_NO_AUTOSTATE(16);
+            for (i = 0; i < 4; i++) {
+                OUT_BATCH_REGSEQ(R300_SC_CLIPRECT_TL_0 + (i * 8), 2);
+                OUT_BATCH((0 << R300_CLIPRECT_X_SHIFT) | (0 << R300_CLIPRECT_Y_SHIFT));
+                OUT_BATCH(((rrb->base.Width - 1) << R300_CLIPRECT_X_SHIFT) | ((rrb->base.Height - 1) << R300_CLIPRECT_Y_SHIFT));
+            }
+            OUT_BATCH_REGSEQ(R300_SC_CLIP_RULE, 1);
+            OUT_BATCH(0xAAAA);
+            OUT_BATCH_REGSEQ(R300_SC_SCREENDOOR, 1);
+            OUT_BATCH(0xffffff);
+            END_BATCH();
+        } else {
+            BEGIN_BATCH_NO_AUTOSTATE(3);
+            OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
+            OUT_BATCH((R300_SCISSORS_OFFSET << R300_SCISSORS_X_SHIFT) |
+                    (R300_SCISSORS_OFFSET << R300_SCISSORS_Y_SHIFT));
+            OUT_BATCH(((rrb->base.Width + R300_SCISSORS_OFFSET - 1) << R300_SCISSORS_X_SHIFT) |
+                    ((rrb->base.Height + R300_SCISSORS_OFFSET - 1) << R300_SCISSORS_Y_SHIFT));
+            END_BATCH();
+            BEGIN_BATCH_NO_AUTOSTATE(16);
+            for (i = 0; i < 4; i++) {
+                OUT_BATCH_REGSEQ(R300_SC_CLIPRECT_TL_0 + (i * 8), 2);
+                OUT_BATCH((R300_SCISSORS_OFFSET << R300_CLIPRECT_X_SHIFT) | (R300_SCISSORS_OFFSET << R300_CLIPRECT_Y_SHIFT));
+                OUT_BATCH(((R300_SCISSORS_OFFSET + rrb->base.Width - 1) << R300_CLIPRECT_X_SHIFT) |
+                          ((R300_SCISSORS_OFFSET + rrb->base.Height - 1) << R300_CLIPRECT_Y_SHIFT));
+            }
+            OUT_BATCH_REGSEQ(R300_SC_CLIP_RULE, 1);
+            OUT_BATCH(0xAAAA);
+            OUT_BATCH_REGSEQ(R300_SC_SCREENDOOR, 1);
+            OUT_BATCH(0xffffff);
+            END_BATCH();
+        }
+    }
+}
 
-		r300EmitAtoms(r300, GL_FALSE);
-		r300->cmdbuf.count_reemit = r300->cmdbuf.count_used;
-	}
+static int check_zb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	uint32_t dw;
+	dw = 6;
+	if (r300->radeon.radeonScreen->kernel_mm)
+		dw += 2;
+	return dw;
+}
 
-	if (RADEON_DEBUG & DEBUG_STATE)
-		fprintf(stderr, "Begin dirty state\n");
+static void emit_zb_offset(GLcontext *ctx, struct radeon_state_atom * atom) /* emits R300_ZB_DEPTHOFFSET and R300_ZB_DEPTHPITCH for the depth renderbuffer */
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	struct radeon_renderbuffer *rrb;
+	uint32_t zbpitch;
+	uint32_t dw = atom->check(ctx, atom); /* batch size in dwords, as reported by this atom's check callback */
+
+	rrb = radeon_get_depthbuffer(&r300->radeon);
+	if (!rrb || !rrb->bo) /* rrb->bo is dereferenced below; same guard emit_cb_offset uses */
+		return;
 
-	r300EmitAtoms(r300, GL_TRUE);
+	zbpitch = (rrb->pitch / rrb->cpp);
+	if (!r300->radeon.radeonScreen->kernel_mm) { /* tiling bits are OR'ed into the pitch word only when kernel_mm is off */
+	    if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
+	        zbpitch |= R300_DEPTHMACROTILE_ENABLE;
+	   }
+	    if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE){
+	        zbpitch |= R300_DEPTHMICROTILE_TILED;
+	    }
+	}
 
-	assert(r300->cmdbuf.count_used < r300->cmdbuf.size);
+	BEGIN_BATCH_NO_AUTOSTATE(dw);
+	OUT_BATCH_REGSEQ(R300_ZB_DEPTHOFFSET, 1);
+	OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+	OUT_BATCH_REGSEQ(R300_ZB_DEPTHPITCH, 1);
+    	if (!r300->radeon.radeonScreen->kernel_mm)
+	    OUT_BATCH(zbpitch);
+	else
+	    OUT_BATCH_RELOC(zbpitch, rrb->bo, zbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0); /* pass this function's own pitch word; cbpitch is not declared here */
+	END_BATCH();
+}
 
-	r300->hw.is_dirty = GL_FALSE;
-	r300->hw.all_dirty = GL_FALSE;
+static void emit_zstencil_format(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	struct radeon_renderbuffer *rrb;
+	uint32_t format = 0;
+
+	rrb = radeon_get_depthbuffer(&r300->radeon);
+	if (!rrb)
+	  format = 0;
+	else {
+	  if (rrb->cpp == 2)
+	    format = R300_DEPTHFORMAT_16BIT_INT_Z;
+	  else if (rrb->cpp == 4)
+	    format = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
+	}
+
+	BEGIN_BATCH_NO_AUTOSTATE(atom->cmd_size);
+	OUT_BATCH(atom->cmd[0]);
+	atom->cmd[1] &= ~0xf;
+	atom->cmd[1] |= format;
+	OUT_BATCH(atom->cmd[1]);
+	OUT_BATCH(atom->cmd[2]);
+	OUT_BATCH(atom->cmd[3]);
+	OUT_BATCH(atom->cmd[4]);
+	END_BATCH();
 }
 
-#define packet0_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->packet0.count)
-#define vpu_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->vpu.count)
-#define r500fp_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->r500fp.count)
+static int check_never(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   return 0;
+}
 
-static int check_always(r300ContextPtr r300, struct r300_state_atom *atom)
+static int check_always(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	return atom->cmd_size;
 }
 
-static int check_variable(r300ContextPtr r300, struct r300_state_atom *atom)
+static int check_variable(GLcontext *ctx, struct radeon_state_atom *atom)
 {
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	int cnt;
-	cnt = packet0_count(atom->cmd);
+	if (atom->cmd[0] == CP_PACKET2) {
+		return 0;
+	}
+	cnt = packet0_count(r300, atom->cmd);
 	return cnt ? cnt + 1 : 0;
 }
 
-static int check_vpu(r300ContextPtr r300, struct r300_state_atom *atom)
+int check_r500fp(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	int cnt;
-	cnt = vpu_count(atom->cmd);
-	return cnt ? (cnt * 4) + 1 : 0;
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	int extra = 1;
+	cnt = r500fp_count(atom->cmd);
+	if (r300->radeon.radeonScreen->kernel_mm)
+		extra = 3;
+
+	return cnt ? (cnt * 6) + extra : 0;
 }
 
-static int check_r500fp(r300ContextPtr r300, struct r300_state_atom *atom)
+int check_r500fp_const(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	int cnt;
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	int extra = 1;
 	cnt = r500fp_count(atom->cmd);
-	return cnt ? (cnt * 6) + 1 : 0;
-}
+	if (r300->radeon.radeonScreen->kernel_mm)
+		extra = 3;
 
-static int check_r500fp_const(r300ContextPtr r300, struct r300_state_atom *atom)
-{
-	int cnt;
 	cnt = r500fp_count(atom->cmd);
-	return cnt ? (cnt * 4) + 1 : 0;
+	return cnt ? (cnt * 4) + extra : 0;
 }
 
 #define ALLOC_STATE( ATOM, CHK, SZ, IDX )				\
@@ -285,8 +473,8 @@ static int check_r500fp_const(r300ContextPtr r300, struct r300_state_atom *atom)
       r300->hw.ATOM.idx = (IDX);					\
       r300->hw.ATOM.check = check_##CHK;				\
       r300->hw.ATOM.dirty = GL_FALSE;					\
-      r300->hw.max_state_size += (SZ);					\
-      insert_at_tail(&r300->hw.atomlist, &r300->hw.ATOM);		\
+      r300->radeon.hw.max_state_size += (SZ);					\
+      insert_at_tail(&r300->radeon.hw.atomlist, &r300->hw.ATOM);		\
    } while (0)
 /**
  * Allocate memory for the command buffer and initialize the state atom
@@ -294,251 +482,310 @@ static int check_r500fp_const(r300ContextPtr r300, struct r300_state_atom *atom)
  */
 void r300InitCmdBuf(r300ContextPtr r300)
 {
-	int size, mtu;
-	int has_tcl = 1;
+	int mtu;
+	int has_tcl;
 	int is_r500 = 0;
-	int i;
 
-	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
-		has_tcl = 0;
+	has_tcl = r300->options.hw_tcl_enabled;
 
 	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
 		is_r500 = 1;
 
-	r300->hw.max_state_size = 2 + 2;	/* reserve extra space for WAIT_IDLE and tex cache flush */
+	r300->radeon.hw.max_state_size = 2 + 2;	/* reserve extra space for WAIT_IDLE and tex cache flush */
 
 	mtu = r300->radeon.glCtx->Const.MaxTextureUnits;
-	if (RADEON_DEBUG & DEBUG_TEXTURE) {
+	if (RADEON_DEBUG & RADEON_TEXTURE) {
 		fprintf(stderr, "Using %d maximum texture units..\n", mtu);
 	}
 
 	/* Setup the atom linked list */
-	make_empty_list(&r300->hw.atomlist);
-	r300->hw.atomlist.name = "atom-list";
+	make_empty_list(&r300->radeon.hw.atomlist);
+	r300->radeon.hw.atomlist.name = "atom-list";
 
 	/* Initialize state atoms */
 	ALLOC_STATE(vpt, always, R300_VPT_CMDSIZE, 0);
-	r300->hw.vpt.cmd[R300_VPT_CMD_0] = cmdpacket0(R300_SE_VPORT_XSCALE, 6);
+	r300->hw.vpt.cmd[R300_VPT_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_SE_VPORT_XSCALE, 6);
 	ALLOC_STATE(vap_cntl, always, R300_VAP_CNTL_SIZE, 0);
-	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_FLUSH] = cmdpacket0(R300_VAP_PVS_STATE_FLUSH_REG, 1);
+	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_FLUSH] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PVS_STATE_FLUSH_REG, 1);
 	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_FLUSH_1] = 0;
-	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_CMD] = cmdpacket0(R300_VAP_CNTL, 1);
-	if (is_r500) {
+	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_CMD] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_CNTL, 1);
+	if (is_r500 && !r300->radeon.radeonScreen->kernel_mm) {
 	    ALLOC_STATE(vap_index_offset, always, 2, 0);
-	    r300->hw.vap_index_offset.cmd[0] = cmdpacket0(R500_VAP_INDEX_OFFSET, 1);
+	    r300->hw.vap_index_offset.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_VAP_INDEX_OFFSET, 1);
 	    r300->hw.vap_index_offset.cmd[1] = 0;
 	}
 	ALLOC_STATE(vte, always, 3, 0);
-	r300->hw.vte.cmd[0] = cmdpacket0(R300_SE_VTE_CNTL, 2);
+	r300->hw.vte.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SE_VTE_CNTL, 2);
 	ALLOC_STATE(vap_vf_max_vtx_indx, always, 3, 0);
-	r300->hw.vap_vf_max_vtx_indx.cmd[0] = cmdpacket0(R300_VAP_VF_MAX_VTX_INDX, 2);
+	r300->hw.vap_vf_max_vtx_indx.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_VF_MAX_VTX_INDX, 2);
 	ALLOC_STATE(vap_cntl_status, always, 2, 0);
-	r300->hw.vap_cntl_status.cmd[0] = cmdpacket0(R300_VAP_CNTL_STATUS, 1);
+	r300->hw.vap_cntl_status.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_CNTL_STATUS, 1);
 	ALLOC_STATE(vir[0], variable, R300_VIR_CMDSIZE, 0);
 	r300->hw.vir[0].cmd[R300_VIR_CMD_0] =
-	    cmdpacket0(R300_VAP_PROG_STREAM_CNTL_0, 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PROG_STREAM_CNTL_0, 1);
 	ALLOC_STATE(vir[1], variable, R300_VIR_CMDSIZE, 1);
 	r300->hw.vir[1].cmd[R300_VIR_CMD_0] =
-	    cmdpacket0(R300_VAP_PROG_STREAM_CNTL_EXT_0, 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PROG_STREAM_CNTL_EXT_0, 1);
 	ALLOC_STATE(vic, always, R300_VIC_CMDSIZE, 0);
-	r300->hw.vic.cmd[R300_VIC_CMD_0] = cmdpacket0(R300_VAP_VTX_STATE_CNTL, 2);
+	r300->hw.vic.cmd[R300_VIC_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_VTX_STATE_CNTL, 2);
 	ALLOC_STATE(vap_psc_sgn_norm_cntl, always, 2, 0);
-	r300->hw.vap_psc_sgn_norm_cntl.cmd[0] = cmdpacket0(R300_VAP_PSC_SGN_NORM_CNTL, SGN_NORM_ZERO_CLAMP_MINUS_ONE);
+	r300->hw.vap_psc_sgn_norm_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PSC_SGN_NORM_CNTL, SGN_NORM_ZERO_CLAMP_MINUS_ONE);
 
 	if (has_tcl) {
 		ALLOC_STATE(vap_clip_cntl, always, 2, 0);
-		r300->hw.vap_clip_cntl.cmd[0] = cmdpacket0(R300_VAP_CLIP_CNTL, 1);
+		r300->hw.vap_clip_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_CLIP_CNTL, 1);
 		ALLOC_STATE(vap_clip, always, 5, 0);
-		r300->hw.vap_clip.cmd[0] = cmdpacket0(R300_VAP_GB_VERT_CLIP_ADJ, 4);
+		r300->hw.vap_clip.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_GB_VERT_CLIP_ADJ, 4);
 		ALLOC_STATE(vap_pvs_vtx_timeout_reg, always, 2, 0);
-		r300->hw.vap_pvs_vtx_timeout_reg.cmd[0] = cmdpacket0(VAP_PVS_VTX_TIMEOUT_REG, 1);
+		r300->hw.vap_pvs_vtx_timeout_reg.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, VAP_PVS_VTX_TIMEOUT_REG, 1);
 	}
 
 	ALLOC_STATE(vof, always, R300_VOF_CMDSIZE, 0);
 	r300->hw.vof.cmd[R300_VOF_CMD_0] =
-	    cmdpacket0(R300_VAP_OUTPUT_VTX_FMT_0, 2);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_VAP_OUTPUT_VTX_FMT_0, 2);
 
 	if (has_tcl) {
 		ALLOC_STATE(pvs, always, R300_PVS_CMDSIZE, 0);
 		r300->hw.pvs.cmd[R300_PVS_CMD_0] =
-		    cmdpacket0(R300_VAP_PVS_CODE_CNTL_0, 3);
+		    cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PVS_CODE_CNTL_0, 3);
 	}
 
 	ALLOC_STATE(gb_enable, always, 2, 0);
-	r300->hw.gb_enable.cmd[0] = cmdpacket0(R300_GB_ENABLE, 1);
-	ALLOC_STATE(gb_misc, always, R300_GB_MISC_CMDSIZE, 0);
-	r300->hw.gb_misc.cmd[0] = cmdpacket0(R300_GB_MSPOS0, 5);
+	r300->hw.gb_enable.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GB_ENABLE, 1);
+	if (!r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+		ALLOC_STATE(gb_misc, always, R300_GB_MISC_CMDSIZE, 0);
+	} else {
+		ALLOC_STATE(gb_misc, never, R300_GB_MISC_CMDSIZE, 0);
+	}
+	r300->hw.gb_misc.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GB_MSPOS0, 3);
+	ALLOC_STATE(gb_misc2, always, R300_GB_MISC2_CMDSIZE, 0);
+	r300->hw.gb_misc2.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, 0x401C, 2);
 	ALLOC_STATE(txe, always, R300_TXE_CMDSIZE, 0);
-	r300->hw.txe.cmd[R300_TXE_CMD_0] = cmdpacket0(R300_TX_ENABLE, 1);
+	r300->hw.txe.cmd[R300_TXE_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_TX_ENABLE, 1);
 	ALLOC_STATE(ga_point_s0, always, 5, 0);
-	r300->hw.ga_point_s0.cmd[0] = cmdpacket0(R300_GA_POINT_S0, 4);
+	r300->hw.ga_point_s0.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POINT_S0, 4);
 	ALLOC_STATE(ga_triangle_stipple, always, 2, 0);
-	r300->hw.ga_triangle_stipple.cmd[0] = cmdpacket0(R300_GA_TRIANGLE_STIPPLE, 1);
+	r300->hw.ga_triangle_stipple.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_TRIANGLE_STIPPLE, 1);
 	ALLOC_STATE(ps, always, R300_PS_CMDSIZE, 0);
-	r300->hw.ps.cmd[0] = cmdpacket0(R300_GA_POINT_SIZE, 1);
+	r300->hw.ps.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POINT_SIZE, 1);
 	ALLOC_STATE(ga_point_minmax, always, 4, 0);
-	r300->hw.ga_point_minmax.cmd[0] = cmdpacket0(R300_GA_POINT_MINMAX, 3);
+	r300->hw.ga_point_minmax.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POINT_MINMAX, 3);
 	ALLOC_STATE(lcntl, always, 2, 0);
-	r300->hw.lcntl.cmd[0] = cmdpacket0(R300_GA_LINE_CNTL, 1);
+	r300->hw.lcntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_LINE_CNTL, 1);
 	ALLOC_STATE(ga_line_stipple, always, 4, 0);
-	r300->hw.ga_line_stipple.cmd[0] = cmdpacket0(R300_GA_LINE_STIPPLE_VALUE, 3);
-	ALLOC_STATE(shade, always, 5, 0);
-	r300->hw.shade.cmd[0] = cmdpacket0(R300_GA_ENHANCE, 4);
+	r300->hw.ga_line_stipple.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_LINE_STIPPLE_VALUE, 3);
+        if (!r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+		ALLOC_STATE(shade, always, 2, 0);
+        } else {
+		ALLOC_STATE(shade, never, 2, 0);
+        }
+	r300->hw.shade.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_ENHANCE, 1);
+	ALLOC_STATE(shade2, always, 4, 0);
+	r300->hw.shade2.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, 0x4278, 3);
 	ALLOC_STATE(polygon_mode, always, 4, 0);
-	r300->hw.polygon_mode.cmd[0] = cmdpacket0(R300_GA_POLY_MODE, 3);
+	r300->hw.polygon_mode.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POLY_MODE, 3);
 	ALLOC_STATE(fogp, always, 3, 0);
-	r300->hw.fogp.cmd[0] = cmdpacket0(R300_GA_FOG_SCALE, 2);
+	r300->hw.fogp.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_FOG_SCALE, 2);
 	ALLOC_STATE(zbias_cntl, always, 2, 0);
-	r300->hw.zbias_cntl.cmd[0] = cmdpacket0(R300_SU_TEX_WRAP, 1);
+	r300->hw.zbias_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_TEX_WRAP, 1);
 	ALLOC_STATE(zbs, always, R300_ZBS_CMDSIZE, 0);
 	r300->hw.zbs.cmd[R300_ZBS_CMD_0] =
-	    cmdpacket0(R300_SU_POLY_OFFSET_FRONT_SCALE, 4);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_SU_POLY_OFFSET_FRONT_SCALE, 4);
 	ALLOC_STATE(occlusion_cntl, always, 2, 0);
-	r300->hw.occlusion_cntl.cmd[0] = cmdpacket0(R300_SU_POLY_OFFSET_ENABLE, 1);
+	r300->hw.occlusion_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_POLY_OFFSET_ENABLE, 1);
 	ALLOC_STATE(cul, always, R300_CUL_CMDSIZE, 0);
-	r300->hw.cul.cmd[R300_CUL_CMD_0] = cmdpacket0(R300_SU_CULL_MODE, 1);
+	r300->hw.cul.cmd[R300_CUL_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_CULL_MODE, 1);
 	ALLOC_STATE(su_depth_scale, always, 3, 0);
-	r300->hw.su_depth_scale.cmd[0] = cmdpacket0(R300_SU_DEPTH_SCALE, 2);
+	r300->hw.su_depth_scale.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_DEPTH_SCALE, 2);
 	ALLOC_STATE(rc, always, R300_RC_CMDSIZE, 0);
-	r300->hw.rc.cmd[R300_RC_CMD_0] = cmdpacket0(R300_RS_COUNT, 2);
+	r300->hw.rc.cmd[R300_RC_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_COUNT, 2);
 	if (is_r500) {
-		ALLOC_STATE(ri, always, R500_RI_CMDSIZE, 0);
-		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(R500_RS_IP_0, 16);
-		for (i = 0; i < 8; i++) {
-			r300->hw.ri.cmd[R300_RI_CMD_0 + i +1] =
-			  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
-                          (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
-                          (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
-                          (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT);
-		}
+		ALLOC_STATE(ri, variable, R500_RI_CMDSIZE, 0);
+		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_IP_0, 16);
 		ALLOC_STATE(rr, variable, R300_RR_CMDSIZE, 0);
-		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R500_RS_INST_0, 1);
+		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_INST_0, 1);
 	} else {
-		ALLOC_STATE(ri, always, R300_RI_CMDSIZE, 0);
-		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(R300_RS_IP_0, 8);
+		ALLOC_STATE(ri, variable, R300_RI_CMDSIZE, 0);
+		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_IP_0, 8);
 		ALLOC_STATE(rr, variable, R300_RR_CMDSIZE, 0);
-		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R300_RS_INST_0, 1);
+		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_INST_0, 1);
 	}
 	ALLOC_STATE(sc_hyperz, always, 3, 0);
-	r300->hw.sc_hyperz.cmd[0] = cmdpacket0(R300_SC_HYPERZ, 2);
+	r300->hw.sc_hyperz.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SC_HYPERZ, 2);
 	ALLOC_STATE(sc_screendoor, always, 2, 0);
-	r300->hw.sc_screendoor.cmd[0] = cmdpacket0(R300_SC_SCREENDOOR, 1);
+	r300->hw.sc_screendoor.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SC_SCREENDOOR, 1);
 	ALLOC_STATE(us_out_fmt, always, 6, 0);
-	r300->hw.us_out_fmt.cmd[0] = cmdpacket0(R300_US_OUT_FMT, 5);
+	r300->hw.us_out_fmt.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_OUT_FMT, 5);
 
 	if (is_r500) {
 		ALLOC_STATE(fp, always, R500_FP_CMDSIZE, 0);
-		r300->hw.fp.cmd[R500_FP_CMD_0] = cmdpacket0(R500_US_CONFIG, 2);
+		r300->hw.fp.cmd[R500_FP_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_US_CONFIG, 2);
 		r300->hw.fp.cmd[R500_FP_CNTL] = R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO;
-		r300->hw.fp.cmd[R500_FP_CMD_1] = cmdpacket0(R500_US_CODE_ADDR, 3);
-		r300->hw.fp.cmd[R500_FP_CMD_2] = cmdpacket0(R500_US_FC_CTRL, 1);
+		r300->hw.fp.cmd[R500_FP_CMD_1] = cmdpacket0(r300->radeon.radeonScreen, R500_US_CODE_ADDR, 3);
+		r300->hw.fp.cmd[R500_FP_CMD_2] = cmdpacket0(r300->radeon.radeonScreen, R500_US_FC_CTRL, 1);
 		r300->hw.fp.cmd[R500_FP_FC_CNTL] = 0; /* FIXME when we add flow control */
 
 		ALLOC_STATE(r500fp, r500fp, R500_FPI_CMDSIZE, 0);
-		r300->hw.r500fp.cmd[R300_FPI_CMD_0] = cmdr500fp(0, 0, 0, 0);
+		r300->hw.r500fp.cmd[R300_FPI_CMD_0] =
+			cmdr500fp(r300->radeon.radeonScreen, 0, 0, 0, 0);
+		if (r300->radeon.radeonScreen->kernel_mm)
+			r300->hw.r500fp.emit = emit_r500fp;
+
 		ALLOC_STATE(r500fp_const, r500fp_const, R500_FPP_CMDSIZE, 0);
-		r300->hw.r500fp_const.cmd[R300_FPI_CMD_0] = cmdr500fp(0, 0, 1, 0);
+		r300->hw.r500fp_const.cmd[R300_FPI_CMD_0] =
+			cmdr500fp(r300->radeon.radeonScreen, 0, 0, 1, 0);
+		if (r300->radeon.radeonScreen->kernel_mm)
+			r300->hw.r500fp_const.emit = emit_r500fp;
 	} else {
 		ALLOC_STATE(fp, always, R300_FP_CMDSIZE, 0);
-		r300->hw.fp.cmd[R300_FP_CMD_0] = cmdpacket0(R300_US_CONFIG, 3);
-		r300->hw.fp.cmd[R300_FP_CMD_1] = cmdpacket0(R300_US_CODE_ADDR_0, 4);
+		r300->hw.fp.cmd[R300_FP_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_CONFIG, 3);
+		r300->hw.fp.cmd[R300_FP_CMD_1] = cmdpacket0(r300->radeon.radeonScreen, R300_US_CODE_ADDR_0, 4);
+
 		ALLOC_STATE(fpt, variable, R300_FPT_CMDSIZE, 0);
-		r300->hw.fpt.cmd[R300_FPT_CMD_0] = cmdpacket0(R300_US_TEX_INST_0, 0);
+		r300->hw.fpt.cmd[R300_FPT_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_TEX_INST_0, 0);
 
 		ALLOC_STATE(fpi[0], variable, R300_FPI_CMDSIZE, 0);
-		r300->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, 1);
+		r300->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_RGB_INST_0, 1);
 		ALLOC_STATE(fpi[1], variable, R300_FPI_CMDSIZE, 1);
-		r300->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, 1);
+		r300->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_RGB_ADDR_0, 1);
 		ALLOC_STATE(fpi[2], variable, R300_FPI_CMDSIZE, 2);
-		r300->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, 1);
+		r300->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_ALPHA_INST_0, 1);
 		ALLOC_STATE(fpi[3], variable, R300_FPI_CMDSIZE, 3);
-		r300->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, 1);
+		r300->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_ALPHA_ADDR_0, 1);
 		ALLOC_STATE(fpp, variable, R300_FPP_CMDSIZE, 0);
-		r300->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, 0);
+		r300->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_PFS_PARAM_0_X, 0);
 	}
 	ALLOC_STATE(fogs, always, R300_FOGS_CMDSIZE, 0);
-	r300->hw.fogs.cmd[R300_FOGS_CMD_0] = cmdpacket0(R300_FG_FOG_BLEND, 1);
+	r300->hw.fogs.cmd[R300_FOGS_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_FOG_BLEND, 1);
 	ALLOC_STATE(fogc, always, R300_FOGC_CMDSIZE, 0);
-	r300->hw.fogc.cmd[R300_FOGC_CMD_0] = cmdpacket0(R300_FG_FOG_COLOR_R, 3);
+	r300->hw.fogc.cmd[R300_FOGC_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_FOG_COLOR_R, 3);
 	ALLOC_STATE(at, always, R300_AT_CMDSIZE, 0);
-	r300->hw.at.cmd[R300_AT_CMD_0] = cmdpacket0(R300_FG_ALPHA_FUNC, 2);
+	r300->hw.at.cmd[R300_AT_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_ALPHA_FUNC, 2);
 	ALLOC_STATE(fg_depth_src, always, 2, 0);
-	r300->hw.fg_depth_src.cmd[0] = cmdpacket0(R300_FG_DEPTH_SRC, 1);
+	r300->hw.fg_depth_src.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_DEPTH_SRC, 1);
 	ALLOC_STATE(rb3d_cctl, always, 2, 0);
-	r300->hw.rb3d_cctl.cmd[0] = cmdpacket0(R300_RB3D_CCTL, 1);
+	r300->hw.rb3d_cctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_CCTL, 1);
 	ALLOC_STATE(bld, always, R300_BLD_CMDSIZE, 0);
-	r300->hw.bld.cmd[R300_BLD_CMD_0] = cmdpacket0(R300_RB3D_CBLEND, 2);
+	r300->hw.bld.cmd[R300_BLD_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_CBLEND, 2);
 	ALLOC_STATE(cmk, always, R300_CMK_CMDSIZE, 0);
-	r300->hw.cmk.cmd[R300_CMK_CMD_0] = cmdpacket0(RB3D_COLOR_CHANNEL_MASK, 1);
+	r300->hw.cmk.cmd[R300_CMK_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, RB3D_COLOR_CHANNEL_MASK, 1);
 	if (is_r500) {
 		ALLOC_STATE(blend_color, always, 3, 0);
-		r300->hw.blend_color.cmd[0] = cmdpacket0(R500_RB3D_CONSTANT_COLOR_AR, 2);
+		r300->hw.blend_color.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_RB3D_CONSTANT_COLOR_AR, 2);
 	} else {
 		ALLOC_STATE(blend_color, always, 2, 0);
-		r300->hw.blend_color.cmd[0] = cmdpacket0(R300_RB3D_BLEND_COLOR, 1);
+		r300->hw.blend_color.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_BLEND_COLOR, 1);
 	}
 	ALLOC_STATE(rop, always, 2, 0);
-	r300->hw.rop.cmd[0] = cmdpacket0(R300_RB3D_ROPCNTL, 1);
-	ALLOC_STATE(cb, always, R300_CB_CMDSIZE, 0);
-	r300->hw.cb.cmd[R300_CB_CMD_0] = cmdpacket0(R300_RB3D_COLOROFFSET0, 1);
-	r300->hw.cb.cmd[R300_CB_CMD_1] = cmdpacket0(R300_RB3D_COLORPITCH0, 1);
+	r300->hw.rop.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_ROPCNTL, 1);
+	ALLOC_STATE(cb, cb_offset, R300_CB_CMDSIZE, 0);
+	r300->hw.cb.emit = &emit_cb_offset;
 	ALLOC_STATE(rb3d_dither_ctl, always, 10, 0);
-	r300->hw.rb3d_dither_ctl.cmd[0] = cmdpacket0(R300_RB3D_DITHER_CTL, 9);
+	r300->hw.rb3d_dither_ctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_DITHER_CTL, 9);
 	ALLOC_STATE(rb3d_aaresolve_ctl, always, 2, 0);
-	r300->hw.rb3d_aaresolve_ctl.cmd[0] = cmdpacket0(R300_RB3D_AARESOLVE_CTL, 1);
-	ALLOC_STATE(rb3d_discard_src_pixel_lte_threshold, always, 3, 0);
-	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[0] = cmdpacket0(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 2);
+	r300->hw.rb3d_aaresolve_ctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_AARESOLVE_CTL, 1);
+	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV350) {
+		ALLOC_STATE(rb3d_discard_src_pixel_lte_threshold, always, 3, 0);
+	} else {
+		ALLOC_STATE(rb3d_discard_src_pixel_lte_threshold, never, 3, 0);
+	}
+	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 2);
 	ALLOC_STATE(zs, always, R300_ZS_CMDSIZE, 0);
 	r300->hw.zs.cmd[R300_ZS_CMD_0] =
-	    cmdpacket0(R300_ZB_CNTL, 3);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_ZB_CNTL, 3);
+	if (is_r500) {
+		if (r300->radeon.radeonScreen->kernel_mm)
+			ALLOC_STATE(zsb, always, R300_ZSB_CMDSIZE, 0);
+		else
+			ALLOC_STATE(zsb, never, R300_ZSB_CMDSIZE, 0);
+		r300->hw.zsb.cmd[R300_ZSB_CMD_0] =
+			cmdpacket0(r300->radeon.radeonScreen, R500_ZB_STENCILREFMASK_BF, 1);
+	}
+
 	ALLOC_STATE(zstencil_format, always, 5, 0);
 	r300->hw.zstencil_format.cmd[0] =
-	    cmdpacket0(R300_ZB_FORMAT, 4);
-	ALLOC_STATE(zb, always, R300_ZB_CMDSIZE, 0);
-	r300->hw.zb.cmd[R300_ZB_CMD_0] = cmdpacket0(R300_ZB_DEPTHOFFSET, 2);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_ZB_FORMAT, 4);
+	r300->hw.zstencil_format.emit = emit_zstencil_format;
+
+	ALLOC_STATE(zb, zb_offset, R300_ZB_CMDSIZE, 0);
+	r300->hw.zb.emit = emit_zb_offset;
 	ALLOC_STATE(zb_depthclearvalue, always, 2, 0);
-	r300->hw.zb_depthclearvalue.cmd[0] = cmdpacket0(R300_ZB_DEPTHCLEARVALUE, 1);
-	ALLOC_STATE(unk4F30, always, 3, 0);
-	r300->hw.unk4F30.cmd[0] = cmdpacket0(0x4F30, 2);
+	r300->hw.zb_depthclearvalue.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_DEPTHCLEARVALUE, 1);
+	ALLOC_STATE(zb_zmask, always, 3, 0);
+	r300->hw.zb_zmask.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_ZMASK_OFFSET, 2);
 	ALLOC_STATE(zb_hiz_offset, always, 2, 0);
-	r300->hw.zb_hiz_offset.cmd[0] = cmdpacket0(R300_ZB_HIZ_OFFSET, 1);
+	r300->hw.zb_hiz_offset.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_HIZ_OFFSET, 1);
 	ALLOC_STATE(zb_hiz_pitch, always, 2, 0);
-	r300->hw.zb_hiz_pitch.cmd[0] = cmdpacket0(R300_ZB_HIZ_PITCH, 1);
+	r300->hw.zb_hiz_pitch.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_HIZ_PITCH, 1);
 
 	/* VPU only on TCL */
 	if (has_tcl) {
-   	        int i;
+		int i;
+		if (r300->radeon.radeonScreen->kernel_mm) {
+			ALLOC_STATE(vap_flush, always, 10, 0);
+			/* flush processing vertices */
+			r300->hw.vap_flush.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SC_SCREENDOOR, 1);
+			r300->hw.vap_flush.cmd[1] = 0;
+			r300->hw.vap_flush.cmd[2] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_DSTCACHE_CTLSTAT, 1);
+			r300->hw.vap_flush.cmd[3] = R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D;
+			r300->hw.vap_flush.cmd[4] = cmdpacket0(r300->radeon.radeonScreen, RADEON_WAIT_UNTIL, 1);
+			r300->hw.vap_flush.cmd[5] = RADEON_WAIT_3D_IDLECLEAN;
+			r300->hw.vap_flush.cmd[6] = cmdpacket0(r300->radeon.radeonScreen, R300_SC_SCREENDOOR, 1);
+			r300->hw.vap_flush.cmd[7] = 0xffffff;
+			r300->hw.vap_flush.cmd[8] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PVS_STATE_FLUSH_REG, 1);
+			r300->hw.vap_flush.cmd[9] = 0;
+		} else {
+			ALLOC_STATE(vap_flush, never, 10, 0);
+		}
+
+
 		ALLOC_STATE(vpi, vpu, R300_VPI_CMDSIZE, 0);
-		r300->hw.vpi.cmd[R300_VPI_CMD_0] =
-		    cmdvpu(R300_PVS_CODE_START, 0);
+		r300->hw.vpi.cmd[0] =
+			cmdvpu(r300->radeon.radeonScreen, R300_PVS_CODE_START, 0);
+		if (r300->radeon.radeonScreen->kernel_mm)
+			r300->hw.vpi.emit = emit_vpu;
 
 		if (is_r500) {
-		    ALLOC_STATE(vpp, vpu, R300_VPP_CMDSIZE, 0);
-		    r300->hw.vpp.cmd[R300_VPP_CMD_0] =
-			cmdvpu(R500_PVS_CONST_START, 0);
-
-		    ALLOC_STATE(vps, vpu, R300_VPS_CMDSIZE, 0);
-		    r300->hw.vps.cmd[R300_VPS_CMD_0] =
-			cmdvpu(R500_POINT_VPORT_SCALE_OFFSET, 1);
+			ALLOC_STATE(vpp, vpu, R300_VPP_CMDSIZE, 0);
+			r300->hw.vpp.cmd[0] =
+				cmdvpu(r300->radeon.radeonScreen, R500_PVS_CONST_START, 0);
+			if (r300->radeon.radeonScreen->kernel_mm)
+				r300->hw.vpp.emit = emit_vpu;
+
+			ALLOC_STATE(vps, vpu, R300_VPS_CMDSIZE, 0);
+			r300->hw.vps.cmd[0] =
+				cmdvpu(r300->radeon.radeonScreen, R500_POINT_VPORT_SCALE_OFFSET, 1);
+			if (r300->radeon.radeonScreen->kernel_mm)
+				r300->hw.vps.emit = emit_vpu;
 
 			for (i = 0; i < 6; i++) {
 				ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
-				r300->hw.vpucp[i].cmd[R300_VPUCP_CMD_0] =
-					cmdvpu(R500_PVS_UCP_START + i, 1);
+				r300->hw.vpucp[i].cmd[0] =
+					cmdvpu(r300->radeon.radeonScreen,
+							R500_PVS_UCP_START + i, 1);
+				if (r300->radeon.radeonScreen->kernel_mm)
+					r300->hw.vpucp[i].emit = emit_vpu;
 			}
 		} else {
-		    ALLOC_STATE(vpp, vpu, R300_VPP_CMDSIZE, 0);
-		    r300->hw.vpp.cmd[R300_VPP_CMD_0] =
-			cmdvpu(R300_PVS_CONST_START, 0);
-
-		    ALLOC_STATE(vps, vpu, R300_VPS_CMDSIZE, 0);
-		    r300->hw.vps.cmd[R300_VPS_CMD_0] =
-			cmdvpu(R300_POINT_VPORT_SCALE_OFFSET, 1);
+			ALLOC_STATE(vpp, vpu, R300_VPP_CMDSIZE, 0);
+			r300->hw.vpp.cmd[0] =
+				cmdvpu(r300->radeon.radeonScreen, R300_PVS_CONST_START, 0);
+			if (r300->radeon.radeonScreen->kernel_mm)
+				r300->hw.vpp.emit = emit_vpu;
+
+			ALLOC_STATE(vps, vpu, R300_VPS_CMDSIZE, 0);
+			r300->hw.vps.cmd[0] =
+				cmdvpu(r300->radeon.radeonScreen, R300_POINT_VPORT_SCALE_OFFSET, 1);
+			if (r300->radeon.radeonScreen->kernel_mm)
+				r300->hw.vps.emit = emit_vpu;
 
 			for (i = 0; i < 6; i++) {
 				ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
-				r300->hw.vpucp[i].cmd[R300_VPUCP_CMD_0] =
-					cmdvpu(R300_PVS_UCP_START + i, 1);
+				r300->hw.vpucp[i].cmd[0] =
+					cmdvpu(r300->radeon.radeonScreen,
+							R300_PVS_UCP_START + i, 1);
+				if (r300->radeon.radeonScreen->kernel_mm)
+					r300->hw.vpucp[i].emit = emit_vpu;
 			}
 		}
 	}
@@ -546,130 +793,48 @@ void r300InitCmdBuf(r300ContextPtr r300)
 	/* Textures */
 	ALLOC_STATE(tex.filter, variable, mtu + 1, 0);
 	r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FILTER0_0, 0);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER0_0, 0);
 
 	ALLOC_STATE(tex.filter_1, variable, mtu + 1, 0);
 	r300->hw.tex.filter_1.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FILTER1_0, 0);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER1_0, 0);
 
 	ALLOC_STATE(tex.size, variable, mtu + 1, 0);
-	r300->hw.tex.size.cmd[R300_TEX_CMD_0] = cmdpacket0(R300_TX_SIZE_0, 0);
+	r300->hw.tex.size.cmd[R300_TEX_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_TX_SIZE_0, 0);
 
 	ALLOC_STATE(tex.format, variable, mtu + 1, 0);
 	r300->hw.tex.format.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FORMAT_0, 0);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT_0, 0);
 
 	ALLOC_STATE(tex.pitch, variable, mtu + 1, 0);
-	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] = cmdpacket0(R300_TX_FORMAT2_0, 0);
+	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT2_0, 0);
 
-	ALLOC_STATE(tex.offset, variable, mtu + 1, 0);
+	ALLOC_STATE(tex.offset, tex_offsets, 1, 0);
 	r300->hw.tex.offset.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_OFFSET_0, 0);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_OFFSET_0, 0);
+	r300->hw.tex.offset.emit = &emit_tex_offsets;
 
 	ALLOC_STATE(tex.chroma_key, variable, mtu + 1, 0);
 	r300->hw.tex.chroma_key.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_CHROMA_KEY_0, 0);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_CHROMA_KEY_0, 0);
 
 	ALLOC_STATE(tex.border_color, variable, mtu + 1, 0);
 	r300->hw.tex.border_color.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_BORDER_COLOR_0, 0);
-
-	r300->hw.is_dirty = GL_TRUE;
-	r300->hw.all_dirty = GL_TRUE;
-
-	/* Initialize command buffer */
-	size =
-	    256 * driQueryOptioni(&r300->radeon.optionCache,
-				  "command_buffer_size");
-	if (size < 2 * r300->hw.max_state_size) {
-		size = 2 * r300->hw.max_state_size + 65535;
-	}
-	if (size > 64 * 256)
-		size = 64 * 256;
-
-	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA)) {
-		fprintf(stderr, "sizeof(drm_r300_cmd_header_t)=%zd\n",
-			sizeof(drm_r300_cmd_header_t));
-		fprintf(stderr, "sizeof(drm_radeon_cmd_buffer_t)=%zd\n",
-			sizeof(drm_radeon_cmd_buffer_t));
-		fprintf(stderr,
-			"Allocating %d bytes command buffer (max state is %d bytes)\n",
-			size * 4, r300->hw.max_state_size * 4);
-	}
-
-	r300->cmdbuf.size = size;
-	r300->cmdbuf.cmd_buf = (uint32_t *) CALLOC(size * 4);
-	r300->cmdbuf.count_used = 0;
-	r300->cmdbuf.count_reemit = 0;
-}
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_BORDER_COLOR_0, 0);
 
-/**
- * Destroy the command buffer and state atoms.
- */
-void r300DestroyCmdBuf(r300ContextPtr r300)
-{
-	struct r300_state_atom *atom;
-
-	FREE(r300->cmdbuf.cmd_buf);
-
-	foreach(atom, &r300->hw.atomlist) {
-		FREE(atom->cmd);
+	radeon_init_query_stateobj(&r300->radeon, R300_QUERYOBJ_CMDSIZE);
+	if (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV530) {
+		r300->radeon.query.queryobj.cmd[R300_QUERYOBJ_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, RV530_FG_ZBREG_DEST, 1);
+		r300->radeon.query.queryobj.cmd[R300_QUERYOBJ_DATA_0] = RV530_FG_ZBREG_DEST_PIPE_SELECT_ALL;
+	} else {
+		r300->radeon.query.queryobj.cmd[R300_QUERYOBJ_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_REG_DEST, 1);
+		r300->radeon.query.queryobj.cmd[R300_QUERYOBJ_DATA_0] = R300_RASTER_PIPE_SELECT_ALL;
 	}
-}
-
-void r300EmitBlit(r300ContextPtr rmesa,
-		  GLuint color_fmt,
-		  GLuint src_pitch,
-		  GLuint src_offset,
-		  GLuint dst_pitch,
-		  GLuint dst_offset,
-		  GLint srcx, GLint srcy,
-		  GLint dstx, GLint dsty, GLuint w, GLuint h)
-{
-	drm_r300_cmd_header_t *cmd;
-
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr,
-			"%s src %x/%x %d,%d dst: %x/%x %d,%d sz: %dx%d\n",
-			__FUNCTION__, src_pitch, src_offset, srcx, srcy,
-			dst_pitch, dst_offset, dstx, dsty, w, h);
-
-	assert((src_pitch & 63) == 0);
-	assert((dst_pitch & 63) == 0);
-	assert((src_offset & 1023) == 0);
-	assert((dst_offset & 1023) == 0);
-	assert(w < (1 << 16));
-	assert(h < (1 << 16));
-
-	cmd = (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa, 8, __FUNCTION__);
-
-	cmd[0].header.cmd_type = R300_CMD_PACKET3;
-	cmd[0].header.pad0 = R300_CMD_PACKET3_RAW;
-	cmd[1].u = R300_CP_CMD_BITBLT_MULTI | (5 << 16);
-	cmd[2].u = (RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
-		    RADEON_GMC_DST_PITCH_OFFSET_CNTL |
-		    RADEON_GMC_BRUSH_NONE |
-		    (color_fmt << 8) |
-		    RADEON_GMC_SRC_DATATYPE_COLOR |
-		    RADEON_ROP3_S |
-		    RADEON_DP_SRC_SOURCE_MEMORY |
-		    RADEON_GMC_CLR_CMP_CNTL_DIS | RADEON_GMC_WR_MSK_DIS);
-
-	cmd[3].u = ((src_pitch / 64) << 22) | (src_offset >> 10);
-	cmd[4].u = ((dst_pitch / 64) << 22) | (dst_offset >> 10);
-	cmd[5].u = (srcx << 16) | srcy;
-	cmd[6].u = (dstx << 16) | dsty;	/* dst */
-	cmd[7].u = (w << 16) | h;
-}
-
-void r300EmitWait(r300ContextPtr rmesa, GLuint flags)
-{
-	drm_r300_cmd_header_t *cmd;
+	r300->radeon.query.queryobj.cmd[R300_QUERYOBJ_CMD_1] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_ZPASS_DATA, 1);
+	r300->radeon.query.queryobj.cmd[R300_QUERYOBJ_DATA_1] = 0;
 
-	assert(!(flags & ~(R300_WAIT_2D | R300_WAIT_3D)));
+	r300->radeon.hw.is_dirty = GL_TRUE;
+	r300->radeon.hw.all_dirty = GL_TRUE;
 
-	cmd = (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
-	cmd[0].u = 0;
-	cmd[0].wait.cmd_type = R300_CMD_WAIT;
-	cmd[0].wait.flags = flags;
+	rcommonInitCmdBuf(&r300->radeon);
 }
diff --git a/src/mesa/drivers/dri/r300/r300_cmdbuf.h b/src/mesa/drivers/dri/r300/r300_cmdbuf.h
index a8eaa580bd..1b703e518a 100644
--- a/src/mesa/drivers/dri/r300/r300_cmdbuf.h
+++ b/src/mesa/drivers/dri/r300/r300_cmdbuf.h
@@ -38,79 +38,20 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "r300_context.h"
 
-extern int r300FlushCmdBufLocked(r300ContextPtr r300, const char *caller);
-extern int r300FlushCmdBuf(r300ContextPtr r300, const char *caller);
-
-extern void r300EmitState(r300ContextPtr r300);
+#define CACHE_FLUSH_BUFSZ      (4*2) /* presumably command-buffer space reservations for the named operations -- TODO confirm units */
+#define PRE_EMIT_STATE_BUFSZ   (2+2)
+#define AOS_BUFSZ(nr)          (3+((nr)>>1)*3 + ((nr)&1)*2 + ((nr)*2)) /* nr is parenthesized so expression arguments such as (a + b) expand correctly */
+#define FIREAOS_BUFSZ          (3)
+#define SCISSORS_BUFSZ         (3)
 
 extern void r300InitCmdBuf(r300ContextPtr r300);
-extern void r300DestroyCmdBuf(r300ContextPtr r300);
-
-/**
- * Make sure that enough space is available in the command buffer
- * by flushing if necessary.
- *
- * \param dwords The number of dwords we need to be free on the command buffer
- */
-static INLINE void r300EnsureCmdBufSpace(r300ContextPtr r300,
-					     int dwords, const char *caller)
-{
-	assert(dwords < r300->cmdbuf.size);
-
-	if (r300->cmdbuf.count_used + dwords > r300->cmdbuf.size)
-		r300FlushCmdBuf(r300, caller);
-}
-
-/**
- * Allocate the given number of dwords in the command buffer and return
- * a pointer to the allocated area.
- * When necessary, these functions cause a flush. r300AllocCmdBuf() also
- * causes state reemission after a flush. This is necessary to ensure
- * correct hardware state after an unlock.
- */
-static INLINE uint32_t *r300RawAllocCmdBuf(r300ContextPtr r300,
-					       int dwords, const char *caller)
-{
-	uint32_t *ptr;
-
-	r300EnsureCmdBufSpace(r300, dwords, caller);
-
-	ptr = &r300->cmdbuf.cmd_buf[r300->cmdbuf.count_used];
-	r300->cmdbuf.count_used += dwords;
-	return ptr;
-}
-
-static INLINE uint32_t *r300AllocCmdBuf(r300ContextPtr r300,
-					    int dwords, const char *caller)
-{
-	uint32_t *ptr;
-
-	r300EnsureCmdBufSpace(r300, dwords, caller);
-
-	if (!r300->cmdbuf.count_used) {
-		if (RADEON_DEBUG & DEBUG_IOCTL)
-			fprintf(stderr,
-				"Reemit state after flush (from %s)\n", caller);
-		r300EmitState(r300);
-	}
-
-	ptr = &r300->cmdbuf.cmd_buf[r300->cmdbuf.count_used];
-	r300->cmdbuf.count_used += dwords;
-	return ptr;
-}
+void r300_emit_scissor(GLcontext *ctx);
 
-extern void r300EmitBlit(r300ContextPtr rmesa,
-			 GLuint color_fmt,
-			 GLuint src_pitch,
-			 GLuint src_offset,
-			 GLuint dst_pitch,
-			 GLuint dst_offset,
-			 GLint srcx, GLint srcy,
-			 GLint dstx, GLint dsty, GLuint w, GLuint h);
+void emit_vpu(GLcontext *ctx, struct radeon_state_atom * atom); /* installed as the .emit hook of the vpi/vpp/vps/vpucp atoms when kernel_mm is set */
+int check_vpu(GLcontext *ctx, struct radeon_state_atom *atom); /* .check hook selected by ALLOC_STATE(..., vpu, ...) */
 
-extern void r300EmitWait(r300ContextPtr rmesa, GLuint flags);
-extern void r300EmitLOAD_VBPNTR(r300ContextPtr rmesa, int start);
-extern void r300EmitVertexShader(r300ContextPtr rmesa);
-extern void r300EmitPixelShader(r300ContextPtr rmesa);
+void emit_r500fp(GLcontext *ctx, struct radeon_state_atom * atom); /* installed as the .emit hook of the r500fp and r500fp_const atoms when kernel_mm is set */
+int check_r500fp(GLcontext *ctx, struct radeon_state_atom *atom); /* .check hook of the r500fp atom: 0, or r500fp_count() * 6 plus a fixed overhead */
+int check_r500fp_const(GLcontext *ctx, struct radeon_state_atom *atom); /* .check hook of the r500fp_const atom: 0, or r500fp_count() * 4 plus a fixed overhead */
 
 #endif				/* __R300_CMDBUF_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index 37436275e3..6fcf209af6 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -44,6 +44,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/extensions.h"
 #include "main/state.h"
 #include "main/bufferobj.h"
+#include "main/texobj.h"
 
 #include "swrast/swrast.h"
 #include "swrast_setup/swrast_setup.h"
@@ -55,70 +56,68 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "drivers/common/driverfuncs.h"
 
-#include "radeon_ioctl.h"
-#include "radeon_span.h"
 #include "r300_context.h"
+#include "radeon_context.h"
+#include "radeon_span.h"
 #include "r300_cmdbuf.h"
 #include "r300_state.h"
 #include "r300_ioctl.h"
 #include "r300_tex.h"
 #include "r300_emit.h"
 #include "r300_swtcl.h"
-
-#ifdef USER_BUFFERS
-#include "r300_mem.h"
-#endif
+#include "radeon_bocs_wrapper.h"
+#include "radeon_buffer_objects.h"
+#include "radeon_queryobj.h"
 
 #include "vblank.h"
 #include "utils.h"
 #include "xmlpool.h"		/* for symbolic values of enum-type options */
 
-/* hw_tcl_on derives from future_hw_tcl_on when its safe to change it. */
-int future_hw_tcl_on = 1;
-int hw_tcl_on = 1;
-
-#define need_GL_EXT_stencil_two_side
-#define need_GL_ARB_multisample
+#define need_GL_VERSION_2_0
+#define need_GL_ARB_occlusion_query
 #define need_GL_ARB_point_parameters
-#define need_GL_ARB_texture_compression
-#define need_GL_ARB_vertex_buffer_object
 #define need_GL_ARB_vertex_program
-#define need_GL_EXT_blend_minmax
-//#define need_GL_EXT_fog_coord
-#define need_GL_EXT_multi_draw_arrays
-#define need_GL_EXT_secondary_color
 #define need_GL_EXT_blend_equation_separate
 #define need_GL_EXT_blend_func_separate
+#define need_GL_EXT_blend_minmax
+#define need_GL_EXT_framebuffer_blit
+#define need_GL_EXT_framebuffer_object
+#define need_GL_EXT_fog_coord
 #define need_GL_EXT_gpu_program_parameters
+#define need_GL_EXT_provoking_vertex
+#define need_GL_EXT_secondary_color
+#define need_GL_EXT_stencil_two_side
+#define need_GL_ATI_separate_stencil
 #define need_GL_NV_vertex_program
+
 #include "extension_helper.h"
 
+
 const struct dri_extension card_extensions[] = {
   /* *INDENT-OFF* */
   {"GL_ARB_depth_texture",		NULL},
   {"GL_ARB_fragment_program",		NULL},
-  {"GL_ARB_multisample",		GL_ARB_multisample_functions},
+  {"GL_ARB_occlusion_query",		GL_ARB_occlusion_query_functions},
   {"GL_ARB_multitexture",		NULL},
   {"GL_ARB_point_parameters",		GL_ARB_point_parameters_functions},
   {"GL_ARB_shadow",			NULL},
   {"GL_ARB_shadow_ambient",		NULL},
   {"GL_ARB_texture_border_clamp",	NULL},
-  {"GL_ARB_texture_compression",	GL_ARB_texture_compression_functions},
   {"GL_ARB_texture_cube_map",		NULL},
   {"GL_ARB_texture_env_add",		NULL},
   {"GL_ARB_texture_env_combine",	NULL},
   {"GL_ARB_texture_env_crossbar",	NULL},
   {"GL_ARB_texture_env_dot3",		NULL},
   {"GL_ARB_texture_mirrored_repeat",	NULL},
-  {"GL_ARB_vertex_buffer_object",	GL_ARB_vertex_buffer_object_functions},
   {"GL_ARB_vertex_program",		GL_ARB_vertex_program_functions},
   {"GL_EXT_blend_equation_separate",	GL_EXT_blend_equation_separate_functions},
   {"GL_EXT_blend_func_separate",	GL_EXT_blend_func_separate_functions},
   {"GL_EXT_blend_minmax",		GL_EXT_blend_minmax_functions},
   {"GL_EXT_blend_subtract",		NULL},
-//  {"GL_EXT_fog_coord",			GL_EXT_fog_coord_functions },
-  {"GL_EXT_multi_draw_arrays",		GL_EXT_multi_draw_arrays_functions},
+  {"GL_EXT_packed_depth_stencil",	NULL},
+  {"GL_EXT_fog_coord",			GL_EXT_fog_coord_functions },
   {"GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions},
+  {"GL_EXT_provoking_vertex",           GL_EXT_provoking_vertex_functions },
   {"GL_EXT_secondary_color", 		GL_EXT_secondary_color_functions},
   {"GL_EXT_shadow_funcs",		NULL},
   {"GL_EXT_stencil_two_side",		GL_EXT_stencil_two_side_functions},
@@ -130,6 +129,9 @@ const struct dri_extension card_extensions[] = {
   {"GL_EXT_texture_lod_bias",		NULL},
   {"GL_EXT_texture_mirror_clamp",	NULL},
   {"GL_EXT_texture_rectangle",		NULL},
+  {"GL_EXT_texture_sRGB",		NULL},
+  {"GL_EXT_vertex_array_bgra",		NULL},
+  {"GL_ATI_separate_stencil",		GL_ATI_separate_stencil_functions},
   {"GL_ATI_texture_env_combine3",	NULL},
   {"GL_ATI_texture_mirror_once",	NULL},
   {"GL_MESA_pack_invert",		NULL},
@@ -142,15 +144,22 @@ const struct dri_extension card_extensions[] = {
   /* *INDENT-ON* */
 };
 
-extern struct tnl_pipeline_stage _r300_render_stage;
-extern const struct tnl_pipeline_stage _r300_tcl_stage;
 
-static const struct tnl_pipeline_stage *r300_pipeline[] = {
+const struct dri_extension mm_extensions[] = { /* passed to driInitExtensions() only when the screen reports kernel_mm */
+  { "GL_EXT_framebuffer_blit",	GL_EXT_framebuffer_blit_functions },
+  { "GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions },
+  { NULL, NULL } /* presumably the end-of-list marker */
+};
 
-	/* Try and go straight to t&l
-	 */
-	&_r300_tcl_stage,
+/**
+ * The GL 2.0 functions are needed to make display lists work with
+ * functions added by GL_ATI_separate_stencil.
+ */
+const struct dri_extension gl_20_extension[] = { /* NOTE(review): no { NULL, NULL } entry, unlike mm_extensions -- confirm no caller scans for one */
+  {"GL_VERSION_2_0",			GL_VERSION_2_0_functions },
+};
 
+static const struct tnl_pipeline_stage *r300_pipeline[] = {
 	/* Catch any t&l fallbacks
 	 */
 	&_tnl_vertex_transform_stage,
@@ -159,133 +168,187 @@ static const struct tnl_pipeline_stage *r300_pipeline[] = {
 	&_tnl_fog_coordinate_stage,
 	&_tnl_texgen_stage,
 	&_tnl_texture_transform_stage,
+	&_tnl_point_attenuation_stage,
 	&_tnl_vertex_program_stage,
-
-	/* Try again to go to tcl?
-	 *     - no good for asymmetric-twoside (do with multipass)
-	 *     - no good for asymmetric-unfilled (do with multipass)
-	 *     - good for material
-	 *     - good for texgen
-	 *     - need to manipulate a bit of state
-	 *
-	 * - worth it/not worth it?
-	 */
-
-	/* Else do them here.
-	 */
-	&_r300_render_stage,
-	&_tnl_render_stage,	/* FALLBACK  */
+	&_tnl_render_stage,
 	0,
 };
 
-/* Create the device specific rendering context.
- */
-GLboolean r300CreateContext(const __GLcontextModes * glVisual,
-			    __DRIcontextPrivate * driContextPriv,
-			    void *sharedContextPrivate)
+static void r300_get_lock(radeonContextPtr rmesa)
 {
-	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
-	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
-	struct dd_function_table functions;
-	r300ContextPtr r300;
-	GLcontext *ctx;
-	int tcl_mode, i;
-
-	assert(glVisual);
-	assert(driContextPriv);
-	assert(screen);
+	drm_radeon_sarea_t *sarea = rmesa->sarea; /* SAREA field that records which hardware context is the current owner */
 
-	/* Allocate the R300 context */
-	r300 = (r300ContextPtr) CALLOC(sizeof(*r300));
-	if (!r300)
-		return GL_FALSE;
+	if (sarea->ctx_owner != rmesa->dri.hwContext) { /* the SAREA names a different context as owner */
+		sarea->ctx_owner = rmesa->dri.hwContext; /* record this context as the owner */
+		if (!rmesa->radeonScreen->kernel_mm) /* texture ageing is only done without the kernel memory manager */
+			radeon_bo_legacy_texture_age(rmesa->radeonScreen->bom);
+	}
+}
 
-	if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
-		hw_tcl_on = future_hw_tcl_on = 0;
+static void r300_vtbl_emit_cs_header(struct radeon_cs *cs, radeonContextPtr rmesa)
+{
+    /* Write a fixed preamble into cs that asks the hardware to flush the pipe and finish all pending work. */
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_SC_SCREENDOOR, 1));
+    radeon_cs_write_dword(cs, 0x0); /* presumably the value for the register named by the preceding cmdpacket0() header -- confirm */
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_SC_SCREENDOOR, 1));
+    radeon_cs_write_dword(cs, 0x00FFFFFF); /* same register again, this time with 0x00FFFFFF */
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_SC_HYPERZ, 1));
+    radeon_cs_write_dword(cs, 0x0);
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_US_CONFIG, 1));
+    radeon_cs_write_dword(cs, 0x0);
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_ZB_CNTL, 1));
+    radeon_cs_write_dword(cs, 0x0);
+    radeon_cs_write_dword(cs, cmdwait(rmesa->radeonScreen, R300_WAIT_3D)); /* wait packet placed before the two cache-flush writes */
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_RB3D_DSTCACHE_CTLSTAT, 1));
+    radeon_cs_write_dword(cs, R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_ZB_ZCACHE_CTLSTAT, 1));
+    radeon_cs_write_dword(cs, R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE);
+    radeon_cs_write_dword(cs, cmdwait(rmesa->radeonScreen, /* final wait packet, placed after the cache-flush writes */
+                               R300_WAIT_3D | R300_WAIT_3D_CLEAN));
+}
 
-	/* Parse configuration files.
-	 * Do this here so that initialMaxAnisotropy is set before we create
-	 * the default textures.
-	 */
-	driParseConfigFiles(&r300->radeon.optionCache, &screen->optionCache,
-			    screen->driScreen->myNum, "r300");
-	r300->initialMaxAnisotropy = driQueryOptionf(&r300->radeon.optionCache,
-						     "def_max_anisotropy");
+static void r300_vtbl_pre_emit_atoms(radeonContextPtr radeon)
+{
+	BATCH_LOCALS(radeon);
+	/* Issue a 3D wait, write R300_TX_FLUSH to R300_TX_INVALTAGS, then call end_3d(). */
+	cp_wait(radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+	BEGIN_BATCH_NO_AUTOSTATE(2);
+	OUT_BATCH_REGVAL(R300_TX_INVALTAGS, R300_TX_FLUSH);
+	END_BATCH();
+	end_3d(radeon);
+}
 
-#ifdef USER_BUFFERS
-	r300_mem_init(r300);
-#endif
+static void r300_fallback(GLcontext *ctx, GLuint bit, GLboolean mode)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	if (mode) /* a true mode sets the given bit in radeon.Fallback ... */
+		r300->radeon.Fallback |= bit;
+	else /* ... and a false mode clears it */
+		r300->radeon.Fallback &= ~bit;
+}
 
-#ifdef USER_BUFFERS
-	r300_mem_init(r300);
-#endif
+static void r300_fallback(GLcontext *ctx, GLuint bit, GLboolean mode)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	if (mode)
+		r300->radeon.Fallback |= bit;
+	else
+		r300->radeon.Fallback &= ~bit;
+}
 
-	if (!radeonInitContext(&r300->radeon, &functions,
-			       glVisual, driContextPriv,
-			       sharedContextPrivate)) {
-		FREE(r300);
-		return GL_FALSE;
+static void r300_emit_query_finish(radeonContextPtr radeon)
+{
+	r300ContextPtr r300 = (r300ContextPtr)radeon;
+	struct radeon_query_object *query = radeon->query.current;
+	BATCH_LOCALS(radeon);
+	/* Per raster pipe: select it via R300_SU_REG_DEST, then point R300_ZB_ZPASS_ADDR at that pipe's dword in query->bo. */
+	BEGIN_BATCH_NO_AUTOSTATE(3 * 2 *r300->radeon.radeonScreen->num_gb_pipes + 2); /* batch size scales with the pipe count */
+	switch (r300->radeon.radeonScreen->num_gb_pipes) {
+	case 4:
+		OUT_BATCH_REGVAL(R300_SU_REG_DEST, R300_RASTER_PIPE_SELECT_3);
+		OUT_BATCH_REGSEQ(R300_ZB_ZPASS_ADDR, 1);
+		OUT_BATCH_RELOC(0, query->bo, query->curr_offset+3*sizeof(uint32_t), 0, RADEON_GEM_DOMAIN_GTT, 0); /* no break: continues into case 3 */
+	case 3:
+		OUT_BATCH_REGVAL(R300_SU_REG_DEST, R300_RASTER_PIPE_SELECT_2);
+		OUT_BATCH_REGSEQ(R300_ZB_ZPASS_ADDR, 1);
+		OUT_BATCH_RELOC(0, query->bo, query->curr_offset+2*sizeof(uint32_t), 0, RADEON_GEM_DOMAIN_GTT, 0); /* no break: continues into case 2 */
+	case 2:
+		if (r300->radeon.radeonScreen->chip_family <= CHIP_FAMILY_RV380) { /* NOTE(review): these chips select pipe 3, not pipe 1, for the second slot -- reason not visible here */
+			OUT_BATCH_REGVAL(R300_SU_REG_DEST, R300_RASTER_PIPE_SELECT_3);
+		} else {
+			OUT_BATCH_REGVAL(R300_SU_REG_DEST, R300_RASTER_PIPE_SELECT_1);
+		}
+		OUT_BATCH_REGSEQ(R300_ZB_ZPASS_ADDR, 1);
+		OUT_BATCH_RELOC(0, query->bo, query->curr_offset+1*sizeof(uint32_t), 0, RADEON_GEM_DOMAIN_GTT, 0); /* no break: continues into case 1 */
+	case 1:
+	default:
+		OUT_BATCH_REGVAL(R300_SU_REG_DEST, R300_RASTER_PIPE_SELECT_0);
+		OUT_BATCH_REGSEQ(R300_ZB_ZPASS_ADDR, 1);
+		OUT_BATCH_RELOC(0, query->bo, query->curr_offset, 0, RADEON_GEM_DOMAIN_GTT, 0);
+		break;
 	}
+	OUT_BATCH_REGVAL(R300_SU_REG_DEST, R300_RASTER_PIPE_SELECT_ALL); /* go back to addressing all pipes */
+	END_BATCH();
+	query->curr_offset += r300->radeon.radeonScreen->num_gb_pipes * sizeof(uint32_t); /* advance by one dword per pipe */
+	assert(query->curr_offset < RADEON_QUERY_PAGE_SIZE);
+	query->emitted_begin = GL_FALSE;
+}
 
-	/* Init r300 context data */
-	r300->dma.buf0_address =
-	    r300->radeon.radeonScreen->buffers->list[0].address;
-
-	(void)memset(r300->texture_heaps, 0, sizeof(r300->texture_heaps));
-	make_empty_list(&r300->swapped);
-
-	r300->nr_heaps = 1 /* screen->numTexHeaps */ ;
-	assert(r300->nr_heaps < RADEON_NR_TEX_HEAPS);
-	for (i = 0; i < r300->nr_heaps; i++) {
-		/* *INDENT-OFF* */
-		r300->texture_heaps[i] = driCreateTextureHeap(i, r300,
-							       screen->
-							       texSize[i], 12,
-							       RADEON_NR_TEX_REGIONS,
-							       (drmTextureRegionPtr)
-							       r300->radeon.sarea->
-							       tex_list[i],
-							       &r300->radeon.sarea->
-							       tex_age[i],
-							       &r300->swapped,
-							       sizeof
-							       (r300TexObj),
-							       (destroy_texture_object_t
-								*)
-							       r300DestroyTexObj);
-		/* *INDENT-ON* */
-	}
-	r300->texture_depth = driQueryOptioni(&r300->radeon.optionCache,
-					      "texture_depth");
-	if (r300->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
-		r300->texture_depth = (screen->cpp == 4) ?
-		    DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
-
-	/* Set the maximum texture size small enough that we can guarentee that
-	 * all texture units can bind a maximal texture and have them both in
-	 * texturable memory at once.
-	 */
+static void rv530_emit_query_finish_single_z(radeonContextPtr radeon)
+{
+	BATCH_LOCALS(radeon);
+	struct radeon_query_object *query = radeon->query.current;
+	/* Select pipe 0 via RV530_FG_ZBREG_DEST, point R300_ZB_ZPASS_ADDR at query->bo + curr_offset, then reselect all pipes. */
+	BEGIN_BATCH_NO_AUTOSTATE(8);
+	OUT_BATCH_REGVAL(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_0);
+	OUT_BATCH_REGSEQ(R300_ZB_ZPASS_ADDR, 1);
+	OUT_BATCH_RELOC(0, query->bo, query->curr_offset, 0, RADEON_GEM_DOMAIN_GTT, 0);
+	OUT_BATCH_REGVAL(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_ALL);
+	END_BATCH();
+
+	query->curr_offset += sizeof(uint32_t); /* advance by the single dword addressed above */
+	assert(query->curr_offset < RADEON_QUERY_PAGE_SIZE);
+	query->emitted_begin = GL_FALSE;
+}
 
-	ctx = r300->radeon.glCtx;
+static void rv530_emit_query_finish_double_z(radeonContextPtr radeon)
+{
+	BATCH_LOCALS(radeon);
+	struct radeon_query_object *query = radeon->query.current;
+	/* Same sequence as the single-Z variant, repeated for pipe 0 and pipe 1 with consecutive dwords of query->bo. */
+	BEGIN_BATCH_NO_AUTOSTATE(14);
+	OUT_BATCH_REGVAL(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_0);
+	OUT_BATCH_REGSEQ(R300_ZB_ZPASS_ADDR, 1);
+	OUT_BATCH_RELOC(0, query->bo, query->curr_offset, 0, RADEON_GEM_DOMAIN_GTT, 0);
+	OUT_BATCH_REGVAL(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_1);
+	OUT_BATCH_REGSEQ(R300_ZB_ZPASS_ADDR, 1);
+	OUT_BATCH_RELOC(0, query->bo, query->curr_offset + sizeof(uint32_t), 0, RADEON_GEM_DOMAIN_GTT, 0);
+	OUT_BATCH_REGVAL(RV530_FG_ZBREG_DEST, RV530_FG_ZBREG_DEST_PIPE_SELECT_ALL);
+	END_BATCH();
+
+	query->curr_offset += 2 * sizeof(uint32_t); /* advance by the two dwords addressed above */
+	assert(query->curr_offset < RADEON_QUERY_PAGE_SIZE);
+	query->emitted_begin = GL_FALSE;
+}
+
+static void r300_init_vtbl(radeonContextPtr radeon)
+{ /* Fill radeon->vtbl with this driver's callbacks; reads radeon->radeonScreen, so that field must already be set. */
+	radeon->vtbl.get_lock = r300_get_lock;
+	radeon->vtbl.update_viewport_offset = r300UpdateViewportOffset;
+	radeon->vtbl.emit_cs_header = r300_vtbl_emit_cs_header;
+	radeon->vtbl.swtcl_flush = r300_swtcl_flush;
+	radeon->vtbl.pre_emit_atoms = r300_vtbl_pre_emit_atoms;
+	radeon->vtbl.fallback = r300_fallback;
+	if (radeon->radeonScreen->chip_family == CHIP_FAMILY_RV530) { /* RV530 picks its query-finish emitter by num_z_pipes */
+		if (radeon->radeonScreen->num_z_pipes == 2)
+			radeon->vtbl.emit_query_finish = rv530_emit_query_finish_double_z;
+		else
+			radeon->vtbl.emit_query_finish = rv530_emit_query_finish_single_z;
+	} else /* every other chip family uses the per-raster-pipe emitter */
+		radeon->vtbl.emit_query_finish = r300_emit_query_finish;
+}
+
+static void r300InitConstValues(GLcontext *ctx, radeonScreenPtr screen)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
 
 	ctx->Const.MaxTextureImageUnits =
 	    driQueryOptioni(&r300->radeon.optionCache, "texture_image_units");
 	ctx->Const.MaxTextureCoordUnits =
 	    driQueryOptioni(&r300->radeon.optionCache, "texture_coord_units");
-	ctx->Const.MaxTextureUnits =
-	    MIN2(ctx->Const.MaxTextureImageUnits,
+	ctx->Const.MaxTextureUnits = MIN2(ctx->Const.MaxTextureImageUnits,
 		 ctx->Const.MaxTextureCoordUnits);
+
 	ctx->Const.MaxTextureMaxAnisotropy = 16.0;
 	ctx->Const.MaxTextureLodBias = 16.0;
 
 	if (screen->chip_family >= CHIP_FAMILY_RV515) {
-	    ctx->Const.MaxTextureLevels = 13;
-	    ctx->Const.MaxTextureRectSize = 4096;
+		ctx->Const.MaxTextureLevels = 13;
+		ctx->Const.MaxCubeTextureLevels = 13;
+		ctx->Const.MaxTextureRectSize = 4096;
+	}
+	else {
+		ctx->Const.MaxTextureLevels = 12;
+		ctx->Const.MaxCubeTextureLevels = 12;
+		ctx->Const.MaxTextureRectSize = 2048;
 	}
 
 	ctx->Const.MinPointSize = 1.0;
@@ -298,253 +361,173 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 	ctx->Const.MaxLineWidth = R300_LINESIZE_MAX;
 	ctx->Const.MaxLineWidthAA = R300_LINESIZE_MAX;
 
-#ifdef USER_BUFFERS
-	/* Needs further modifications */
-#if 0
-	ctx->Const.MaxArrayLockSize =
-	    ( /*512 */ RADEON_BUFFER_SIZE * 16 * 1024) / (4 * 4);
-#endif
-#endif
-
-	/* Initialize the software rasterizer and helper modules.
-	 */
-	_swrast_CreateContext(ctx);
-	_vbo_CreateContext(ctx);
-	_tnl_CreateContext(ctx);
-	_swsetup_CreateContext(ctx);
-	_swsetup_Wakeup(ctx);
-	_ae_create_context(ctx);
-
-	/* Install the customized pipeline:
-	 */
-	_tnl_destroy_pipeline(ctx);
-	_tnl_install_pipeline(ctx, r300_pipeline);
-
-	/* Try and keep materials and vertices separate:
-	 */
-/* 	_tnl_isolate_materials(ctx, GL_TRUE); */
-
-	/* Configure swrast and TNL to match hardware characteristics:
-	 */
-	_swrast_allow_pixel_fog(ctx, GL_FALSE);
-	_swrast_allow_vertex_fog(ctx, GL_TRUE);
-	_tnl_allow_pixel_fog(ctx, GL_FALSE);
-	_tnl_allow_vertex_fog(ctx, GL_TRUE);
+	ctx->Const.MaxDrawBuffers = 1;
 
 	/* currently bogus data */
-	if (screen->chip_flags & RADEON_CHIPSET_TCL) {
-	        ctx->Const.VertexProgram.MaxInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
-		ctx->Const.VertexProgram.MaxNativeInstructions =
-		  VSF_MAX_FRAGMENT_LENGTH / 4;
+	if (r300->options.hw_tcl_enabled) {
+		ctx->Const.VertexProgram.MaxNativeInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
+		ctx->Const.VertexProgram.MaxNativeAluInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
 		ctx->Const.VertexProgram.MaxNativeAttribs = 16;	/* r420 */
-		ctx->Const.VertexProgram.MaxTemps = 32;
-		ctx->Const.VertexProgram.MaxNativeTemps =
-		  /*VSF_MAX_FRAGMENT_TEMPS */ 32;
+		ctx->Const.VertexProgram.MaxNativeTemps = 32;
 		ctx->Const.VertexProgram.MaxNativeParameters = 256;	/* r420 */
 		ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
 	}
 
-	ctx->Const.FragmentProgram.MaxNativeTemps = PFS_NUM_TEMP_REGS;
-	ctx->Const.FragmentProgram.MaxNativeAttribs = 11;	/* copy i915... */
-	ctx->Const.FragmentProgram.MaxNativeParameters = PFS_NUM_CONST_REGS;
-	ctx->Const.FragmentProgram.MaxNativeAluInstructions = PFS_MAX_ALU_INST;
-	ctx->Const.FragmentProgram.MaxNativeTexInstructions = PFS_MAX_TEX_INST;
-	ctx->Const.FragmentProgram.MaxNativeInstructions =
-	    PFS_MAX_ALU_INST + PFS_MAX_TEX_INST;
-	ctx->Const.FragmentProgram.MaxNativeTexIndirections =
-	    PFS_MAX_TEX_INDIRECT;
-	ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;	/* and these are?? */
-	ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
-	ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
+	if (screen->chip_family >= CHIP_FAMILY_RV515) {
+		ctx->Const.FragmentProgram.MaxNativeTemps = R500_PFS_NUM_TEMP_REGS;
+		ctx->Const.FragmentProgram.MaxNativeAttribs = 11;	/* copy i915... */
+		ctx->Const.FragmentProgram.MaxNativeParameters = R500_PFS_NUM_CONST_REGS;
+		ctx->Const.FragmentProgram.MaxNativeAluInstructions = R500_PFS_MAX_INST;
+		ctx->Const.FragmentProgram.MaxNativeTexInstructions = R500_PFS_MAX_INST;
+		ctx->Const.FragmentProgram.MaxNativeInstructions = R500_PFS_MAX_INST;
+		ctx->Const.FragmentProgram.MaxNativeTexIndirections = R500_PFS_MAX_INST;
+		ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;
+	} else {
+		ctx->Const.FragmentProgram.MaxNativeTemps = R300_PFS_NUM_TEMP_REGS;
+		ctx->Const.FragmentProgram.MaxNativeAttribs = 11;	/* copy i915... */
+		ctx->Const.FragmentProgram.MaxNativeParameters = R300_PFS_NUM_CONST_REGS;
+		ctx->Const.FragmentProgram.MaxNativeAluInstructions = R300_PFS_MAX_ALU_INST;
+		ctx->Const.FragmentProgram.MaxNativeTexInstructions = R300_PFS_MAX_TEX_INST;
+		ctx->Const.FragmentProgram.MaxNativeInstructions = R300_PFS_MAX_ALU_INST + R300_PFS_MAX_TEX_INST;
+		ctx->Const.FragmentProgram.MaxNativeTexIndirections = R300_PFS_MAX_TEX_INDIRECT;
+		ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;
+	}
 
-	driInitExtensions(ctx, card_extensions, GL_TRUE);
+}
 
-	if (driQueryOptionb
-	    (&r300->radeon.optionCache, "disable_stencil_two_side"))
-		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
+static void r300ParseOptions(r300ContextPtr r300, radeonScreenPtr screen)
+{
+	struct r300_options options = { 0 }; /* filled in locally, copied into r300->options on the last line */
 
-	if (r300->radeon.glCtx->Mesa_DXTn
-	    && !driQueryOptionb(&r300->radeon.optionCache, "disable_s3tc")) {
-		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
-		_mesa_enable_extension(ctx, "GL_S3_s3tc");
-	} else
-	    if (driQueryOptionb(&r300->radeon.optionCache, "force_s3tc_enable"))
-	{
-		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
-	}
+	driParseConfigFiles(&r300->radeon.optionCache, &screen->optionCache,
+			    screen->driScreen->myNum, "r300");
 
-	r300->disable_lowimpact_fallback =
-	    driQueryOptionb(&r300->radeon.optionCache,
-			    "disable_lowimpact_fallback");
+	r300->radeon.initialMaxAnisotropy = driQueryOptionf(&r300->radeon.optionCache, "def_max_anisotropy");
 
-	radeonInitSpanFuncs(ctx);
-	r300InitCmdBuf(r300);
-	r300InitState(r300);
-	if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
-	        r300InitSwtcl(ctx);
+	options.stencil_two_side_disabled = driQueryOptionb(&r300->radeon.optionCache, "disable_stencil_two_side");
+	options.s3tc_force_enabled = driQueryOptionb(&r300->radeon.optionCache, "force_s3tc_enable");
+	options.s3tc_force_disabled = driQueryOptionb(&r300->radeon.optionCache, "disable_s3tc");
 
-	TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline;
+	if (!(screen->chip_flags & RADEON_CHIPSET_TCL) || driQueryOptioni(&r300->radeon.optionCache, "tcl_mode") == DRI_CONF_TCL_SW) /* chip lacks TCL, or the tcl_mode option asks for software TCL */
+		options.hw_tcl_enabled = 0;
+	else
+		options.hw_tcl_enabled = 1;
 
-	tcl_mode = driQueryOptioni(&r300->radeon.optionCache, "tcl_mode");
-	if (driQueryOptionb(&r300->radeon.optionCache, "no_rast")) {
-		fprintf(stderr, "disabling 3D acceleration\n");
-#if R200_MERGED
-		FALLBACK(&r300->radeon, RADEON_FALLBACK_DISABLE, 1);
-#endif
-	}
-	if (tcl_mode == DRI_CONF_TCL_SW ||
-	    !(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
-		if (r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
-			r300->radeon.radeonScreen->chip_flags &=
-			    ~RADEON_CHIPSET_TCL;
-			fprintf(stderr, "Disabling HW TCL support\n");
-		}
-		TCL_FALLBACK(r300->radeon.glCtx,
-			     RADEON_TCL_FALLBACK_TCL_DISABLE, 1);
-	}
+	options.conformance_mode = !driQueryOptionb(&r300->radeon.optionCache, "disable_lowimpact_fallback"); /* inverse of the disable_lowimpact_fallback option */
 
-	return GL_TRUE;
+	r300->options = options;
 }
 
-static void r300FreeGartAllocations(r300ContextPtr r300)
+static void r300InitGLExtensions(GLcontext *ctx)
 {
-	int i, ret, tries = 0, done_age, in_use = 0;
-	drm_radeon_mem_free_t memfree;
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
 
-	memfree.region = RADEON_MEM_REGION_GART;
+	driInitExtensions(ctx, card_extensions, GL_TRUE); /* base extension list, always registered */
+	if (r300->radeon.radeonScreen->kernel_mm) /* the framebuffer blit/object extensions need the kernel memory manager */
+		driInitExtensions(ctx, mm_extensions, GL_FALSE);
 
-#ifdef USER_BUFFERS
-	for (i = r300->rmm->u_last; i > 0; i--) {
-		if (r300->rmm->u_list[i].ptr == NULL) {
-			continue;
-		}
+	if (r300->options.stencil_two_side_disabled)
+		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
 
-		/* check whether this buffer is still in use */
-		if (r300->rmm->u_list[i].pending) {
-			in_use++;
-		}
+	if (r300->options.s3tc_force_enabled) { /* NOTE(review): the code this patch removes enabled S3TC when Mesa_DXTn was set; that check is gone -- confirm intended */
+		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+		_mesa_enable_extension(ctx, "GL_S3_s3tc");
+	} else if (r300->options.s3tc_force_disabled) { /* only reached when force-enable is off, so force-enable wins if both are set */
+		_mesa_disable_extension(ctx, "GL_EXT_texture_compression_s3tc");
 	}
-	/* Cannot flush/lock if no context exists. */
-	if (in_use)
-		r300FlushCmdBuf(r300, __FUNCTION__);
 
-	done_age = radeonGetAge((radeonContextPtr) r300);
+	if (!r300->radeon.radeonScreen->drmSupportsOcclusionQueries) { /* withdraw the extension card_extensions registered above */
+		_mesa_disable_extension(ctx, "GL_ARB_occlusion_query");
+	}
+}
 
-	for (i = r300->rmm->u_last; i > 0; i--) {
-		if (r300->rmm->u_list[i].ptr == NULL) {
-			continue;
-		}
+/* Create the device-specific rendering context.
+ * Returns GL_TRUE on success, GL_FALSE when the allocation or radeonInitContext() fails. */
+GLboolean r300CreateContext(const __GLcontextModes * glVisual,
+			    __DRIcontextPrivate * driContextPriv,
+			    void *sharedContextPrivate)
+{
+	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
+	struct dd_function_table functions;
+	r300ContextPtr r300;
+	GLcontext *ctx;
 
-		/* check whether this buffer is still in use */
-		if (!r300->rmm->u_list[i].pending) {
-			continue;
-		}
+	assert(glVisual);
+	assert(driContextPriv); /* NOTE(review): driContextPriv was already dereferenced in the sPriv initializer above */
+	assert(screen);
 
-		assert(r300->rmm->u_list[i].h_pending == 0);
+	r300 = (r300ContextPtr) CALLOC(sizeof(*r300));
+	if (!r300)
+		return GL_FALSE;
 
-		tries = 0;
-		while (r300->rmm->u_list[i].age > done_age && tries++ < 1000) {
-			usleep(10);
-			done_age = radeonGetAge((radeonContextPtr) r300);
-		}
-		if (tries >= 1000) {
-			WARN_ONCE("Failed to idle region!");
-		}
+	r300ParseOptions(r300, screen); /* fills r300->options and the option cache before anything reads them */
 
-		memfree.region_offset = (char *)r300->rmm->u_list[i].ptr -
-		    (char *)r300->radeon.radeonScreen->gartTextures.map;
+	r300->radeon.radeonScreen = screen; /* r300_init_vtbl() reads radeonScreen, so set it first */
+	r300_init_vtbl(&r300->radeon);
 
-		ret = drmCommandWrite(r300->radeon.radeonScreen->driScreen->fd,
-				      DRM_RADEON_FREE, &memfree,
-				      sizeof(memfree));
-		if (ret) {
-			fprintf(stderr, "Failed to free at %p\nret = %s\n",
-				r300->rmm->u_list[i].ptr, strerror(-ret));
-		} else {
-			if (i == r300->rmm->u_last)
-				r300->rmm->u_last--;
+	_mesa_init_driver_functions(&functions); /* start from Mesa's defaults, then let each driver module override its entries */
+	r300InitIoctlFuncs(&functions);
+	r300InitStateFuncs(&functions);
+	r300InitTextureFuncs(&functions);
+	r300InitShaderFuncs(&functions);
+	radeonInitQueryObjFunctions(&functions);
+	radeonInitBufferObjectFuncs(&functions);
 
-			r300->rmm->u_list[i].pending = 0;
-			r300->rmm->u_list[i].ptr = NULL;
-		}
+	if (!radeonInitContext(&r300->radeon, &functions,
+			       glVisual, driContextPriv,
+			       sharedContextPrivate)) {
+		FREE(r300); /* NOTE(review): the option cache filled by r300ParseOptions() is not released here unless radeonInitContext() does so -- confirm */
+		return GL_FALSE;
 	}
-	r300->rmm->u_head = i;
-#endif				/* USER_BUFFERS */
-}
 
-/* Destroy the device specific context.
- */
-void r300DestroyContext(__DRIcontextPrivate * driContextPriv)
-{
-	GET_CURRENT_CONTEXT(ctx);
-	r300ContextPtr r300 = (r300ContextPtr) driContextPriv->driverPrivate;
-	radeonContextPtr radeon = (radeonContextPtr) r300;
-	radeonContextPtr current = ctx ? RADEON_CONTEXT(ctx) : NULL;
+	ctx = r300->radeon.glCtx;
 
-	if (RADEON_DEBUG & DEBUG_DRI) {
-		fprintf(stderr, "Destroying context !\n");
-	}
+	r300->fallback = 0;
+	if (r300->options.hw_tcl_enabled) /* the TNL program is only maintained when hardware TCL is in use */
+		ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
 
-	/* check if we're deleting the currently bound context */
-	if (&r300->radeon == current) {
-		radeonFlush(r300->radeon.glCtx);
-		_mesa_make_current(NULL, NULL, NULL);
-	}
+	ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
 
-	/* Free r300 context resources */
-	assert(r300);		/* should never be null */
-
-	if (r300) {
-		GLboolean release_texture_heaps;
-
-		release_texture_heaps =
-		    (r300->radeon.glCtx->Shared->RefCount == 1);
-		_swsetup_DestroyContext(r300->radeon.glCtx);
-		_tnl_DestroyContext(r300->radeon.glCtx);
-		_vbo_DestroyContext(r300->radeon.glCtx);
-		_swrast_DestroyContext(r300->radeon.glCtx);
-
-		if (r300->dma.current.buf) {
-			r300ReleaseDmaRegion(r300, &r300->dma.current,
-					     __FUNCTION__);
-#ifndef USER_BUFFERS
-			r300FlushCmdBuf(r300, __FUNCTION__);
-#endif
-		}
-		r300FreeGartAllocations(r300);
-		r300DestroyCmdBuf(r300);
+	r300InitConstValues(ctx, screen);
 
-		if (radeon->state.scissor.pClipRects) {
-			FREE(radeon->state.scissor.pClipRects);
-			radeon->state.scissor.pClipRects = NULL;
-		}
+	_mesa_set_mvp_with_dp4( ctx, GL_TRUE );
 
-		if (release_texture_heaps) {
-			/* This share group is about to go away, free our private
-			 * texture object data.
-			 */
-			int i;
+	/* Initialize the software rasterizer and helper modules.
+	 */
+	_swrast_CreateContext(ctx);
+	_vbo_CreateContext(ctx);
+	_tnl_CreateContext(ctx);
+	_swsetup_CreateContext(ctx);
+	_swsetup_Wakeup(ctx);
 
-			for (i = 0; i < r300->nr_heaps; i++) {
-				driDestroyTextureHeap(r300->texture_heaps[i]);
-				r300->texture_heaps[i] = NULL;
-			}
+	/* Install the customized pipeline:
+	 */
+	_tnl_destroy_pipeline(ctx);
+	_tnl_install_pipeline(ctx, r300_pipeline);
+	TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline;
 
-			assert(is_empty_list(&r300->swapped));
-		}
+	/* Configure swrast and TNL to match hardware characteristics:
+	 */
+	_swrast_allow_pixel_fog(ctx, GL_FALSE);
+	_swrast_allow_vertex_fog(ctx, GL_TRUE);
+	_tnl_allow_pixel_fog(ctx, GL_FALSE);
+	_tnl_allow_vertex_fog(ctx, GL_TRUE);
 
-		radeonCleanupContext(&r300->radeon);
+	if (r300->options.hw_tcl_enabled) { /* pick the init routine that matches the TCL mode chosen in r300ParseOptions() */
+		r300InitDraw(ctx);
+	} else {
+		r300InitSwtcl(ctx);
+	}
 
-#ifdef USER_BUFFERS
-		/* the memory manager might be accessed when Mesa frees the shared
-		 * state, so don't destroy it earlier
-		 */
-		r300_mem_destroy(r300);
-#endif
+	radeon_fbo_init(&r300->radeon);
+	radeonInitSpanFuncs( ctx );
+	r300InitCmdBuf(r300);
+	r300InitState(r300);
+	r300InitShaderFunctions(r300);
 
-		/* free the option cache */
-		driDestroyOptionCache(&r300->radeon.optionCache);
+	r300InitGLExtensions(ctx);
 
-		FREE(r300);
-	}
+	return GL_TRUE;
 }
+
diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
index c15e9fa300..518d5cdbf4 100644
--- a/src/mesa/drivers/dri/r300/r300_context.h
+++ b/src/mesa/drivers/dri/r300/r300_context.h
@@ -37,212 +37,29 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #ifndef __R300_CONTEXT_H__
 #define __R300_CONTEXT_H__
 
-#include "tnl/t_vertex.h"
 #include "drm.h"
 #include "radeon_drm.h"
 #include "dri_util.h"
-#include "texmem.h"
+#include "radeon_common.h"
 
-#include "main/macros.h"
 #include "main/mtypes.h"
-#include "main/colormac.h"
-
-#define USER_BUFFERS
+#include "shader/prog_instruction.h"
+#include "compiler/radeon_code.h"
 
 struct r300_context;
 typedef struct r300_context r300ContextRec;
 typedef struct r300_context *r300ContextPtr;
 
-#include "radeon_lock.h"
-#include "main/mm.h"
-
-/* From http://gcc.gnu.org/onlinedocs/gcc-3.2.3/gcc/Variadic-Macros.html .
-   I suppose we could inline this and use macro to fetch out __LINE__ and stuff in case we run into trouble
-   with other compilers ... GLUE!
-*/
-#define WARN_ONCE(a, ...)	{ \
-	static int warn##__LINE__=1; \
-	if(warn##__LINE__){ \
-		fprintf(stderr, "*********************************WARN_ONCE*********************************\n"); \
-		fprintf(stderr, "File %s function %s line %d\n", \
-			__FILE__, __FUNCTION__, __LINE__); \
-		fprintf(stderr,  a, ## __VA_ARGS__);\
-		fprintf(stderr, "***************************************************************************\n"); \
-		warn##__LINE__=0;\
-		} \
-	}
 
 #include "r300_vertprog.h"
-#include "r500_fragprog.h"
-
-/**
- * This function takes a float and packs it into a uint32_t
- */
-static INLINE uint32_t r300PackFloat32(float fl)
-{
-	union {
-		float fl;
-		uint32_t u;
-	} u;
-
-	u.fl = fl;
-	return u.u;
-}
-
-/* This is probably wrong for some values, I need to test this
- * some more.  Range checking would be a good idea also..
- *
- * But it works for most things.  I'll fix it later if someone
- * else with a better clue doesn't
- */
-static INLINE uint32_t r300PackFloat24(float f)
-{
-	float mantissa;
-	int exponent;
-	uint32_t float24 = 0;
-
-	if (f == 0.0)
-		return 0;
-
-	mantissa = frexpf(f, &exponent);
-
-	/* Handle -ve */
-	if (mantissa < 0) {
-		float24 |= (1 << 23);
-		mantissa = mantissa * -1.0;
-	}
-	/* Handle exponent, bias of 63 */
-	exponent += 62;
-	float24 |= (exponent << 16);
-	/* Kill 7 LSB of mantissa */
-	float24 |= (r300PackFloat32(mantissa) & 0x7FFFFF) >> 7;
-
-	return float24;
-}
-
-/************ DMA BUFFERS **************/
-
-/* Need refcounting on dma buffers:
- */
-struct r300_dma_buffer {
-	int refcount;		/**< the number of retained regions in buf */
-	drmBufPtr buf;
-	int id;
-};
-#undef GET_START
-#ifdef USER_BUFFERS
-#define GET_START(rvb) (r300GartOffsetFromVirtual(rmesa, (rvb)->address+(rvb)->start))
-#else
-#define GET_START(rvb) (rmesa->radeon.radeonScreen->gart_buffer_offset +		\
-			(rvb)->address - rmesa->dma.buf0_address +	\
-			(rvb)->start)
-#endif
-/* A retained region, eg vertices for indexed vertices.
- */
-struct r300_dma_region {
-	struct r300_dma_buffer *buf;
-	char *address;		/* == buf->address */
-	int start, end, ptr;	/* offsets from start of buf */
-
-	int aos_offset;		/* address in GART memory */
-	int aos_stride;		/* distance between elements, in dwords */
-	int aos_size;		/* number of components (1-4) */
-};
-
-struct r300_dma {
-	/* Active dma region.  Allocations for vertices and retained
-	 * regions come from here.  Also used for emitting random vertices,
-	 * these may be flushed by calling flush_current();
-	 */
-	struct r300_dma_region current;
-
-	void (*flush) (r300ContextPtr);
-
-	char *buf0_address;	/* start of buf[0], for index calcs */
-
-	/* Number of "in-flight" DMA buffers, i.e. the number of buffers
-	 * for which a DISCARD command is currently queued in the command buffer.
-	 */
-	GLuint nr_released_bufs;
-};
-
-       /* Texture related */
 
-typedef struct r300_tex_obj r300TexObj, *r300TexObjPtr;
-
-/* Texture object in locally shared texture space.
- */
-struct r300_tex_obj {
-	driTextureObject base;
-
-	GLuint bufAddr;		/* Offset to start of locally
-				   shared texture block */
-
-	drm_radeon_tex_image_t image[6][RADEON_MAX_TEXTURE_LEVELS];
-	/* Six, for the cube faces */
-
-	GLboolean image_override;	/* Image overridden by GLX_EXT_tfp */
-
-	GLuint pitch;		/* this isn't sent to hardware just used in calculations */
-	/* hardware register values */
-	/* Note that R200 has 8 registers per texture and R300 only 7 */
-	GLuint filter;
-	GLuint filter_1;
-	GLuint pitch_reg;
-	GLuint size;		/* npot only */
-	GLuint format;
-	GLuint offset;		/* Image location in the card's address space.
-				   All cube faces follow. */
-	GLuint unknown4;
-	GLuint unknown5;
-	/* end hardware registers */
-
-	/* registers computed by r200 code - keep them here to
-	   compare against what is actually written.
-
-	   to be removed later.. */
-	GLuint pp_border_color;
-	GLuint pp_cubic_faces;	/* cube face 1,2,3,4 log2 sizes */
-	GLuint format_x;
-
-	GLboolean border_fallback;
-
-	GLuint tile_bits;	/* hw texture tile bits used on this texture */
-};
-
-struct r300_texture_env_state {
-	r300TexObjPtr texobj;
-	GLenum format;
-	GLenum envMode;
-};
 
 /* The blit width for texture uploads
  */
 #define R300_BLIT_WIDTH_BYTES 1024
 #define R300_MAX_TEXTURE_UNITS 8
 
-struct r300_texture_state {
-	struct r300_texture_env_state unit[R300_MAX_TEXTURE_UNITS];
-	int tc_count;		/* number of incoming texture coordinates from VAP */
-};
 
-/**
- * A block of hardware state.
- *
- * When check returns non-zero, the returned number of dwords must be
- * copied verbatim into the command buffer in order to update a state atom
- * when it is dirty.
- */
-struct r300_state_atom {
-	struct r300_state_atom *next, *prev;
-	const char *name;	/* for debug */
-	int cmd_size;		/* maximum size in dwords */
-	GLuint idx;		/* index in an array (e.g. textures) */
-	uint32_t *cmd;
-	GLboolean dirty;
-
-	int (*check) (r300ContextPtr, struct r300_state_atom * atom);
-};
 
 #define R300_VPT_CMD_0		0
 #define R300_VPT_XSCALE		1
@@ -284,9 +101,11 @@ struct r300_state_atom {
 #define R300_GB_MISC_MSPOS_0		1
 #define R300_GB_MISC_MSPOS_1		2
 #define R300_GB_MISC_TILE_CONFIG	3
-#define R300_GB_MISC_SELECT		4
-#define R300_GB_MISC_AA_CONFIG		5
-#define R300_GB_MISC_CMDSIZE		6
+#define R300_GB_MISC_CMDSIZE		4
+#define R300_GB_MISC2_CMD_0		    0
+#define R300_GB_MISC2_SELECT		1
+#define R300_GB_MISC2_AA_CONFIG		2
+#define R300_GB_MISC2_CMDSIZE		3
 
 #define R300_TXE_CMD_0		0
 #define R300_TXE_ENABLE		1
@@ -415,6 +234,10 @@ struct r300_state_atom {
 #define R300_ZS_CNTL_2		3
 #define R300_ZS_CMDSIZE		4
 
+#define R300_ZSB_CMD_0		0
+#define R300_ZSB_CNTL_0		1
+#define R300_ZSB_CMDSIZE	2
+
 #define R300_ZB_CMD_0		0
 #define R300_ZB_OFFSET		1
 #define R300_ZB_PITCH		2
@@ -455,423 +278,181 @@ struct r300_state_atom {
 #define R300_TEX_CMDSIZE	(MAX_TEXTURE_UNITS+1)
 */
 
+#define R300_QUERYOBJ_CMD_0  0
+#define R300_QUERYOBJ_DATA_0 1
+#define R300_QUERYOBJ_CMD_1  2
+#define R300_QUERYOBJ_DATA_1  3
+#define R300_QUERYOBJ_CMDSIZE  4
+
 /**
  * Cache for hardware register state.
  */
 struct r300_hw_state {
-	struct r300_state_atom atomlist;
-
-	GLboolean is_dirty;
-	GLboolean all_dirty;
-	int max_state_size;	/* in dwords */
-
-	struct r300_state_atom vpt;	/* viewport (1D98) */
-	struct r300_state_atom vap_cntl;
-        struct r300_state_atom vap_index_offset; /* 0x208c r5xx only */
-	struct r300_state_atom vof;	/* VAP output format register 0x2090 */
-	struct r300_state_atom vte;	/* (20B0) */
-	struct r300_state_atom vap_vf_max_vtx_indx;	/* Maximum Vertex Indx Clamp (2134) */
-	struct r300_state_atom vap_cntl_status;
-	struct r300_state_atom vir[2];	/* vap input route (2150/21E0) */
-	struct r300_state_atom vic;	/* vap input control (2180) */
-	struct r300_state_atom vap_psc_sgn_norm_cntl; /* Programmable Stream Control Signed Normalize Control (21DC) */
-	struct r300_state_atom vap_clip_cntl;
-	struct r300_state_atom vap_clip;
-	struct r300_state_atom vap_pvs_vtx_timeout_reg;	/* Vertex timeout register (2288) */
-	struct r300_state_atom pvs;	/* pvs_cntl (22D0) */
-	struct r300_state_atom gb_enable;	/* (4008) */
-	struct r300_state_atom gb_misc;	/* Multisampling position shifts ? (4010) */
-	struct r300_state_atom ga_point_s0;	/* S Texture Coordinate of Vertex 0 for Point texture stuffing (LLC) (4200) */
-	struct r300_state_atom ga_triangle_stipple;	/* (4214) */
-	struct r300_state_atom ps;	/* pointsize (421C) */
-	struct r300_state_atom ga_point_minmax;	/* (4230) */
-	struct r300_state_atom lcntl;	/* line control */
-	struct r300_state_atom ga_line_stipple;	/* (4260) */
-	struct r300_state_atom shade;
-	struct r300_state_atom polygon_mode;
-	struct r300_state_atom fogp;	/* fog parameters (4294) */
-	struct r300_state_atom ga_soft_reset;	/* (429C) */
-	struct r300_state_atom zbias_cntl;
-	struct r300_state_atom zbs;	/* zbias (42A4) */
-	struct r300_state_atom occlusion_cntl;
-	struct r300_state_atom cul;	/* cull cntl (42B8) */
-	struct r300_state_atom su_depth_scale;	/* (42C0) */
-	struct r300_state_atom rc;	/* rs control (4300) */
-	struct r300_state_atom ri;	/* rs interpolators (4310) */
-	struct r300_state_atom rr;	/* rs route (4330) */
-	struct r300_state_atom sc_hyperz;	/* (43A4) */
-	struct r300_state_atom sc_screendoor;	/* (43E8) */
-	struct r300_state_atom fp;	/* fragment program cntl + nodes (4600) */
-	struct r300_state_atom fpt;	/* texi - (4620) */
-	struct r300_state_atom us_out_fmt;	/* (46A4) */
-	struct r300_state_atom r500fp;	/* r500 fp instructions */
-	struct r300_state_atom r500fp_const;	/* r500 fp constants */
-	struct r300_state_atom fpi[4];	/* fp instructions (46C0/47C0/48C0/49C0) */
-	struct r300_state_atom fogs;	/* fog state (4BC0) */
-	struct r300_state_atom fogc;	/* fog color (4BC8) */
-	struct r300_state_atom at;	/* alpha test (4BD4) */
-	struct r300_state_atom fg_depth_src;	/* (4BD8) */
-	struct r300_state_atom fpp;	/* 0x4C00 and following */
-	struct r300_state_atom rb3d_cctl;	/* (4E00) */
-	struct r300_state_atom bld;	/* blending (4E04) */
-	struct r300_state_atom cmk;	/* colormask (4E0C) */
-	struct r300_state_atom blend_color;	/* constant blend color */
-	struct r300_state_atom rop;	/* ropcntl */
-	struct r300_state_atom cb;	/* colorbuffer (4E28) */
-	struct r300_state_atom rb3d_dither_ctl;	/* (4E50) */
-	struct r300_state_atom rb3d_aaresolve_ctl;	/* (4E88) */
-	struct r300_state_atom rb3d_discard_src_pixel_lte_threshold;	/* (4E88) I saw it only written on RV350 hardware..  */
-	struct r300_state_atom zs;	/* zstencil control (4F00) */
-	struct r300_state_atom zstencil_format;
-	struct r300_state_atom zb;	/* z buffer (4F20) */
-	struct r300_state_atom zb_depthclearvalue;	/* (4F28) */
-	struct r300_state_atom unk4F30;	/* (4F30) */
-	struct r300_state_atom zb_hiz_offset;	/* (4F44) */
-	struct r300_state_atom zb_hiz_pitch;	/* (4F54) */
-
-	struct r300_state_atom vpi;	/* vp instructions */
-	struct r300_state_atom vpp;	/* vp parameters */
-	struct r300_state_atom vps;	/* vertex point size (?) */
-	struct r300_state_atom vpucp[6];	/* vp user clip plane - 6 */
+	struct radeon_state_atom vpt;	/* viewport (1D98) */
+	struct radeon_state_atom vap_cntl;
+	struct radeon_state_atom vap_index_offset; /* 0x208c r5xx only */
+	struct radeon_state_atom vof;	/* VAP output format register 0x2090 */
+	struct radeon_state_atom vte;	/* (20B0) */
+	struct radeon_state_atom vap_vf_max_vtx_indx;	/* Maximum Vertex Indx Clamp (2134) */
+	struct radeon_state_atom vap_cntl_status;
+	struct radeon_state_atom vir[2];	/* vap input route (2150/21E0) */
+	struct radeon_state_atom vic;	/* vap input control (2180) */
+	struct radeon_state_atom vap_psc_sgn_norm_cntl; /* Programmable Stream Control Signed Normalize Control (21DC) */
+	struct radeon_state_atom vap_clip_cntl;
+	struct radeon_state_atom vap_clip;
+	struct radeon_state_atom vap_pvs_vtx_timeout_reg;	/* Vertex timeout register (2288) */
+	struct radeon_state_atom pvs;	/* pvs_cntl (22D0) */
+	struct radeon_state_atom gb_enable;	/* (4008) */
+	struct radeon_state_atom gb_misc;	/* Multisampling position shifts ? (4010) */
+	struct radeon_state_atom gb_misc2;	/* GB select + AA config (R300_GB_MISC2_*), split out of gb_misc */
+	struct radeon_state_atom ga_point_s0;	/* S Texture Coordinate of Vertex 0 for Point texture stuffing (LLC) (4200) */
+	struct radeon_state_atom ga_triangle_stipple;	/* (4214) */
+	struct radeon_state_atom ps;	/* pointsize (421C) */
+	struct radeon_state_atom ga_point_minmax;	/* (4230) */
+	struct radeon_state_atom lcntl;	/* line control */
+	struct radeon_state_atom ga_line_stipple;	/* (4260) */
+	struct radeon_state_atom shade;
+	struct radeon_state_atom shade2;
+	struct radeon_state_atom polygon_mode;
+	struct radeon_state_atom fogp;	/* fog parameters (4294) */
+	struct radeon_state_atom ga_soft_reset;	/* (429C) */
+	struct radeon_state_atom zbias_cntl;
+	struct radeon_state_atom zbs;	/* zbias (42A4) */
+	struct radeon_state_atom occlusion_cntl;
+	struct radeon_state_atom cul;	/* cull cntl (42B8) */
+	struct radeon_state_atom su_depth_scale;	/* (42C0) */
+	struct radeon_state_atom rc;	/* rs control (4300) */
+	struct radeon_state_atom ri;	/* rs interpolators (4310) */
+	struct radeon_state_atom rr;	/* rs route (4330) */
+	struct radeon_state_atom sc_hyperz;	/* (43A4) */
+	struct radeon_state_atom sc_screendoor;	/* (43E8) */
+	struct radeon_state_atom fp;	/* fragment program cntl + nodes (4600) */
+	struct radeon_state_atom fpt;	/* texi - (4620) */
+	struct radeon_state_atom us_out_fmt;	/* (46A4) */
+	struct radeon_state_atom r500fp;	/* r500 fp instructions */
+	struct radeon_state_atom r500fp_const;	/* r500 fp constants */
+	struct radeon_state_atom fpi[4];	/* fp instructions (46C0/47C0/48C0/49C0) */
+	struct radeon_state_atom fogs;	/* fog state (4BC0) */
+	struct radeon_state_atom fogc;	/* fog color (4BC8) */
+	struct radeon_state_atom at;	/* alpha test (4BD4) */
+	struct radeon_state_atom fg_depth_src;	/* (4BD8) */
+	struct radeon_state_atom fpp;	/* 0x4C00 and following */
+	struct radeon_state_atom rb3d_cctl;	/* (4E00) */
+	struct radeon_state_atom bld;	/* blending (4E04) */
+	struct radeon_state_atom cmk;	/* colormask (4E0C) */
+	struct radeon_state_atom blend_color;	/* constant blend color */
+	struct radeon_state_atom rop;	/* ropcntl */
+	struct radeon_state_atom cb;	/* colorbuffer (4E28) */
+	struct radeon_state_atom rb3d_dither_ctl;	/* (4E50) */
+	struct radeon_state_atom rb3d_aaresolve_ctl;	/* (4E88) */
+	struct radeon_state_atom rb3d_discard_src_pixel_lte_threshold;	/* (4EA0) I saw it only written on RV350 hardware..  */
+	struct radeon_state_atom zs;	/* zstencil control (4F00) */
+	struct radeon_state_atom zsb;	/* zstencil bf */
+	struct radeon_state_atom zstencil_format;
+	struct radeon_state_atom zb;	/* z buffer (4F20) */
+	struct radeon_state_atom zb_depthclearvalue;	/* (4F28) */
+	struct radeon_state_atom zb_zmask;	/* (4F30) */
+	struct radeon_state_atom zb_hiz_offset;	/* (4F44) */
+	struct radeon_state_atom zb_hiz_pitch;	/* (4F54) */
+
+	struct radeon_state_atom vap_flush;
+	struct radeon_state_atom vpi;	/* vp instructions */
+	struct radeon_state_atom vpp;	/* vp parameters */
+	struct radeon_state_atom vps;	/* vertex point size (?) */
+	struct radeon_state_atom vpucp[6];	/* vp user clip plane - 6 */
 	/* 8 texture units */
 	/* the state is grouped by function and not by
 	   texture unit. This makes single unit updates
 	   really awkward - we are much better off
 	   updating the whole thing at once */
 	struct {
-		struct r300_state_atom filter;
-		struct r300_state_atom filter_1;
-		struct r300_state_atom size;
-		struct r300_state_atom format;
-		struct r300_state_atom pitch;
-		struct r300_state_atom offset;
-		struct r300_state_atom chroma_key;
-		struct r300_state_atom border_color;
+		struct radeon_state_atom filter;
+		struct radeon_state_atom filter_1;
+		struct radeon_state_atom size;
+		struct radeon_state_atom format;
+		struct radeon_state_atom pitch;
+		struct radeon_state_atom offset;
+		struct radeon_state_atom chroma_key;
+		struct radeon_state_atom border_color;
 	} tex;
-	struct r300_state_atom txe;	/* tex enable (4104) */
-};
-
-/**
- * This structure holds the command buffer while it is being constructed.
- *
- * The first batch of commands in the buffer is always the state that needs
- * to be re-emitted when the context is lost. This batch can be skipped
- * otherwise.
- */
-struct r300_cmdbuf {
-	int size;		/* DWORDs allocated for buffer */
-	uint32_t *cmd_buf;
-	int count_used;		/* DWORDs filled so far */
-	int count_reemit;	/* size of re-emission batch */
+	struct radeon_state_atom txe;	/* tex enable (4104) */
+	radeonTexObj *textures[R300_MAX_TEXTURE_UNITS];
 };
 
 /**
  * State cache
  */
 
-struct r300_depthbuffer_state {
-	GLfloat scale;
-};
-
-struct r300_stencilbuffer_state {
-	GLboolean hw_stencil;
-};
-
 /* Vertex shader state */
 
-/* Perhaps more if we store programs in vmem? */
-/* drm_r300_cmd_header_t->vpu->count is unsigned char */
-#define VSF_MAX_FRAGMENT_LENGTH (255*4)
-
-/* Can be tested with colormat currently. */
-#define VSF_MAX_FRAGMENT_TEMPS (14)
-
-#define STATE_R300_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0)
-#define STATE_R300_TEXRECT_FACTOR (STATE_INTERNAL_DRIVER+1)
-
-struct r300_vertex_shader_fragment {
-	int length;
-	union {
-		GLuint d[VSF_MAX_FRAGMENT_LENGTH];
-		float f[VSF_MAX_FRAGMENT_LENGTH];
-		GLuint i[VSF_MAX_FRAGMENT_LENGTH];
-	} body;
-};
-
-struct r300_vertex_shader_state {
-	struct r300_vertex_shader_fragment program;
-};
-
-extern int hw_tcl_on;
-
 #define COLOR_IS_RGBA
 #define TAG(x) r300##x
 #include "tnl_dd/t_dd_vertex.h"
 #undef TAG
 
-//#define CURRENT_VERTEX_SHADER(ctx) (ctx->VertexProgram._Current)
-#define CURRENT_VERTEX_SHADER(ctx) (R300_CONTEXT(ctx)->selected_vp)
-
-/* Should but doesnt work */
-//#define CURRENT_VERTEX_SHADER(ctx) (R300_CONTEXT(ctx)->curr_vp)
-
-/* r300_vertex_shader_state and r300_vertex_program should probably be merged together someday.
- * Keeping them them seperate for now should ensure fixed pipeline keeps functioning properly.
- */
-
 struct r300_vertex_program_key {
-	GLuint InputsRead;
-	GLuint OutputsWritten;
-	GLuint OutputsAdded;
+	GLbitfield FpReads;
+	GLuint FogAttr;
+	GLuint WPosAttr;
 };
 
 struct r300_vertex_program {
+	struct gl_vertex_program *Base;
 	struct r300_vertex_program *next;
+
 	struct r300_vertex_program_key key;
-	int translated;
-
-	struct r300_vertex_shader_fragment program;
-
-	int pos_end;
-	int num_temporaries;	/* Number of temp vars used by program */
-	int wpos_idx;
-	int inputs[VERT_ATTRIB_MAX];
-	int outputs[VERT_RESULT_MAX];
-	int native;
-	int ref_count;
-	int use_ref_count;
+	struct r300_vertex_program_code code;
+
+	GLboolean error;
 };
 
 struct r300_vertex_program_cont {
-	struct gl_vertex_program mesa_program;	/* Must be first */
-	struct r300_vertex_shader_fragment params;
+	/* This is the unmodified vertex program mesa provided us with.
+	 * We need to keep it unchanged because we may need to create another
+	 * hw specific vertex program based on this.
+	 */
+	struct gl_vertex_program mesa_program;
+	/* This is the list of hw specific vertex programs derived from mesa_program */
 	struct r300_vertex_program *progs;
 };
 
-#define PFS_MAX_ALU_INST	64
-#define PFS_MAX_TEX_INST	64
-#define PFS_MAX_TEX_INDIRECT 4
-#define PFS_NUM_TEMP_REGS	32
-#define PFS_NUM_CONST_REGS	16
-
-struct r300_pfs_compile_state;
-
-
-/**
- * Stores state that influences the compilation of a fragment program.
- */
-struct r300_fragment_program_external_state {
-	struct {
-		/**
-		 * If the sampler is used as a shadow sampler,
-		 * this field is:
-		 *  0 - GL_LUMINANCE
-		 *  1 - GL_INTENSITY
-		 *  2 - GL_ALPHA
-		 * depending on the depth texture mode.
-		 */
-		GLuint depth_texture_mode : 2;
-
-		/**
-		 * If the sampler is used as a shadow sampler,
-		 * this field is (texture_compare_func - GL_NEVER).
-		 * [e.g. if compare function is GL_LEQUAL, this field is 3]
-		 *
-		 * Otherwise, this field is 0.
-		 */
-		GLuint texture_compare_func : 3;
-	} unit[16];
-};
-
-
-struct r300_fragment_program_node {
-	int tex_offset; /**< first tex instruction */
-	int tex_end; /**< last tex instruction, relative to tex_offset */
-	int alu_offset; /**< first ALU instruction */
-	int alu_end; /**< last ALU instruction, relative to alu_offset */
-	int flags;
-};
-
-/**
- * Stores an R300 fragment program in its compiled-to-hardware form.
- */
-struct r300_fragment_program_code {
-	struct {
-		int length; /**< total # of texture instructions used */
-		GLuint inst[PFS_MAX_TEX_INST];
-	} tex;
-
-	struct {
-		int length; /**< total # of ALU instructions used */
-		struct {
-			GLuint inst0;
-			GLuint inst1;
-			GLuint inst2;
-			GLuint inst3;
-		} inst[PFS_MAX_ALU_INST];
-	} alu;
-
-	struct r300_fragment_program_node node[4];
-	int cur_node;
-	int first_node_has_tex;
-
-	/**
-	 * Remember which program register a given hardware constant
-	 * belongs to.
-	 */
-	struct prog_src_register constant[PFS_NUM_CONST_REGS];
-	int const_nr;
-
-	int max_temp_idx;
-};
 
 /**
- * Store everything about a fragment program that is needed
- * to render with that program.
- */
+* Store everything about a fragment program that is needed
+* to render with that program.
+*/
 struct r300_fragment_program {
-	struct gl_fragment_program mesa_program;
-
-	GLboolean translated;
 	GLboolean error;
-
+	struct r300_fragment_program *next;
 	struct r300_fragment_program_external_state state;
-	struct r300_fragment_program_code code;
 
-	GLboolean WritesDepth;
-	GLuint optimization;
-};
-
-struct r500_pfs_compile_state;
+	struct rX00_fragment_program_code code;
+	GLbitfield InputsRead;
 
-struct r500_fragment_program_external_state {
-	struct {
-		/**
-		 * If the sampler is used as a shadow sampler,
-		 * this field is:
-		 *  0 - GL_LUMINANCE
-		 *  1 - GL_INTENSITY
-		 *  2 - GL_ALPHA
-		 * depending on the depth texture mode.
-		 */
-		GLuint depth_texture_mode : 2;
-
-		/**
-		 * If the sampler is used as a shadow sampler,
-		 * this field is (texture_compare_func - GL_NEVER).
-		 * [e.g. if compare function is GL_LEQUAL, this field is 3]
-		 *
-		 * Otherwise, this field is 0.
-		 */
-		GLuint texture_compare_func : 3;
-	} unit[16];
+	/* attribute that we are sending the WPOS in */
+	gl_frag_attrib wpos_attr;
+	/* attribute that we are sending the fog coordinate in */
+	gl_frag_attrib fog_attr;
 };
 
-struct r500_fragment_program_code {
-	struct {
-		GLuint inst0;
-		GLuint inst1;
-		GLuint inst2;
-		GLuint inst3;
-		GLuint inst4;
-		GLuint inst5;
-	} inst[512];
-
-	int inst_offset;
-	int inst_end;
-
-	/**
-	 * Remember which program register a given hardware constant
-	 * belongs to.
+struct r300_fragment_program_cont {
+	/* This is the unmodified fragment program mesa provided us with.
+	 * We need to keep it unchanged because we may need to create another
+	 * hw specific fragment program based on this.
 	 */
-	struct prog_src_register constant[PFS_NUM_CONST_REGS];
-	int const_nr;
-
-	int max_temp_idx;
+	struct gl_fragment_program Base;
+	/* This is the list of hw specific fragment programs derived from Base */
+	struct r300_fragment_program *progs;
 };
 
-struct r500_fragment_program {
-	struct gl_fragment_program mesa_program;
-
-	GLcontext *ctx;
-	GLboolean translated;
-	GLboolean error;
-
-	struct r500_fragment_program_external_state state;
-	struct r500_fragment_program_code code;
-
-	GLboolean writes_depth;
-
-	GLuint optimization;
-};
 
 #define R300_MAX_AOS_ARRAYS		16
 
-#define REG_COORDS	0
-#define REG_COLOR0	1
-#define REG_TEX0	2
-
-struct r300_state {
-	struct r300_depthbuffer_state depth;
-	struct r300_texture_state texture;
-	int sw_tcl_inputs[VERT_ATTRIB_MAX];
-	struct r300_vertex_shader_state vertex_shader;
-	struct r300_dma_region aos[R300_MAX_AOS_ARRAYS];
-	int aos_count;
-
-	GLuint *Elts;
-	struct r300_dma_region elt_dma;
-
-	struct r300_dma_region swtcl_dma;
-	DECLARE_RENDERINPUTS(render_inputs_bitset);	/* actual render inputs that R300 was configured for.
-							   They are the same as tnl->render_inputs for fixed pipeline */
-
-	struct r300_stencilbuffer_state stencil;
-
-};
-
-#define R300_FALLBACK_NONE 0
-#define R300_FALLBACK_TCL 1
-#define R300_FALLBACK_RAST 2
 
 /* r300_swtcl.c
  */
 struct r300_swtcl_info {
-   GLuint RenderIndex;
-
-   /**
-    * Size of a hardware vertex.  This is calculated when \c ::vertex_attrs is
-    * installed in the Mesa state vector.
-    */
-   GLuint vertex_size;
-
-   /**
-    * Attributes instructing the Mesa TCL pipeline where / how to put vertex
-    * data in the hardware buffer.
-    */
-   struct tnl_attr_map vertex_attrs[VERT_ATTRIB_MAX];
-
-   /**
-    * Number of elements of \c ::vertex_attrs that are actually used.
-    */
-   GLuint vertex_attr_count;
-
-   /**
-    * Cached pointer to the buffer where Mesa will store vertex data.
-    */
-   GLubyte *verts;
-
-   /* Fallback rasterization functions
-    */
-  //   r200_point_func draw_point;
-  //   r200_line_func draw_line;
-  //   r200_tri_func draw_tri;
-
-   GLuint hw_primitive;
-   GLenum render_primitive;
-   GLuint numverts;
-
-   /**
+  /*
     * Offset of the 4UB color data within a hardware (swtcl) vertex.
     */
    GLuint coloroffset;
@@ -880,13 +461,43 @@ struct r300_swtcl_info {
     * Offset of the 3UB specular color data within a hardware (swtcl) vertex.
     */
    GLuint specoffset;
+};
 
-   /**
-    * Should Mesa project vertex data or will the hardware do it?
-    */
-   GLboolean needproj;
+struct r300_vtable {	/* table of setup hooks; r300_context embeds one as `vtbl` */
+	void (* SetupRSUnit)(GLcontext *ctx);
+	void (* SetupFragmentShaderTextures)(GLcontext *ctx, int *tmu_mappings);
+	void (* SetupPixelShader)(GLcontext *ctx);
+};
 
-   struct r300_dma_region indexed_verts;
+struct r300_vertex_buffer {	/* vertex attribute table; r300_context embeds one as `vbuf` */
+	struct vertex_attribute {
+		/* generic */
+		GLubyte element;
+		GLuint stride;
+		GLuint dwords;	/* presumably the attribute size in dwords - TODO confirm */
+		GLubyte size; /* number of components */
+		GLboolean is_named_bo;
+		struct radeon_bo *bo;
+		GLint bo_offset;	/* presumably an offset into bo - TODO confirm units */
+
+		/* hw specific */
+		uint32_t data_type:4;	/* the six bit-fields here total 27 bits */
+		uint32_t dst_loc:5;
+		uint32_t _signed:1;
+		uint32_t normalize:1;
+		uint32_t swizzle:12;
+		uint32_t write_mask:4;
+	} attribs[VERT_ATTRIB_MAX];
+
+	GLubyte num_attribs;	/* presumably the count of used attribs[] entries - TODO confirm */
+};
+
+struct r300_index_buffer {	/* index buffer description; r300_context embeds one as `ind_buf` */
+	struct radeon_bo *bo;
+	int bo_offset;	/* presumably an offset into bo - TODO confirm units */
+
+	GLboolean is_32bit;	/* presumably selects 32-bit indices - TODO confirm */
+	GLuint count;
 };
 
 
@@ -896,46 +507,33 @@ struct r300_swtcl_info {
 struct r300_context {
 	struct radeon_context radeon;	/* parent class, must be first */
 
+	struct r300_vtable vtbl;
+
 	struct r300_hw_state hw;
-	struct r300_cmdbuf cmdbuf;
-	struct r300_state state;
-	struct gl_vertex_program *curr_vp;
+
 	struct r300_vertex_program *selected_vp;
+	struct r300_fragment_program *selected_fp;
 
 	/* Vertex buffers
 	 */
-	struct r300_dma dma;
-	GLboolean save_on_next_unlock;
-	GLuint NewGLState;
-
-	/* Texture object bookkeeping
-	 */
-	unsigned nr_heaps;
-	driTexHeap *texture_heaps[RADEON_NR_TEX_HEAPS];
-	driTextureObject swapped;
-	int texture_depth;
-	float initialMaxAnisotropy;
-
-	/* Clientdata textures;
-	 */
-	GLuint prefer_gart_client_texturing;
-
-#ifdef USER_BUFFERS
-	struct r300_memory_manager *rmm;
-#endif
-
 	GLvector4f dummy_attrib[_TNL_ATTRIB_MAX];
 	GLvector4f *temp_attrib[_TNL_ATTRIB_MAX];
 
-	GLboolean disable_lowimpact_fallback;
+	struct r300_options {
+		uint32_t conformance_mode:1;
+		uint32_t hw_tcl_enabled:1;
+		uint32_t s3tc_force_enabled:1;
+		uint32_t s3tc_force_disabled:1;
+		uint32_t stencil_two_side_disabled:1;
+	} options;
 
-	DECLARE_RENDERINPUTS(tnl_index_bitset);	/* index of bits for last tnl_install_attrs */
 	struct r300_swtcl_info swtcl;
-};
+	struct r300_vertex_buffer vbuf;
+	struct r300_index_buffer ind_buf;
 
-struct r300_buffer_object {
-	struct gl_buffer_object mesa_obj;
-	int id;
+	uint32_t fallback;
+
+	DECLARE_RENDERINPUTS(render_inputs_bitset);
 };
 
 #define R300_CONTEXT(ctx)		((r300ContextPtr)(ctx->DriverCtx))
@@ -945,15 +543,13 @@ extern GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 				   __DRIcontextPrivate * driContextPriv,
 				   void *sharedContextPrivate);
 
-extern void r300SelectVertexShader(r300ContextPtr r300);
 extern void r300InitShaderFuncs(struct dd_function_table *functions);
-extern int r300VertexProgUpdateParams(GLcontext * ctx,
-				      struct r300_vertex_program_cont *vp,
-				      float *dst);
-
-#define RADEON_D_CAPTURE 0
-#define RADEON_D_PLAYBACK 1
-#define RADEON_D_PLAYBACK_RAW 2
-#define RADEON_D_T 3
+
+extern void r300InitShaderFunctions(r300ContextPtr r300);
+
+extern void r300InitDraw(GLcontext *ctx);
+
+#define r300PackFloat32 radeonPackFloat32
+#define r300PackFloat24 radeonPackFloat24
 
 #endif				/* __R300_CONTEXT_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_draw.c b/src/mesa/drivers/dri/r300/r300_draw.c
new file mode 100644
index 0000000000..e9968f9ffe
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_draw.c
@@ -0,0 +1,718 @@
+/**************************************************************************
+ *
+ * Copyright 2009 Maciej Cencora
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHOR(S) AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <stdlib.h>
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "main/state.h"
+#include "main/api_validate.h"
+#include "main/enums.h"
+#include "main/simple_list.h"
+
+#include "r300_reg.h"
+#include "r300_context.h"
+#include "r300_emit.h"
+#include "r300_render.h"
+#include "r300_state.h"
+#include "r300_tex.h"
+#include "r300_cmdbuf.h"
+
+#include "radeon_buffer_objects.h"
+#include "radeon_common_context.h"
+
+#include "tnl/tnl.h"
+#include "tnl/t_vp_build.h"
+#include "vbo/vbo_context.h"
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+
+
+/**
+ * Return the size in bytes of a single component of the given GL data type.
+ *
+ * Only the types listed in the switch are supported; any other value trips
+ * the assert and yields 0.
+ */
+static int getTypeSize(GLenum type)
+{
+	switch (type) {
+	case GL_DOUBLE:         return sizeof(GLdouble);
+	case GL_FLOAT:          return sizeof(GLfloat);
+	case GL_INT:            return sizeof(GLint);
+	case GL_UNSIGNED_INT:   return sizeof(GLuint);
+	case GL_SHORT:          return sizeof(GLshort);
+	case GL_UNSIGNED_SHORT: return sizeof(GLushort);
+	case GL_BYTE:           return sizeof(GLbyte);
+	case GL_UNSIGNED_BYTE:  return sizeof(GLubyte);
+	default:
+		break;
+	}
+
+	assert(0);
+	return 0;
+}
+
+/**
+ * Repack indices into a new DMA region as pairs of 16-bit values per 32-bit
+ * word (first index in the low half) and describe the result in r300->ind_buf.
+ */
+static void r300FixupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer *mesa_ind_buf)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	GLvoid *src_ptr;
+	GLuint *out;
+	int i;
+	GLboolean mapped_named_bo = GL_FALSE;
+
+	if (mesa_ind_buf->obj->Name && !mesa_ind_buf->obj->Pointer) {
+		ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, GL_READ_ONLY_ARB, mesa_ind_buf->obj);
+		mapped_named_bo = GL_TRUE;
+		assert(mesa_ind_buf->obj->Pointer != NULL);
+	}
+	src_ptr = ADD_POINTERS(mesa_ind_buf->obj->Pointer, mesa_ind_buf->ptr);
+
+	radeon_print(RADEON_FALLBACKS, RADEON_IMPORTANT,
+			"%s: Fixing index buffer format. type %d\n",
+			__func__, mesa_ind_buf->type);
+
+	if (mesa_ind_buf->type == GL_UNSIGNED_BYTE) {
+		GLuint size = sizeof(GLushort) * ((mesa_ind_buf->count + 1) & ~1);
+		GLubyte *in = (GLubyte *)src_ptr;
+		/* size: one 16-bit slot per index, count rounded up to even */
+		radeonAllocDmaRegion(&r300->radeon, &r300->ind_buf.bo, &r300->ind_buf.bo_offset, size, 4);
+		assert(r300->ind_buf.bo->ptr != NULL);
+		out = (GLuint *)ADD_POINTERS(r300->ind_buf.bo->ptr, r300->ind_buf.bo_offset);
+		for (i = 0; i + 1 < mesa_ind_buf->count; i += 2) {
+			*out++ = in[i] | in[i + 1] << 16;
+		}
+		/* odd count: the last index gets a word to itself */
+		if (i < mesa_ind_buf->count) {
+			*out++ = in[i];
+		}
+
+#if MESA_BIG_ENDIAN
+	} else { /* if (mesa_ind_buf->type == GL_UNSIGNED_SHORT) */
+		GLushort *in = (GLushort *)src_ptr;
+		GLuint size = sizeof(GLushort) * ((mesa_ind_buf->count + 1) & ~1);
+		/* same packing as above, reading 16-bit source indices */
+		radeonAllocDmaRegion(&r300->radeon, &r300->ind_buf.bo,
+				     &r300->ind_buf.bo_offset, size, 4);
+		assert(r300->ind_buf.bo->ptr != NULL);
+		out = (GLuint *)ADD_POINTERS(r300->ind_buf.bo->ptr, r300->ind_buf.bo_offset);
+		for (i = 0; i + 1 < mesa_ind_buf->count; i += 2) {
+			*out++ = in[i] | in[i + 1] << 16;
+		}
+		/* odd count: the last index gets a word to itself */
+		if (i < mesa_ind_buf->count) {
+			*out++ = in[i];
+		}
+#endif
+	}
+	/* the repacked buffer always holds 16-bit indices */
+	r300->ind_buf.is_32bit = GL_FALSE;
+	r300->ind_buf.count = mesa_ind_buf->count;
+
+	if (mapped_named_bo) {
+		ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, mesa_ind_buf->obj);
+	}
+}
+
+
+/**
+ * Upload the indices of the current draw to a DMA region. GL_UNSIGNED_BYTE
+ * (on big-endian builds: anything but GL_UNSIGNED_INT) is repacked by
+ * r300FixupIndexBuffer(); every other index type is copied verbatim.
+ */
+static void r300SetupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer *mesa_ind_buf)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	const GLvoid *indices;
+	GLvoid *dma_ptr;
+	GLuint bytes;
+	GLboolean did_map = GL_FALSE;
+
+	if (!mesa_ind_buf) {
+		r300->ind_buf.bo = NULL;
+		return;
+	}
+	radeon_print(RADEON_RENDER, RADEON_TRACE, "%s\n", __func__);
+
+#if MESA_BIG_ENDIAN
+	if (mesa_ind_buf->type != GL_UNSIGNED_INT) {
+#else
+	if (mesa_ind_buf->type == GL_UNSIGNED_BYTE) {
+#endif
+		r300FixupIndexBuffer(ctx, mesa_ind_buf);
+		return;
+	}
+
+	if (mesa_ind_buf->obj->Name && !mesa_ind_buf->obj->Pointer) {
+		ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, GL_READ_ONLY_ARB, mesa_ind_buf->obj);
+		assert(mesa_ind_buf->obj->Pointer != NULL);
+		did_map = GL_TRUE;
+	}
+	indices = ADD_POINTERS(mesa_ind_buf->obj->Pointer, mesa_ind_buf->ptr);
+	bytes = mesa_ind_buf->count * getTypeSize(mesa_ind_buf->type);
+	radeonAllocDmaRegion(&r300->radeon, &r300->ind_buf.bo, &r300->ind_buf.bo_offset, bytes, 4);
+	assert(r300->ind_buf.bo->ptr != NULL);
+	dma_ptr = ADD_POINTERS(r300->ind_buf.bo->ptr, r300->ind_buf.bo_offset);
+	_mesa_memcpy(dma_ptr, indices, bytes);
+
+	r300->ind_buf.is_32bit = (mesa_ind_buf->type == GL_UNSIGNED_INT);
+	r300->ind_buf.count = mesa_ind_buf->count;
+	if (did_map)
+		ctx->Driver.UnmapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, mesa_ind_buf->obj);
+}
+
+/*
+ * CONVERT(TYPE, MACRO): expand to a loop converting `count` vertices of
+ * `input->Size` components each from TYPE to GLfloat.
+ *
+ * Relies on these names from the expansion site: input, count, stride,
+ * src_ptr (advanced by stride per vertex) and dst_ptr (advanced per
+ * component). MACRO is applied to each value when input->Normalized is
+ * set; otherwise the value is simply cast to GLfloat.
+ */
+#define CONVERT( TYPE, MACRO ) do {					\
+	const GLuint comps = input->Size;				\
+	GLuint vert, c;							\
+	for (vert = 0; vert < count; vert++) {				\
+		const TYPE *elem = (TYPE *)src_ptr;			\
+		for (c = 0; c < comps; c++, elem++) {			\
+			if (input->Normalized)				\
+				*dst_ptr++ = MACRO(*elem);		\
+			else						\
+				*dst_ptr++ = (GLfloat)(*elem);		\
+		}							\
+		src_ptr += stride;					\
+	}								\
+} while (0)
+
+/**
+ * Convert one vertex attribute array to tightly packed GLfloat data.
+ *
+ * The result goes into a newly allocated DMA region whose bo and offset are
+ * stored in \p attr. A named buffer object that is not mapped yet is mapped
+ * for reading while the conversion runs.
+ * \param count  number of elements to convert; only the first element is
+ *               converted when input->StrideB is 0
+ */
+static void r300ConvertAttrib(GLcontext *ctx, int count, const struct gl_client_array *input, struct vertex_attribute *attr)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	const GLvoid *src_ptr = input->Ptr;
+	GLfloat *dst_ptr;
+	GLuint stride = input->StrideB;
+	GLboolean unmap_when_done = GL_FALSE;
+
+	if (input->StrideB == 0) {
+		/* Convert value for first element only */
+		stride = getTypeSize(input->Type) * input->Size;
+		count = 1;
+	}
+
+	if (input->BufferObj->Name) {
+		if (!input->BufferObj->Pointer) {
+			ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
+			unmap_when_done = GL_TRUE;
+		}
+		src_ptr = ADD_POINTERS(input->BufferObj->Pointer, input->Ptr);
+	}
+
+	radeonAllocDmaRegion(&r300->radeon, &attr->bo, &attr->bo_offset, sizeof(GLfloat) * input->Size * count, 32);
+	dst_ptr = (GLfloat *)ADD_POINTERS(attr->bo->ptr, attr->bo_offset);
+
+	radeon_print(RADEON_FALLBACKS, RADEON_IMPORTANT,
+			"%s: Converting vertex attributes, attribute data format %x,"
+			"stride %d, components %d\n"
+			, __FUNCTION__, input->Type
+			, stride, input->Size);
+	assert(src_ptr != NULL);
+
+	switch (input->Type) {
+	case GL_DOUBLE:
+		CONVERT(GLdouble, (GLfloat));
+		break;
+	case GL_UNSIGNED_INT:
+		CONVERT(GLuint, UINT_TO_FLOAT);
+		break;
+	case GL_INT:
+		CONVERT(GLint, INT_TO_FLOAT);
+		break;
+	case GL_UNSIGNED_SHORT:
+		CONVERT(GLushort, USHORT_TO_FLOAT);
+		break;
+	case GL_SHORT:
+		CONVERT(GLshort, SHORT_TO_FLOAT);
+		break;
+	case GL_UNSIGNED_BYTE:
+		assert(input->Format != GL_BGRA);
+		CONVERT(GLubyte, UBYTE_TO_FLOAT);
+		break;
+	case GL_BYTE:
+		CONVERT(GLbyte, BYTE_TO_FLOAT);
+		break;
+	default:
+		assert(0);
+		break;
+	}
+
+	if (unmap_when_done)
+		ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER, input->BufferObj);
+}
+
+/**
+ * Copy an attribute array out of a buffer object into a new DMA region,
+ * padding the stride up to a multiple of 4 bytes. Sets attr->bo,
+ * attr->bo_offset and attr->stride.
+ */
+static void r300AlignDataToDword(GLcontext *ctx, const struct gl_client_array *input, int count, struct vertex_attribute *attr)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	const int dst_stride = (input->StrideB + 3) & ~3;
+	/* Every element occupies dst_stride bytes in the destination */
+	const int size = dst_stride * count;
+	GLboolean mapped_named_bo = GL_FALSE;
+	const GLubyte *src_ptr;
+	GLubyte *dst_ptr;
+	int i;
+
+	radeonAllocDmaRegion(&r300->radeon, &attr->bo, &attr->bo_offset, size, 32);
+
+	if (!input->BufferObj->Pointer) {
+		ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
+		mapped_named_bo = GL_TRUE;
+	}
+
+	radeon_print(RADEON_FALLBACKS, RADEON_IMPORTANT, "%s. Vertex alignment doesn't match hw requirements.\n", __func__);
+	src_ptr = ADD_POINTERS(input->BufferObj->Pointer, input->Ptr);
+	dst_ptr = ADD_POINTERS(attr->bo->ptr, attr->bo_offset);
+	for (i = 0; i < count; ++i, src_ptr += input->StrideB, dst_ptr += dst_stride)
+		_mesa_memcpy(dst_ptr, src_ptr, input->StrideB);
+
+	if (mapped_named_bo)
+		ctx->Driver.UnmapBuffer(ctx, GL_ARRAY_BUFFER, input->BufferObj);
+
+	attr->stride = dst_stride;
+}
+
+/**
+ * Describe vertex attribute \p attr for the hardware and append the
+ * description to r300->vbuf.attribs.
+ *
+ * Only the format is decided here (data type, signedness, normalization,
+ * dword count, swizzle, write mask); no vertex data is touched. Attributes
+ * matching the conversion condition below are described as GL_FLOAT data.
+ * For data in a named buffer object, stride and is_named_bo stay zero here
+ * and are filled in by r300AllocDmaRegions().
+ *
+ * \param ctx    GL context
+ * \param attr   vertex attribute index, stored in the element field
+ * \param count  not used
+ * \param input  client array describing the attribute
+ */
+static void r300TranslateAttrib(GLcontext *ctx, GLuint attr, int count, const struct gl_client_array *input)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_vertex_buffer *vbuf = &r300->vbuf;
+	/* Zeroed so that fields no path below assigns are not left indeterminate */
+	struct vertex_attribute r300_attr = { 0 };
+	GLenum type;
+	GLuint stride;
+
+	radeon_print(RADEON_RENDER, RADEON_TRACE, "%s\n", __func__);
+	stride = (input->StrideB == 0) ? getTypeSize(input->Type) * input->Size : input->StrideB;
+
+	if (input->Type == GL_DOUBLE || input->Type == GL_UNSIGNED_INT || input->Type == GL_INT ||
+#if MESA_BIG_ENDIAN
+	    getTypeSize(input->Type) != 4 ||
+#endif
+	    stride < 4) {
+		/* Same condition as in r300AllocDmaRegions(): data gets converted to floats */
+		type = GL_FLOAT;
+		r300_attr.stride = (input->StrideB == 0) ? 0 : sizeof(GLfloat) * input->Size;
+		r300_attr.dwords = input->Size;
+		r300_attr.is_named_bo = GL_FALSE;
+	} else {
+		type = input->Type;
+		r300_attr.dwords = (getTypeSize(type) * input->Size + 3)/ 4;
+		if (!input->BufferObj->Name) {
+			/* Not in a named buffer object: data is repacked with a dword aligned stride */
+			r300_attr.stride = (input->StrideB == 0) ? 0 : (getTypeSize(type) * input->Size + 3) & ~3;
+			r300_attr.is_named_bo = GL_FALSE;
+		}
+	}
+
+	r300_attr.size = input->Size;
+	r300_attr.element = attr;
+	r300_attr.dst_loc = vbuf->num_attribs;
+
+	switch (type) {
+	case GL_FLOAT:
+		switch (input->Size) {
+		case 1: r300_attr.data_type = R300_DATA_TYPE_FLOAT_1; break;
+		case 2: r300_attr.data_type = R300_DATA_TYPE_FLOAT_2; break;
+		case 3: r300_attr.data_type = R300_DATA_TYPE_FLOAT_3; break;
+		case 4: r300_attr.data_type = R300_DATA_TYPE_FLOAT_4; break;
+		default: assert(0); break;
+		}
+		r300_attr._signed = 0;
+		r300_attr.normalize = 0;
+		break;
+	case GL_SHORT:
+	case GL_UNSIGNED_SHORT:
+		r300_attr._signed = (type == GL_SHORT);
+		r300_attr.normalize = input->Normalized;
+		switch (input->Size) {
+		case 1:
+		case 2:
+			r300_attr.data_type = R300_DATA_TYPE_SHORT_2;
+			break;
+		case 3:
+		case 4:
+			r300_attr.data_type = R300_DATA_TYPE_SHORT_4;
+			break;
+		default:
+			assert(0);
+			break;
+		}
+		break;
+	case GL_BYTE:
+		r300_attr._signed = 1;
+		r300_attr.normalize = input->Normalized;
+		r300_attr.data_type = R300_DATA_TYPE_BYTE;
+		break;
+	case GL_UNSIGNED_BYTE:
+		r300_attr._signed = 0;
+		r300_attr.normalize = input->Normalized;
+		if (input->Format == GL_BGRA)
+			r300_attr.data_type = R300_DATA_TYPE_D3DCOLOR;
+		else
+			r300_attr.data_type = R300_DATA_TYPE_BYTE;
+		break;
+
+	default:
+	case GL_DOUBLE:
+	case GL_INT:
+	case GL_UNSIGNED_INT:
+		assert(0);
+		break;
+	}
+
+	switch (input->Size) {
+	case 4:
+		r300_attr.swizzle = SWIZZLE_XYZW;
+		break;
+	case 3:
+		r300_attr.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
+		break;
+	case 2:
+		r300_attr.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_ZERO, SWIZZLE_ONE);
+		break;
+	case 1:
+		r300_attr.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ONE);
+		break;
+	default:
+		assert(0);
+		break;
+	}
+
+	r300_attr.write_mask = MASK_XYZW;
+
+	vbuf->attribs[vbuf->num_attribs] = r300_attr;
+	++vbuf->num_attribs;
+}
+
+/**
+ * Rebuild r300->vbuf.attribs from the inputs read by the selected vertex
+ * program and raise R300_FALLBACK_AOS_LIMIT if there are too many of them.
+ *
+ * \param ctx     GL context
+ * \param arrays  client arrays indexed by vertex attribute
+ * \param count   vertex count, passed through to r300TranslateAttrib()
+ */
+static void r300SetVertexFormat(GLcontext *ctx, const struct gl_client_array *arrays[], int count)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_vertex_buffer *vbuf = &r300->vbuf;
+	/* Unsigned on purpose: shifting a signed mask right would smear a set
+	 * top bit and the loop below would then never run out of bits. */
+	GLuint inputs = r300->selected_vp->code.InputsRead;
+	GLuint i;
+
+	radeon_print(RADEON_RENDER, RADEON_VERBOSE, "%s\n", __func__);
+
+	vbuf->num_attribs = 0;
+	/* Walk the mask from bit 0 upwards, one attribute per set bit */
+	for (i = 0; inputs != 0; inputs >>= 1, ++i) {
+		if (inputs & 1)
+			r300TranslateAttrib(ctx, i, count, arrays[i]);
+	}
+
+	/* More arrays than R300_MAX_AOS_ARRAYS: flag the fallback */
+	r300SwitchFallback(ctx, R300_FALLBACK_AOS_LIMIT, vbuf->num_attribs > R300_MAX_AOS_ARRAYS);
+}
+
+/**
+ * Make the data of every attribute in r300->vbuf available to the GPU and
+ * fill in r300->radeon.tcl.aos[] to match. Depending on the attribute this
+ * means converting to floats, copying out of a buffer object to fix the
+ * stride, using the buffer object in place, or uploading client memory.
+ * Raises R300_FALLBACK_INVALID_BUFFERS if the space check at the end fails.
+ *
+ * \param input  client arrays indexed by vertex attribute
+ * \param count  number of vertices to make available
+ */
+static void r300AllocDmaRegions(GLcontext *ctx, const struct gl_client_array *input[], int count)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_vertex_buffer *vbuf = &r300->vbuf;
+	GLuint stride;
+	int ret;
+	int i, index;
+	radeon_print(RADEON_RENDER, RADEON_VERBOSE,
+			"%s: count %d num_attribs %d\n",
+			__func__, count, vbuf->num_attribs);
+
+	for (index = 0; index < vbuf->num_attribs; index++) {
+		struct radeon_aos *aos = &r300->radeon.tcl.aos[index];
+		struct vertex_attribute *va = &vbuf->attribs[index];
+		i = va->element;
+
+		stride = (input[i]->StrideB == 0) ? getTypeSize(input[i]->Type) * input[i]->Size : input[i]->StrideB;
+
+		if (input[i]->Type == GL_DOUBLE || input[i]->Type == GL_UNSIGNED_INT || input[i]->Type == GL_INT ||
+#if MESA_BIG_ENDIAN
+				getTypeSize(input[i]->Type) != 4 ||
+#endif
+				stride < 4) {
+			r300ConvertAttrib(ctx, count, input[i], va);
+		} else if (input[i]->BufferObj->Name) {
+			if (stride % 4 != 0) {
+				assert(((intptr_t) input[i]->Ptr) % input[i]->StrideB == 0);
+				r300AlignDataToDword(ctx, input[i], count, va);
+				va->is_named_bo = GL_FALSE;
+			} else {
+				va->stride = input[i]->StrideB;
+				va->bo_offset = (intptr_t) input[i]->Ptr;
+				va->bo = get_radeon_buffer_object(input[i]->BufferObj)->bo;
+				va->is_named_bo = GL_TRUE;
+			}
+		} else {
+			const int local_count = (input[i]->StrideB == 0) ? 1 : count;
+			/* radeonEmitVec* write va->dwords whole dwords per element, which can
+			 * be more than getTypeSize() * Size bytes; size the region to match. */
+			const int size = va->dwords * 4 * local_count;
+			uint32_t *dst;
+
+			radeonAllocDmaRegion(&r300->radeon, &va->bo, &va->bo_offset, size, 32);
+			assert(va->bo->ptr != NULL);
+			dst = (uint32_t *)ADD_POINTERS(va->bo->ptr, va->bo_offset);
+			switch (va->dwords) {
+			case 1: radeonEmitVec4(dst, input[i]->Ptr, input[i]->StrideB, local_count); break;
+			case 2: radeonEmitVec8(dst, input[i]->Ptr, input[i]->StrideB, local_count); break;
+			case 3: radeonEmitVec12(dst, input[i]->Ptr, input[i]->StrideB, local_count); break;
+			case 4: radeonEmitVec16(dst, input[i]->Ptr, input[i]->StrideB, local_count); break;
+			default: assert(0); break;
+			}
+		}
+
+		aos->count = va->stride == 0 ? 1 : count;
+		aos->stride = va->stride / sizeof(float);
+		aos->components = va->dwords;
+		aos->bo = va->bo;
+		aos->offset = va->bo_offset;
+
+		if (va->is_named_bo) {
+			radeon_cs_space_add_persistent_bo(r300->radeon.cmdbuf.cs, va->bo, RADEON_GEM_DOMAIN_GTT, 0);
+		}
+	}
+
+	r300->radeon.tcl.aos_count = vbuf->num_attribs;
+	ret = radeon_cs_space_check_with_bo(r300->radeon.cmdbuf.cs, first_elem(&r300->radeon.dma.reserved)->bo, RADEON_GEM_DOMAIN_GTT, 0);
+	r300SwitchFallback(ctx, R300_FALLBACK_INVALID_BUFFERS, ret);
+}
+
+/**
+ * Release the DMA regions referenced for the last draw (attribute data not
+ * living in a named buffer object, and the index buffer) and clear the
+ * pointers that would otherwise be left dangling.
+ */
+static void r300FreeData(GLcontext *ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	int i;
+
+	radeon_print(RADEON_RENDER, RADEON_VERBOSE, "%s\n", __func__);
+
+	/* Zero tcl.aos[n].bo too, to prevent a double unref in
+	 * radeonReleaseArrays called during context destroy */
+	for (i = 0; i < r300->vbuf.num_attribs; i++) {
+		if (!r300->vbuf.attribs[i].is_named_bo)
+			radeon_bo_unref(r300->vbuf.attribs[i].bo);
+		r300->radeon.tcl.aos[i].bo = NULL;
+	}
+
+	if (r300->ind_buf.bo != NULL) {
+		radeon_bo_unref(r300->ind_buf.bo);
+		r300->ind_buf.bo = NULL;
+	}
+}
+
+/**
+ * Estimate how many command stream dwords drawing \p nr_prims primitives
+ * will need and make sure that much space is available up front.
+ *
+ * \param ctx       GL context
+ * \param nr_prims  number of primitives about to be drawn
+ * \return the estimate in dwords, state emission included
+ */
+static GLuint r300PredictTryDrawPrimsSize(GLcontext *ctx, GLuint nr_prims)
+{
+	struct r300_context *r300 = R300_CONTEXT(ctx);
+	const GLuint per_prim = AOS_BUFSZ(r300->vbuf.num_attribs)
+		+ SCISSORS_BUFSZ*2
+		+ FIREAOS_BUFSZ;
+	/* Fixed overhead plus the per-primitive cost */
+	GLuint total = 2*CACHE_FLUSH_BUFSZ + PRE_EMIT_STATE_BUFSZ + per_prim*nr_prims;
+	GLuint state_dwords = radeonCountStateEmitSize(&r300->radeon);
+
+	/* Recount after a flush; presumably the state to emit differs then */
+	if (rcommonEnsureCmdBufSpace(&r300->radeon, total + state_dwords, __FUNCTION__))
+		state_dwords = radeonCountStateEmitSize(&r300->radeon);
+	total += state_dwords;
+
+	radeon_print(RADEON_RENDER, RADEON_VERBOSE, "%s: total prediction size is %d.\n", __FUNCTION__, total);
+	return total;
+}
+
+/**
+ * Try to render the primitives with the hardware.
+ *
+ * \return GL_FALSE, with nothing emitted, when a fallback condition is active
+ */
+static GLboolean r300TryDrawPrims(GLcontext *ctx,
+					 const struct gl_client_array *arrays[],
+					 const struct _mesa_prim *prim,
+					 GLuint nr_prims,
+					 const struct _mesa_index_buffer *ib,
+					 GLuint min_index,
+					 GLuint max_index )
+{
+	struct r300_context *r300 = R300_CONTEXT(ctx);
+	GLuint i;
+
+	radeon_print(RADEON_RENDER, RADEON_NORMAL, "%s: %u (%d-%d) cs begin at %d\n",
+				__FUNCTION__, nr_prims, min_index, max_index, r300->radeon.cmdbuf.cs->cdw );
+
+	if (ctx->NewState)
+		_mesa_update_state( ctx );
+	if (r300->options.hw_tcl_enabled)
+		_tnl_UpdateFixedFunctionProgram(ctx);
+	r300UpdateShaders(r300);
+
+	r300SwitchFallback(ctx, R300_FALLBACK_INVALID_BUFFERS, !r300ValidateBuffers(ctx));
+	r300SetVertexFormat(ctx, arrays, max_index + 1);
+	if (r300->fallback)
+		return GL_FALSE;
+
+	r300SetupVAP(ctx, r300->selected_vp->code.InputsRead, r300->selected_vp->code.OutputsWritten);
+	r300UpdateShaderStates(r300);
+
+	/* ensure we have the cmd buf space in advance to cover
+	 * the state + DMA AOS pointers */
+	GLuint emit_end = r300PredictTryDrawPrimsSize(ctx, nr_prims)
+		+ r300->radeon.cmdbuf.cs->cdw;
+
+	r300SetupIndexBuffer(ctx, ib);
+	r300AllocDmaRegions(ctx, arrays, max_index + 1);
+	if (r300->fallback) {
+		/* The two calls above took buffer references; give them back
+		 * before bailing out or they are never released. */
+		r300FreeData(ctx);
+		return GL_FALSE;
+	}
+
+	r300EmitCacheFlush(r300);
+	radeonEmitState(&r300->radeon);
+
+	for (i = 0; i < nr_prims; ++i) {
+		r300RunRenderPrimitive(ctx, prim[i].start, prim[i].start + prim[i].count, prim[i].mode);
+	}
+
+	r300EmitCacheFlush(r300);
+	r300FreeData(ctx);
+
+	radeon_print(RADEON_RENDER, RADEON_VERBOSE, "%s: %u (%d-%d) cs ending at %d\n",
+			__FUNCTION__, nr_prims, min_index, max_index, r300->radeon.cmdbuf.cs->cdw );
+	if (emit_end < r300->radeon.cmdbuf.cs->cdw)
+		WARN_ONCE("Rendering was %d commands larger than predicted size."
+				" We might overflow  command buffer.\n", r300->radeon.cmdbuf.cs->cdw - emit_end);
+
+	return GL_TRUE;
+}
+
+/**
+ * vbo draw_prims hook: draw with the hardware if possible, otherwise hand the
+ * primitives to the tnl module. Draws whose smallest index is not 0 go through
+ * vbo_rebase_prims() first, with this function as the draw callback.
+ */
+static void r300DrawPrims(GLcontext *ctx,
+			 const struct gl_client_array *arrays[],
+			 const struct _mesa_prim *prim,
+			 GLuint nr_prims,
+			 const struct _mesa_index_buffer *ib,
+			 GLboolean index_bounds_valid,
+			 GLuint min_index,
+			 GLuint max_index)
+{
+	/* This check should get folded into just the places that
+	 * min/max index are really needed.
+	 */
+	if (!index_bounds_valid)
+		vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
+
+	if (min_index != 0) {
+		radeon_print(RADEON_FALLBACKS, RADEON_IMPORTANT,
+				"%s: Rebasing primitives. %p nr_prims %d min_index %u max_index %u\n",
+				__func__, prim, nr_prims, min_index, max_index);
+		vbo_rebase_prims( ctx, arrays, prim, nr_prims, ib, min_index, max_index, r300DrawPrims );
+		return;
+	}
+
+	/* Hardware first; if that fails the tnl pipeline should take care of fallbacks */
+	if (r300TryDrawPrims(ctx, arrays, prim, nr_prims, ib, min_index, max_index))
+		return;
+	_tnl_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+}
+
+/* Install r300DrawPrims as the draw_prims callback of this context's
+ * vbo module. */
+void r300InitDraw(GLcontext *ctx)
+{
+	vbo_context(ctx)->draw_prims = r300DrawPrims;
+}
diff --git a/src/mesa/drivers/dri/r300/r300_emit.c b/src/mesa/drivers/dri/r300/r300_emit.c
index 80bd3389ae..07e6223087 100644
--- a/src/mesa/drivers/dri/r300/r300_emit.c
+++ b/src/mesa/drivers/dri/r300/r300_emit.c
@@ -31,6 +31,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * \file
  *
  * \author Keith Whitwell <keith@tungstengraphics.com>
+ * \author Maciej Cencora <m.cencora@gmail.com>
  */
 
 #include "main/glheader.h"
@@ -46,222 +47,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/t_context.h"
 
 #include "r300_context.h"
-#include "radeon_ioctl.h"
 #include "r300_state.h"
 #include "r300_emit.h"
 #include "r300_ioctl.h"
-
-#ifdef USER_BUFFERS
-#include "r300_mem.h"
-#endif
-
-#if SWIZZLE_X != R300_INPUT_ROUTE_SELECT_X || \
-    SWIZZLE_Y != R300_INPUT_ROUTE_SELECT_Y || \
-    SWIZZLE_Z != R300_INPUT_ROUTE_SELECT_Z || \
-    SWIZZLE_W != R300_INPUT_ROUTE_SELECT_W || \
-    SWIZZLE_ZERO != R300_INPUT_ROUTE_SELECT_ZERO || \
-    SWIZZLE_ONE != R300_INPUT_ROUTE_SELECT_ONE
-#error Cannot change these!
-#endif
-
-#define DEBUG_ALL DEBUG_VERTS
-
-#if defined(USE_X86_ASM)
-#define COPY_DWORDS( dst, src, nr )					\
-do {									\
-	int __tmp;							\
-	__asm__ __volatile__( "rep ; movsl"				\
-			      : "=%c" (__tmp), "=D" (dst), "=S" (__tmp)	\
-			      : "0" (nr),				\
-			        "D" ((long)dst),			\
-			        "S" ((long)src) );			\
-} while (0)
-#else
-#define COPY_DWORDS( dst, src, nr )		\
-do {						\
-   int j;					\
-   for ( j = 0 ; j < nr ; j++ )			\
-      dst[j] = ((int *)src)[j];			\
-   dst += nr;					\
-} while (0)
-#endif
-
-static void r300EmitVec4(GLcontext * ctx, struct r300_dma_region *rvb,
-			 GLvoid * data, int stride, int count)
-{
-	int i;
-	int *out = (int *)(rvb->address + rvb->start);
-
-	if (RADEON_DEBUG & DEBUG_VERTS)
-		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-			__FUNCTION__, count, stride, (void *)out, (void *)data);
-
-	if (stride == 4)
-		COPY_DWORDS(out, data, count);
-	else
-		for (i = 0; i < count; i++) {
-			out[0] = *(int *)data;
-			out++;
-			data += stride;
-		}
-}
-
-static void r300EmitVec8(GLcontext * ctx, struct r300_dma_region *rvb,
-			 GLvoid * data, int stride, int count)
-{
-	int i;
-	int *out = (int *)(rvb->address + rvb->start);
-
-	if (RADEON_DEBUG & DEBUG_VERTS)
-		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-			__FUNCTION__, count, stride, (void *)out, (void *)data);
-
-	if (stride == 8)
-		COPY_DWORDS(out, data, count * 2);
-	else
-		for (i = 0; i < count; i++) {
-			out[0] = *(int *)data;
-			out[1] = *(int *)(data + 4);
-			out += 2;
-			data += stride;
-		}
-}
-
-static void r300EmitVec12(GLcontext * ctx, struct r300_dma_region *rvb,
-			  GLvoid * data, int stride, int count)
-{
-	int i;
-	int *out = (int *)(rvb->address + rvb->start);
-
-	if (RADEON_DEBUG & DEBUG_VERTS)
-		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-			__FUNCTION__, count, stride, (void *)out, (void *)data);
-
-	if (stride == 12)
-		COPY_DWORDS(out, data, count * 3);
-	else
-		for (i = 0; i < count; i++) {
-			out[0] = *(int *)data;
-			out[1] = *(int *)(data + 4);
-			out[2] = *(int *)(data + 8);
-			out += 3;
-			data += stride;
-		}
-}
-
-static void r300EmitVec16(GLcontext * ctx, struct r300_dma_region *rvb,
-			  GLvoid * data, int stride, int count)
-{
-	int i;
-	int *out = (int *)(rvb->address + rvb->start);
-
-	if (RADEON_DEBUG & DEBUG_VERTS)
-		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-			__FUNCTION__, count, stride, (void *)out, (void *)data);
-
-	if (stride == 16)
-		COPY_DWORDS(out, data, count * 4);
-	else
-		for (i = 0; i < count; i++) {
-			out[0] = *(int *)data;
-			out[1] = *(int *)(data + 4);
-			out[2] = *(int *)(data + 8);
-			out[3] = *(int *)(data + 12);
-			out += 4;
-			data += stride;
-		}
-}
-
-static void r300EmitVec(GLcontext * ctx, struct r300_dma_region *rvb,
-			GLvoid * data, int size, int stride, int count)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-
-	if (stride == 0) {
-		r300AllocDmaRegion(rmesa, rvb, size * 4, 4);
-		count = 1;
-		rvb->aos_offset = GET_START(rvb);
-		rvb->aos_stride = 0;
-	} else {
-		r300AllocDmaRegion(rmesa, rvb, size * count * 4, 4);
-		rvb->aos_offset = GET_START(rvb);
-		rvb->aos_stride = size;
-	}
-
-	switch (size) {
-	case 1:
-		r300EmitVec4(ctx, rvb, data, stride, count);
-		break;
-	case 2:
-		r300EmitVec8(ctx, rvb, data, stride, count);
-		break;
-	case 3:
-		r300EmitVec12(ctx, rvb, data, stride, count);
-		break;
-	case 4:
-		r300EmitVec16(ctx, rvb, data, stride, count);
-		break;
-	default:
-		assert(0);
-		break;
-	}
-}
-
-#define DW_SIZE(x) ((inputs[tab[(x)]] << R300_DST_VEC_LOC_SHIFT) |	\
-		    (attribptr[tab[(x)]]->size - 1) << R300_DATA_TYPE_0_SHIFT)
-
-GLuint r300VAPInputRoute0(uint32_t * dst, GLvector4f ** attribptr,
-				 int *inputs, GLint * tab, GLuint nr)
-{
-	GLuint i, dw;
-
-	/* type, inputs, stop bit, size */
-	for (i = 0; i < nr; i += 2) {
-		/* make sure input is valid, would lockup the gpu */
-		assert(inputs[tab[i]] != -1);
-		dw = (R300_SIGNED | DW_SIZE(i));
-		if (i + 1 == nr) {
-			dw |= R300_LAST_VEC << R300_DATA_TYPE_0_SHIFT;
-		} else {
-			assert(inputs[tab[i + 1]] != -1);
-			dw |= (R300_SIGNED |
-			       DW_SIZE(i + 1)) << R300_DATA_TYPE_1_SHIFT;
-			if (i + 2 == nr) {
-				dw |= R300_LAST_VEC << R300_DATA_TYPE_1_SHIFT;
-			}
-		}
-		dst[i >> 1] = dw;
-	}
-
-	return (nr + 1) >> 1;
-}
-
-static GLuint r300VAPInputRoute1Swizzle(int swizzle[4])
-{
-	return (swizzle[0] << R300_SWIZZLE_SELECT_X_SHIFT) |
-	    (swizzle[1] << R300_SWIZZLE_SELECT_Y_SHIFT) |
-	    (swizzle[2] << R300_SWIZZLE_SELECT_Z_SHIFT) |
-	    (swizzle[3] << R300_SWIZZLE_SELECT_W_SHIFT);
-}
-
-GLuint r300VAPInputRoute1(uint32_t * dst, int swizzle[][4], GLuint nr)
-{
-	GLuint i, dw;
-
-	for (i = 0; i < nr; i += 2) {
-		dw = (r300VAPInputRoute1Swizzle(swizzle[i]) |
-		      ((R300_WRITE_ENA_X | R300_WRITE_ENA_Y |
-			R300_WRITE_ENA_Z | R300_WRITE_ENA_W) << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE0_SHIFT;
-		if (i + 1 < nr) {
-			dw |= (r300VAPInputRoute1Swizzle(swizzle[i + 1]) |
-			       ((R300_WRITE_ENA_X | R300_WRITE_ENA_Y |
-				 R300_WRITE_ENA_Z | R300_WRITE_ENA_W) << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE1_SHIFT;
-		}
-		dst[i >> 1] = dw;
-	}
-
-	return (nr + 1) >> 1;
-}
+#include "r300_render.h"
+#include "r300_swtcl.h"
 
 GLuint r300VAPInputCntl0(GLcontext * ctx, GLuint InputsRead)
 {
@@ -272,7 +62,6 @@ GLuint r300VAPInputCntl0(GLcontext * ctx, GLuint InputsRead)
 
 GLuint r300VAPInputCntl1(GLcontext * ctx, GLuint InputsRead)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	GLuint i, vic_1 = 0;
 
 	if (InputsRead & (1 << VERT_ATTRIB_POS))
@@ -284,276 +73,68 @@ GLuint r300VAPInputCntl1(GLcontext * ctx, GLuint InputsRead)
 	if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
 		vic_1 |= R300_INPUT_CNTL_COLOR;
 
-	rmesa->state.texture.tc_count = 0;
 	for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
 		if (InputsRead & (1 << (VERT_ATTRIB_TEX0 + i))) {
-			rmesa->state.texture.tc_count++;
 			vic_1 |= R300_INPUT_CNTL_TC0 << i;
 		}
 
 	return vic_1;
 }
 
-GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint OutputsWritten)
+GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint vp_writes)
 {
 	GLuint ret = 0;
 
-	if (OutputsWritten & (1 << VERT_RESULT_HPOS))
+	if (vp_writes & (1 << VERT_RESULT_HPOS))
 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT;
 
-	if (OutputsWritten & (1 << VERT_RESULT_COL0))
+	if (vp_writes & (1 << VERT_RESULT_COL0))
 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT;
 
-	if (OutputsWritten & (1 << VERT_RESULT_COL1))
+	if (vp_writes & (1 << VERT_RESULT_COL1))
 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT;
 
-	if (OutputsWritten & (1 << VERT_RESULT_BFC0)
-	    || OutputsWritten & (1 << VERT_RESULT_BFC1))
-		ret |=
-		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT |
-		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT |
-		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT;
+	/* Two sided lighting works only if all 4 colors are written */
+	if (vp_writes & (1 << VERT_RESULT_BFC0) || vp_writes & (1 << VERT_RESULT_BFC1))
+		ret |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT | R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT |
+			   R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT | R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT;
 
-#if 0
-	if (OutputsWritten & (1 << VERT_RESULT_FOGC)) ;
-#endif
-
-	if (OutputsWritten & (1 << VERT_RESULT_PSIZ))
+	if (vp_writes & (1 << VERT_RESULT_PSIZ))
 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT;
 
 	return ret;
 }
 
-GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint OutputsWritten)
+GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint vp_writes)
 {
-	GLuint i, ret = 0;
+	GLuint i, ret = 0, first_free_texcoord = 0;
 
 	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-		if (OutputsWritten & (1 << (VERT_RESULT_TEX0 + i))) {
-			ret |= (4 << (3 * i));
-		}
-	}
-
-	return ret;
-}
-
-/* Emit vertex data to GART memory
- * Route inputs to the vertex processor
- * This function should never return R300_FALLBACK_TCL when using software tcl.
- */
-int r300EmitArrays(GLcontext * ctx)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *vb = &tnl->vb;
-	GLuint nr;
-	GLuint count = vb->Count;
-	GLuint i;
-	GLuint InputsRead = 0, OutputsWritten = 0;
-	int *inputs = NULL;
-	int vir_inputs[VERT_ATTRIB_MAX];
-	GLint tab[VERT_ATTRIB_MAX];
-	int swizzle[VERT_ATTRIB_MAX][4];
-	struct r300_vertex_program *prog =
-	    (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
-
-	if (hw_tcl_on) {
-		inputs = prog->inputs;
-		InputsRead = prog->key.InputsRead;
-		OutputsWritten = prog->key.OutputsWritten;
-	} else {
-		inputs = rmesa->state.sw_tcl_inputs;
-
-		DECLARE_RENDERINPUTS(render_inputs_bitset);
-		RENDERINPUTS_COPY(render_inputs_bitset, tnl->render_inputs_bitset);
-
-		vb->AttribPtr[VERT_ATTRIB_POS] = vb->ClipPtr;
-
-		assert(RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_POS));
-		assert(RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_NORMAL) == 0);
-		//assert(RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_COLOR0));
-
-		if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_POS)) {
-			InputsRead |= 1 << VERT_ATTRIB_POS;
-			OutputsWritten |= 1 << VERT_RESULT_HPOS;
-		}
-
-		if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_COLOR0)) {
-			InputsRead |= 1 << VERT_ATTRIB_COLOR0;
-			OutputsWritten |= 1 << VERT_RESULT_COL0;
-		}
-
-		if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_COLOR1)) {
-			InputsRead |= 1 << VERT_ATTRIB_COLOR1;
-			OutputsWritten |= 1 << VERT_RESULT_COL1;
-		}
-
-		for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-			if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_TEX(i))) {
-				InputsRead |= 1 << (VERT_ATTRIB_TEX0 + i);
-				OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
-			}
-		}
-
-		for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
-			if (InputsRead & (1 << i)) {
-				inputs[i] = nr++;
-			} else {
-				inputs[i] = -1;
-			}
-		}
-
-		/* Fixed, apply to vir0 only */
-		memcpy(vir_inputs, inputs, VERT_ATTRIB_MAX * sizeof(int));
-		inputs = vir_inputs;
-		if (InputsRead & VERT_ATTRIB_POS)
-			inputs[VERT_ATTRIB_POS] = 0;
-		if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
-			inputs[VERT_ATTRIB_COLOR0] = 2;
-		if (InputsRead & (1 << VERT_ATTRIB_COLOR1))
-			inputs[VERT_ATTRIB_COLOR1] = 3;
-		for (i = VERT_ATTRIB_TEX0; i <= VERT_ATTRIB_TEX7; i++)
-			if (InputsRead & (1 << i))
-				inputs[i] = 6 + (i - VERT_ATTRIB_TEX0);
-
-		RENDERINPUTS_COPY(rmesa->state.render_inputs_bitset, render_inputs_bitset);
-	}
-
-	assert(InputsRead);
-	assert(OutputsWritten);
-
-	for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
-		if (InputsRead & (1 << i)) {
-			tab[nr++] = i;
-		}
-	}
-
-	if (nr > R300_MAX_AOS_ARRAYS) {
-		return R300_FALLBACK_TCL;
-	}
-
-	for (i = 0; i < nr; i++) {
-		int ci, fix, found = 0;
-
-		swizzle[i][0] = SWIZZLE_ZERO;
-		swizzle[i][1] = SWIZZLE_ZERO;
-		swizzle[i][2] = SWIZZLE_ZERO;
-		swizzle[i][3] = SWIZZLE_ONE;
-
-		for (ci = 0; ci < vb->AttribPtr[tab[i]]->size; ci++) {
-			swizzle[i][ci] = ci;
-		}
-
-		if (r300IsGartMemory(rmesa, vb->AttribPtr[tab[i]]->data, 4)) {
-			if (vb->AttribPtr[tab[i]]->stride % 4) {
-				return R300_FALLBACK_TCL;
-			}
-			rmesa->state.aos[i].address = (void *)(vb->AttribPtr[tab[i]]->data);
-			rmesa->state.aos[i].start = 0;
-			rmesa->state.aos[i].aos_offset = r300GartOffsetFromVirtual(rmesa, vb->AttribPtr[tab[i]]->data);
-			rmesa->state.aos[i].aos_stride = vb->AttribPtr[tab[i]]->stride / 4;
-			rmesa->state.aos[i].aos_size = vb->AttribPtr[tab[i]]->size;
-		} else {
-			r300EmitVec(ctx, &rmesa->state.aos[i],
-				    vb->AttribPtr[tab[i]]->data,
-				    vb->AttribPtr[tab[i]]->size,
-				    vb->AttribPtr[tab[i]]->stride, count);
-		}
-
-		rmesa->state.aos[i].aos_size = vb->AttribPtr[tab[i]]->size;
-
-		for (fix = 0; fix <= 4 - vb->AttribPtr[tab[i]]->size; fix++) {
-			if ((rmesa->state.aos[i].aos_offset - _mesa_sizeof_type(GL_FLOAT) * fix) % 4) {
-				continue;
-			}
-			found = 1;
-			break;
-		}
-
-		if (found) {
-			if (fix > 0) {
-				WARN_ONCE("Feeling lucky?\n");
-			}
-			rmesa->state.aos[i].aos_offset -= _mesa_sizeof_type(GL_FLOAT) * fix;
-			for (ci = 0; ci < vb->AttribPtr[tab[i]]->size; ci++) {
-				swizzle[i][ci] += fix;
-			}
-		} else {
-			WARN_ONCE
-			    ("Cannot handle offset %x with stride %d, comp %d\n",
-			     rmesa->state.aos[i].aos_offset,
-			     rmesa->state.aos[i].aos_stride,
-			     vb->AttribPtr[tab[i]]->size);
-			return R300_FALLBACK_TCL;
+		if (vp_writes & (1 << (VERT_RESULT_TEX0 + i))) {
+			ret |= (4 << (3 * first_free_texcoord));
+			++first_free_texcoord;
 		}
 	}
 
-	/* Setup INPUT_ROUTE. */
-	R300_STATECHANGE(rmesa, vir[0]);
-	((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count =
-	    r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
-			       vb->AttribPtr, inputs, tab, nr);
-	R300_STATECHANGE(rmesa, vir[1]);
-	((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count =
-	    r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
-			       nr);
-
-	/* Setup INPUT_CNTL. */
-	R300_STATECHANGE(rmesa, vic);
-	rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
-	rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
-
-	/* Setup OUTPUT_VTX_FMT. */
-	R300_STATECHANGE(rmesa, vof);
-	rmesa->hw.vof.cmd[R300_VOF_CNTL_0] =
-	    r300VAPOutputCntl0(ctx, OutputsWritten);
-	rmesa->hw.vof.cmd[R300_VOF_CNTL_1] =
-	    r300VAPOutputCntl1(ctx, OutputsWritten);
-
-	rmesa->state.aos_count = nr;
-
-	return R300_FALLBACK_NONE;
-}
-
-#ifdef USER_BUFFERS
-void r300UseArrays(GLcontext * ctx)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	int i;
-
-	if (rmesa->state.elt_dma.buf)
-		r300_mem_use(rmesa, rmesa->state.elt_dma.buf->id);
-
-	for (i = 0; i < rmesa->state.aos_count; i++) {
-		if (rmesa->state.aos[i].buf)
-			r300_mem_use(rmesa, rmesa->state.aos[i].buf->id);
+	if (first_free_texcoord > 8) {
+		fprintf(stderr, "\tout of free texcoords\n");
+		_mesa_exit(-1);
 	}
-}
-#endif
 
-void r300ReleaseArrays(GLcontext * ctx)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	int i;
-
-	r300ReleaseDmaRegion(rmesa, &rmesa->state.elt_dma, __FUNCTION__);
-	for (i = 0; i < rmesa->state.aos_count; i++) {
-		r300ReleaseDmaRegion(rmesa, &rmesa->state.aos[i], __FUNCTION__);
-	}
+	return ret;
 }
 
 void r300EmitCacheFlush(r300ContextPtr rmesa)
 {
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-
-	drm_radeon_cmd_header_t *cmd = NULL;
-
-	reg_start(R300_RB3D_DSTCACHE_CTLSTAT, 0);
-	e32(R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS |
-	    R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
-
-	reg_start(R300_ZB_ZCACHE_CTLSTAT, 0);
-	e32(R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
-	    R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
+	BATCH_LOCALS(&rmesa->radeon);
+
+	BEGIN_BATCH_NO_AUTOSTATE(4);
+	OUT_BATCH_REGVAL(R300_RB3D_DSTCACHE_CTLSTAT,
+		R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS |
+		R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
+	OUT_BATCH_REGVAL(R300_ZB_ZCACHE_CTLSTAT,
+		R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
+		R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
+	END_BATCH();
+	COMMIT_BATCH();
 }
diff --git a/src/mesa/drivers/dri/r300/r300_emit.h b/src/mesa/drivers/dri/r300/r300_emit.h
index 89d738339f..8e57e354d1 100644
--- a/src/mesa/drivers/dri/r300/r300_emit.h
+++ b/src/mesa/drivers/dri/r300/r300_emit.h
@@ -44,28 +44,31 @@
 #include "r300_cmdbuf.h"
 #include "radeon_reg.h"
 
-/* TODO: move these defines (and the ones from DRM) into r300_reg.h and sync up
- * with DRM */
-#define CP_PACKET0(reg, n)	(RADEON_CP_PACKET0 | ((n)<<16) | ((reg)>>2))
-#define CP_PACKET3( pkt, n )						\
-	(RADEON_CP_PACKET3 | (pkt) | ((n) << 16))
-
-static INLINE uint32_t cmdpacket0(int reg, int count)
+static INLINE uint32_t cmdpacket0(struct radeon_screen *rscrn,
+                                  int reg, int count)
 {
-	drm_r300_cmd_header_t cmd;
-
-	cmd.packet0.cmd_type = R300_CMD_PACKET0;
-	cmd.packet0.count = count;
-	cmd.packet0.reghi = ((unsigned int)reg & 0xFF00) >> 8;
-	cmd.packet0.reglo = ((unsigned int)reg & 0x00FF);
-
-	return cmd.u;
+    if (!rscrn->kernel_mm) {
+	    drm_r300_cmd_header_t cmd;
+
+	cmd.u = 0;
+    	cmd.packet0.cmd_type = R300_CMD_PACKET0;
+	    cmd.packet0.count = count;
+    	cmd.packet0.reghi = ((unsigned int)reg & 0xFF00) >> 8;
+	    cmd.packet0.reglo = ((unsigned int)reg & 0x00FF);
+
+    	return cmd.u;
+    }
+    if (count) {
+        return CP_PACKET0(reg, count - 1);
+    }
+    return CP_PACKET2;
 }
 
-static INLINE uint32_t cmdvpu(int addr, int count)
+static INLINE uint32_t cmdvpu(struct radeon_screen *rscrn, int addr, int count)
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;
 	cmd.vpu.cmd_type = R300_CMD_VPU;
 	cmd.vpu.count = count;
 	cmd.vpu.adrhi = ((unsigned int)addr & 0xFF00) >> 8;
@@ -74,10 +77,12 @@ static INLINE uint32_t cmdvpu(int addr, int count)
 	return cmd.u;
 }
 
-static INLINE uint32_t cmdr500fp(int addr, int count, int type, int clamp)
+static INLINE uint32_t cmdr500fp(struct radeon_screen *rscrn,
+                                 int addr, int count, int type, int clamp)
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;
 	cmd.r500fp.cmd_type = R300_CMD_R500FP;
 	cmd.r500fp.count = count;
 	cmd.r500fp.adrhi_flags = ((unsigned int)addr & 0x100) >> 8;
@@ -88,181 +93,137 @@ static INLINE uint32_t cmdr500fp(int addr, int count, int type, int clamp)
 	return cmd.u;
 }
 
-static INLINE uint32_t cmdpacket3(int packet)
+static INLINE uint32_t cmdpacket3(struct radeon_screen *rscrn, int packet)
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;
 	cmd.packet3.cmd_type = R300_CMD_PACKET3;
 	cmd.packet3.packet = packet;
 
 	return cmd.u;
 }
 
-static INLINE uint32_t cmdcpdelay(unsigned short count)
+static INLINE uint32_t cmdcpdelay(struct radeon_screen *rscrn,
+                                  unsigned short count)
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;
+
 	cmd.delay.cmd_type = R300_CMD_CP_DELAY;
 	cmd.delay.count = count;
 
 	return cmd.u;
 }
 
-static INLINE uint32_t cmdwait(unsigned char flags)
+static INLINE uint32_t cmdwait(struct radeon_screen *rscrn,
+                               unsigned char flags)
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;
 	cmd.wait.cmd_type = R300_CMD_WAIT;
 	cmd.wait.flags = flags;
 
 	return cmd.u;
 }
 
-static INLINE uint32_t cmdpacify(void)
+static INLINE uint32_t cmdpacify(struct radeon_screen *rscrn)
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;
 	cmd.header.cmd_type = R300_CMD_END3D;
 
 	return cmd.u;
 }
 
 /**
- * Prepare to write a register value to register at address reg.
- * If num_extra > 0 then the following extra values are written
- * to registers with address +4, +8 and so on..
- */
-#define reg_start(reg, num_extra)					\
-	do {								\
-		int _n;							\
-		_n=(num_extra);						\
-		cmd = (drm_radeon_cmd_header_t*)			\
-			r300AllocCmdBuf(rmesa,				\
-					(_n+2),				\
-					__FUNCTION__);			\
-		cmd_reserved=_n+2;					\
-		cmd_written=1;						\
-		cmd[0].i=cmdpacket0((reg), _n+1);			\
-	} while (0);
-
-/**
- * Emit GLuint freestyle
+ * Write the header of a packet3 to the command buffer.
+ * Outputs 2 dwords and expects (num_extra+1) additional dwords afterwards.
  */
-#define e32(dword)							\
-	do {								\
-		if(cmd_written<cmd_reserved) {				\
-			cmd[cmd_written].i=(dword);			\
-			cmd_written++;					\
-		} else {						\
-			fprintf(stderr,					\
-				"e32 but no previous packet "		\
-				"declaration.\n"			\
-				"Aborting! in %s::%s at line %d, "	\
-				"cmd_written=%d cmd_reserved=%d\n",	\
-				__FILE__, __FUNCTION__, __LINE__,	\
-				cmd_written, cmd_reserved);		\
-			_mesa_exit(-1);					\
-		}							\
+#define OUT_BATCH_PACKET3(packet, num_extra) do {\
+    if (!b_l_rmesa->radeonScreen->kernel_mm) {		\
+    	OUT_BATCH(cmdpacket3(b_l_rmesa->radeonScreen,\
+                  R300_CMD_PACKET3_RAW)); \
+    } else b_l_rmesa->cmdbuf.cs->section_cdw++;\
+	OUT_BATCH(CP_PACKET3((packet), (num_extra))); \
 	} while(0)
 
-#define	efloat(f) e32(r300PackFloat32(f))
-
-#define vsf_start_fragment(dest, length)				\
-	do {								\
-		int _n;							\
-		_n = (length);						\
-		cmd = (drm_radeon_cmd_header_t*)			\
-			r300AllocCmdBuf(rmesa,				\
-					(_n+1),				\
-					__FUNCTION__);			\
-		cmd_reserved = _n+2;					\
-		cmd_written =1;						\
-		cmd[0].i = cmdvpu((dest), _n/4);			\
-	} while (0);
-
-#define r500fp_start_fragment(dest, length)				\
-	do {								\
-		int _n;							\
-		_n = (length);						\
-		cmd = (drm_radeon_cmd_header_t*)			\
-			r300AllocCmdBuf(rmesa,				\
-					(_n+1),				\
-					__FUNCTION__);			\
-		cmd_reserved = _n+1;					\
-		cmd_written =1;						\
-		cmd[0].i = cmdr500fp((dest), _n/6, 0, 0);		\
-	} while (0);
-
-#define start_packet3(packet, count)					\
-	{								\
-		int _n;							\
-		GLuint _p;						\
-		_n = (count);						\
-		_p = (packet);						\
-		cmd = (drm_radeon_cmd_header_t*)			\
-			r300AllocCmdBuf(rmesa,				\
-					(_n+3),				\
-					__FUNCTION__);			\
-		cmd_reserved = _n+3;					\
-		cmd_written = 2;					\
-		if(_n > 0x3fff) {					\
-			fprintf(stderr,"Too big packet3 %08x: cannot "	\
-				"store %d dwords\n",			\
-				_p, _n);				\
-			_mesa_exit(-1);					\
-		}							\
-		cmd[0].i = cmdpacket3(R300_CMD_PACKET3_RAW);		\
-		cmd[1].i = _p | ((_n & 0x3fff)<<16);			\
-	}
-
 /**
  * Must be sent to switch to 2d commands
  */
-void static INLINE end_3d(r300ContextPtr rmesa)
+void static INLINE end_3d(radeonContextPtr radeon)
 {
-	drm_radeon_cmd_header_t *cmd = NULL;
+	BATCH_LOCALS(radeon);
 
-	cmd =
-	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
-	cmd[0].header.cmd_type = R300_CMD_END3D;
+	if (!radeon->radeonScreen->kernel_mm) {
+		BEGIN_BATCH_NO_AUTOSTATE(1);
+		OUT_BATCH(cmdpacify(radeon->radeonScreen));
+		END_BATCH();
+	}
 }
 
 void static INLINE cp_delay(r300ContextPtr rmesa, unsigned short count)
 {
-	drm_radeon_cmd_header_t *cmd = NULL;
+	BATCH_LOCALS(&rmesa->radeon);
 
-	cmd =
-	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
-	cmd[0].i = cmdcpdelay(count);
+	if (!rmesa->radeon.radeonScreen->kernel_mm) {
+		BEGIN_BATCH_NO_AUTOSTATE(1);
+		OUT_BATCH(cmdcpdelay(rmesa->radeon.radeonScreen, count));
+		END_BATCH();
+	}
 }
 
-void static INLINE cp_wait(r300ContextPtr rmesa, unsigned char flags)
+void static INLINE cp_wait(radeonContextPtr radeon, unsigned char flags)
 {
-	drm_radeon_cmd_header_t *cmd = NULL;
-
-	cmd =
-	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
-	cmd[0].i = cmdwait(flags);
+	BATCH_LOCALS(radeon);
+	uint32_t wait_until;
+
+	if (!radeon->radeonScreen->kernel_mm) {
+		BEGIN_BATCH_NO_AUTOSTATE(1);
+		OUT_BATCH(cmdwait(radeon->radeonScreen, flags));
+		END_BATCH();
+	} else {
+		switch(flags) {
+		case R300_WAIT_2D:
+			wait_until = (1 << 14);
+			break;
+		case R300_WAIT_3D:
+			wait_until = (1 << 15);
+			break;
+		case R300_NEW_WAIT_2D_3D:
+			wait_until = (1 << 14) | (1 << 15);
+			break;
+		case R300_NEW_WAIT_2D_2D_CLEAN:
+			wait_until = (1 << 14) | (1 << 16) | (1 << 18);
+			break;
+		case R300_NEW_WAIT_3D_3D_CLEAN:
+			wait_until = (1 << 15) | (1 << 17) | (1 << 18);
+			break;
+		case R300_NEW_WAIT_2D_2D_CLEAN_3D_3D_CLEAN:
+			wait_until  = (1 << 14) | (1 << 16) | (1 << 18);
+			wait_until |= (1 << 15) | (1 << 17) | (1 << 18);
+			break;
+		default:
+			return;
+		}
+		BEGIN_BATCH_NO_AUTOSTATE(2);
+		OUT_BATCH(CP_PACKET0(RADEON_WAIT_UNTIL, 0));
+		OUT_BATCH(wait_until);
+		END_BATCH();
+	}
 }
 
-extern int r300EmitArrays(GLcontext * ctx);
-
-#ifdef USER_BUFFERS
-void r300UseArrays(GLcontext * ctx);
-#endif
-
-extern void r300ReleaseArrays(GLcontext * ctx);
 extern int r300PrimitiveType(r300ContextPtr rmesa, int prim);
 extern int r300NumVerts(r300ContextPtr rmesa, int num_verts, int prim);
 
 extern void r300EmitCacheFlush(r300ContextPtr rmesa);
 
-extern GLuint r300VAPInputRoute0(uint32_t * dst, GLvector4f ** attribptr,
-				 int *inputs, GLint * tab, GLuint nr);
-extern GLuint r300VAPInputRoute1(uint32_t * dst, int swizzle[][4], GLuint nr);
 extern GLuint r300VAPInputCntl0(GLcontext * ctx, GLuint InputsRead);
 extern GLuint r300VAPInputCntl1(GLcontext * ctx, GLuint InputsRead);
-extern GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint OutputsWritten);
-extern GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint OutputsWritten);
+extern GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint vp_writes);
+extern GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint vp_writes);
 
 #endif
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
deleted file mode 100644
index 4ef7f2bd78..0000000000
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ /dev/null
@@ -1,686 +0,0 @@
-/*
- * Copyright (C) 2005 Ben Skeggs.
- *
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial
- * portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-/**
- * \file
- *
- * Fragment program compiler. Perform transformations on the intermediate
- * representation until the program is in a form where we can translate
- * it more or less directly into machine-readable form.
- *
- * \author Ben Skeggs <darktama@iinet.net.au>
- * \author Jerome Glisse <j.glisse@gmail.com>
- */
-
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/prog_instruction.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-
-#include "r300_context.h"
-#include "r300_fragprog.h"
-#include "r300_fragprog_swizzle.h"
-#include "r300_state.h"
-
-#include "radeon_nqssadce.h"
-#include "radeon_program_alu.h"
-
-
-static void reset_srcreg(struct prog_src_register* reg)
-{
-	_mesa_bzero(reg, sizeof(*reg));
-	reg->Swizzle = SWIZZLE_NOOP;
-}
-
-static struct prog_src_register shadow_ambient(struct gl_program *program, int tmu)
-{
-	gl_state_index fail_value_tokens[STATE_LENGTH] = {
-		STATE_INTERNAL, STATE_SHADOW_AMBIENT, 0, 0, 0
-	};
-	struct prog_src_register reg = { 0, };
-
-	fail_value_tokens[2] = tmu;
-	reg.File = PROGRAM_STATE_VAR;
-	reg.Index = _mesa_add_state_reference(program->Parameters, fail_value_tokens);
-	reg.Swizzle = SWIZZLE_WWWW;
-	return reg;
-}
-
-/**
- * Transform TEX, TXP, TXB, and KIL instructions in the following way:
- *  - premultiply texture coordinates for RECT
- *  - extract operand swizzles
- *  - introduce a temporary register when write masks are needed
- *
- * \todo If/when r5xx uses the radeon_program architecture, this can probably
- * be reused.
- */
-static GLboolean transform_TEX(
-	struct radeon_transform_context *t,
-	struct prog_instruction* orig_inst, void* data)
-{
-	struct r300_fragment_program_compiler *compiler =
-		(struct r300_fragment_program_compiler*)data;
-	struct prog_instruction inst = *orig_inst;
-	struct prog_instruction* tgt;
-	GLboolean destredirect = GL_FALSE;
-
-	if (inst.Opcode != OPCODE_TEX &&
-	    inst.Opcode != OPCODE_TXB &&
-	    inst.Opcode != OPCODE_TXP &&
-	    inst.Opcode != OPCODE_KIL)
-		return GL_FALSE;
-
-	if (inst.Opcode != OPCODE_KIL &&
-	    t->Program->ShadowSamplers & (1 << inst.TexSrcUnit)) {
-		GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func;
-
-		if (comparefunc == GL_NEVER || comparefunc == GL_ALWAYS) {
-			tgt = radeonAppendInstructions(t->Program, 1);
-
-			tgt->Opcode = OPCODE_MOV;
-			tgt->DstReg = inst.DstReg;
-			if (comparefunc == GL_ALWAYS) {
-				tgt->SrcReg[0].File = PROGRAM_BUILTIN;
-				tgt->SrcReg[0].Swizzle = SWIZZLE_1111;
-			} else {
-				tgt->SrcReg[0] = shadow_ambient(t->Program, inst.TexSrcUnit);
-			}
-			return GL_TRUE;
-		}
-
-		inst.DstReg.File = PROGRAM_TEMPORARY;
-		inst.DstReg.Index = radeonFindFreeTemporary(t);
-		inst.DstReg.WriteMask = WRITEMASK_XYZW;
-	}
-
-
-	/* Hardware uses [0..1]x[0..1] range for rectangle textures
-	 * instead of [0..Width]x[0..Height].
-	 * Add a scaling instruction.
-	 */
-	if (inst.Opcode != OPCODE_KIL && inst.TexSrcTarget == TEXTURE_RECT_INDEX) {
-		gl_state_index tokens[STATE_LENGTH] = {
-			STATE_INTERNAL, STATE_R300_TEXRECT_FACTOR, 0, 0,
-			0
-		};
-
-		int tempreg = radeonFindFreeTemporary(t);
-		int factor_index;
-
-		tokens[2] = inst.TexSrcUnit;
-		factor_index = _mesa_add_state_reference(t->Program->Parameters, tokens);
-
-		tgt = radeonAppendInstructions(t->Program, 1);
-
-		tgt->Opcode = OPCODE_MUL;
-		tgt->DstReg.File = PROGRAM_TEMPORARY;
-		tgt->DstReg.Index = tempreg;
-		tgt->SrcReg[0] = inst.SrcReg[0];
-		tgt->SrcReg[1].File = PROGRAM_STATE_VAR;
-		tgt->SrcReg[1].Index = factor_index;
-
-		reset_srcreg(&inst.SrcReg[0]);
-		inst.SrcReg[0].File = PROGRAM_TEMPORARY;
-		inst.SrcReg[0].Index = tempreg;
-	}
-
-	if (inst.Opcode != OPCODE_KIL) {
-		if (inst.DstReg.File != PROGRAM_TEMPORARY ||
-		    inst.DstReg.WriteMask != WRITEMASK_XYZW) {
-			int tempreg = radeonFindFreeTemporary(t);
-
-			inst.DstReg.File = PROGRAM_TEMPORARY;
-			inst.DstReg.Index = tempreg;
-			inst.DstReg.WriteMask = WRITEMASK_XYZW;
-			destredirect = GL_TRUE;
-		}
-	}
-
-	tgt = radeonAppendInstructions(t->Program, 1);
-	_mesa_copy_instructions(tgt, &inst, 1);
-
-	if (inst.Opcode != OPCODE_KIL &&
-	    t->Program->ShadowSamplers & (1 << inst.TexSrcUnit)) {
-		GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func;
-		GLuint depthmode = compiler->fp->state.unit[inst.TexSrcUnit].depth_texture_mode;
-		int rcptemp = radeonFindFreeTemporary(t);
-		int pass, fail;
-
-		tgt = radeonAppendInstructions(t->Program, 3);
-
-		tgt[0].Opcode = OPCODE_RCP;
-		tgt[0].DstReg.File = PROGRAM_TEMPORARY;
-		tgt[0].DstReg.Index = rcptemp;
-		tgt[0].DstReg.WriteMask = WRITEMASK_W;
-		tgt[0].SrcReg[0] = inst.SrcReg[0];
-		tgt[0].SrcReg[0].Swizzle = SWIZZLE_WWWW;
-
-		tgt[1].Opcode = OPCODE_MAD;
-		tgt[1].DstReg = inst.DstReg;
-		tgt[1].DstReg.WriteMask = orig_inst->DstReg.WriteMask;
-		tgt[1].SrcReg[0] = inst.SrcReg[0];
-		tgt[1].SrcReg[0].Swizzle = SWIZZLE_ZZZZ;
-		tgt[1].SrcReg[1].File = PROGRAM_TEMPORARY;
-		tgt[1].SrcReg[1].Index = rcptemp;
-		tgt[1].SrcReg[1].Swizzle = SWIZZLE_WWWW;
-		tgt[1].SrcReg[2].File = PROGRAM_TEMPORARY;
-		tgt[1].SrcReg[2].Index = inst.DstReg.Index;
-		if (depthmode == 0) /* GL_LUMINANCE */
-			tgt[1].SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z);
-		else if (depthmode == 2) /* GL_ALPHA */
-			tgt[1].SrcReg[2].Swizzle = SWIZZLE_WWWW;
-
-		/* Recall that SrcReg[0] is tex, SrcReg[2] is r and:
-		 *   r  < tex  <=>      -tex+r < 0
-		 *   r >= tex  <=> not (-tex+r < 0 */
-		if (comparefunc == GL_LESS || comparefunc == GL_GEQUAL)
-			tgt[1].SrcReg[2].NegateBase = tgt[0].SrcReg[2].NegateBase ^ NEGATE_XYZW;
-		else
-			tgt[1].SrcReg[0].NegateBase = tgt[0].SrcReg[0].NegateBase ^ NEGATE_XYZW;
-
-		tgt[2].Opcode = OPCODE_CMP;
-		tgt[2].DstReg = orig_inst->DstReg;
-		tgt[2].SrcReg[0].File = PROGRAM_TEMPORARY;
-		tgt[2].SrcReg[0].Index = tgt[1].DstReg.Index;
-
-		if (comparefunc == GL_LESS || comparefunc == GL_GREATER) {
-			pass = 1;
-			fail = 2;
-		} else {
-			pass = 2;
-			fail = 1;
-		}
-
-		tgt[2].SrcReg[pass].File = PROGRAM_BUILTIN;
-		tgt[2].SrcReg[pass].Swizzle = SWIZZLE_1111;
-		tgt[2].SrcReg[fail] = shadow_ambient(t->Program, inst.TexSrcUnit);
-	} else if (destredirect) {
-		tgt = radeonAppendInstructions(t->Program, 1);
-
-		tgt->Opcode = OPCODE_MOV;
-		tgt->DstReg = orig_inst->DstReg;
-		tgt->SrcReg[0].File = PROGRAM_TEMPORARY;
-		tgt->SrcReg[0].Index = inst.DstReg.Index;
-	}
-
-	return GL_TRUE;
-}
-
-
-static void update_params(r300ContextPtr r300, struct r300_fragment_program *fp)
-{
-	struct gl_fragment_program *mp = &fp->mesa_program;
-
-	/* Ask Mesa nicely to fill in ParameterValues for us */
-	if (mp->Base.Parameters)
-		_mesa_load_state_parameters(r300->radeon.glCtx, mp->Base.Parameters);
-}
-
-
-/**
- * Transform the program to support fragment.position.
- *
- * Introduce a small fragment at the start of the program that will be
- * the only code that directly reads the FRAG_ATTRIB_WPOS input.
- * All other code pieces that reference that input will be rewritten
- * to read from a newly allocated temporary.
- *
- * \todo if/when r5xx supports the radeon_program architecture, this is a
- * likely candidate for code sharing.
- */
-static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler)
-{
-	GLuint InputsRead = compiler->fp->mesa_program.Base.InputsRead;
-
-	if (!(InputsRead & FRAG_BIT_WPOS))
-		return;
-
-	static gl_state_index tokens[STATE_LENGTH] = {
-		STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
-	};
-	struct prog_instruction *fpi;
-	GLuint window_index;
-	int i = 0;
-	GLuint tempregi = _mesa_find_free_register(compiler->program, PROGRAM_TEMPORARY);
-
-	_mesa_insert_instructions(compiler->program, 0, 3);
-	fpi = compiler->program->Instructions;
-
-	/* perspective divide */
-	fpi[i].Opcode = OPCODE_RCP;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_W;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	fpi[i].Opcode = OPCODE_MUL;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-
-	fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[1].Index = tempregi;
-	fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	/* viewport transformation */
-	window_index = _mesa_add_state_reference(compiler->program->Parameters, tokens);
-
-	fpi[i].Opcode = OPCODE_MAD;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[0].Index = tempregi;
-	fpi[i].SrcReg[0].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[1].Index = window_index;
-	fpi[i].SrcReg[1].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[2].Index = window_index;
-	fpi[i].SrcReg[2].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-	i++;
-
-	for (; i < compiler->program->NumInstructions; ++i) {
-		int reg;
-		for (reg = 0; reg < 3; reg++) {
-			if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT &&
-			    fpi[i].SrcReg[reg].Index == FRAG_ATTRIB_WPOS) {
-				fpi[i].SrcReg[reg].File = PROGRAM_TEMPORARY;
-				fpi[i].SrcReg[reg].Index = tempregi;
-			}
-		}
-	}
-}
-
-
-static void nqssadce_init(struct nqssadce_state* s)
-{
-	s->Outputs[FRAG_RESULT_COLR].Sourced = WRITEMASK_XYZW;
-	s->Outputs[FRAG_RESULT_DEPR].Sourced = WRITEMASK_W;
-}
-
-
-static GLuint build_dtm(GLuint depthmode)
-{
-	switch(depthmode) {
-	default:
-	case GL_LUMINANCE: return 0;
-	case GL_INTENSITY: return 1;
-	case GL_ALPHA: return 2;
-	}
-}
-
-static GLuint build_func(GLuint comparefunc)
-{
-	return comparefunc - GL_NEVER;
-}
-
-
-/**
- * Collect all external state that is relevant for compiling the given
- * fragment program.
- */
-static void build_state(
-	r300ContextPtr r300,
-	struct r300_fragment_program *fp,
-	struct r300_fragment_program_external_state *state)
-{
-	int unit;
-
-	_mesa_bzero(state, sizeof(*state));
-
-	for(unit = 0; unit < 16; ++unit) {
-		if (fp->mesa_program.Base.ShadowSamplers & (1 << unit)) {
-			struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
-
-			state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
-			state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
-		}
-	}
-}
-
-
-void r300TranslateFragmentShader(r300ContextPtr r300,
-				 struct r300_fragment_program *fp)
-{
-	struct r300_fragment_program_external_state state;
-
-	build_state(r300, fp, &state);
-	if (_mesa_memcmp(&fp->state, &state, sizeof(state))) {
-		/* TODO: cache compiled programs */
-		fp->translated = GL_FALSE;
-		_mesa_memcpy(&fp->state, &state, sizeof(state));
-	}
-
-	if (!fp->translated) {
-		struct r300_fragment_program_compiler compiler;
-
-		compiler.r300 = r300;
-		compiler.fp = fp;
-		compiler.code = &fp->code;
-		compiler.program = _mesa_clone_program(r300->radeon.glCtx, &fp->mesa_program.Base);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Fragment Program: Initial program:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		insert_WPOS_trailer(&compiler);
-
-		struct radeon_program_transformation transformations[] = {
-			{ &transform_TEX, &compiler },
-			{ &radeonTransformALU, 0 },
-			{ &radeonTransformTrigSimple, 0 }
-		};
-		radeonLocalTransform(
-			r300->radeon.glCtx,
-			compiler.program,
-			3, transformations);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Fragment Program: After native rewrite:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		struct radeon_nqssadce_descr nqssadce = {
-			.Init = &nqssadce_init,
-			.IsNativeSwizzle = &r300FPIsNativeSwizzle,
-			.BuildSwizzle = &r300FPBuildSwizzle,
-			.RewriteDepthOut = GL_TRUE
-		};
-		radeonNqssaDce(r300->radeon.glCtx, compiler.program, &nqssadce);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Compiler: after NqSSA-DCE:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		if (!r300FragmentProgramEmit(&compiler))
-			fp->error = GL_TRUE;
-
-		/* Subtle: Rescue any parameters that have been added during transformations */
-		_mesa_free_parameter_list(fp->mesa_program.Base.Parameters);
-		fp->mesa_program.Base.Parameters = compiler.program->Parameters;
-		compiler.program->Parameters = 0;
-
-		_mesa_reference_program(r300->radeon.glCtx, &compiler.program, NULL);
-
-		if (!fp->error)
-			fp->translated = GL_TRUE;
-		if (fp->error || (RADEON_DEBUG & DEBUG_PIXEL))
-			r300FragmentProgramDump(fp, &fp->code);
-		r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
-	}
-
-	update_params(r300, fp);
-}
-
-/* just some random things... */
-void r300FragmentProgramDump(
-	struct r300_fragment_program *fp,
-	struct r300_fragment_program_code *code)
-{
-	int n, i, j;
-	static int pc = 0;
-
-	fprintf(stderr, "pc=%d*************************************\n", pc++);
-
-	fprintf(stderr, "Hardware program\n");
-	fprintf(stderr, "----------------\n");
-
-	for (n = 0; n < (code->cur_node + 1); n++) {
-		fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "
-			"alu_end: %d, tex_end: %d, flags: %08x\n", n,
-			code->node[n].alu_offset,
-			code->node[n].tex_offset,
-			code->node[n].alu_end, code->node[n].tex_end,
-			code->node[n].flags);
-
-		if (n > 0 || code->first_node_has_tex) {
-			fprintf(stderr, "  TEX:\n");
-			for (i = code->node[n].tex_offset;
-			     i <= code->node[n].tex_offset + code->node[n].tex_end;
-			     ++i) {
-				const char *instr;
-
-				switch ((code->tex.
-					 inst[i] >> R300_TEX_INST_SHIFT) &
-					15) {
-				case R300_TEX_OP_LD:
-					instr = "TEX";
-					break;
-				case R300_TEX_OP_KIL:
-					instr = "KIL";
-					break;
-				case R300_TEX_OP_TXP:
-					instr = "TXP";
-					break;
-				case R300_TEX_OP_TXB:
-					instr = "TXB";
-					break;
-				default:
-					instr = "UNKNOWN";
-				}
-
-				fprintf(stderr,
-					"    %s t%i, %c%i, texture[%i]   (%08x)\n",
-					instr,
-					(code->tex.
-					 inst[i] >> R300_DST_ADDR_SHIFT) & 31,
-					't',
-					(code->tex.
-					 inst[i] >> R300_SRC_ADDR_SHIFT) & 31,
-					(code->tex.
-					 inst[i] & R300_TEX_ID_MASK) >>
-					R300_TEX_ID_SHIFT,
-					code->tex.inst[i]);
-			}
-		}
-
-		for (i = code->node[n].alu_offset;
-		     i <= code->node[n].alu_offset + code->node[n].alu_end; ++i) {
-			char srcc[3][10], dstc[20];
-			char srca[3][10], dsta[20];
-			char argc[3][20];
-			char arga[3][20];
-			char flags[5], tmp[10];
-
-			for (j = 0; j < 3; ++j) {
-				int regc = code->alu.inst[i].inst1 >> (j * 6);
-				int rega = code->alu.inst[i].inst3 >> (j * 6);
-
-				sprintf(srcc[j], "%c%i",
-					(regc & 32) ? 'c' : 't', regc & 31);
-				sprintf(srca[j], "%c%i",
-					(rega & 32) ? 'c' : 't', rega & 31);
-			}
-
-			dstc[0] = 0;
-			sprintf(flags, "%s%s%s",
-				(code->alu.inst[i].
-				 inst1 & R300_ALU_DSTC_REG_X) ? "x" : "",
-				(code->alu.inst[i].
-				 inst1 & R300_ALU_DSTC_REG_Y) ? "y" : "",
-				(code->alu.inst[i].
-				 inst1 & R300_ALU_DSTC_REG_Z) ? "z" : "");
-			if (flags[0] != 0) {
-				sprintf(dstc, "t%i.%s ",
-					(code->alu.inst[i].
-					 inst1 >> R300_ALU_DSTC_SHIFT) & 31,
-					flags);
-			}
-			sprintf(flags, "%s%s%s",
-				(code->alu.inst[i].
-				 inst1 & R300_ALU_DSTC_OUTPUT_X) ? "x" : "",
-				(code->alu.inst[i].
-				 inst1 & R300_ALU_DSTC_OUTPUT_Y) ? "y" : "",
-				(code->alu.inst[i].
-				 inst1 & R300_ALU_DSTC_OUTPUT_Z) ? "z" : "");
-			if (flags[0] != 0) {
-				sprintf(tmp, "o%i.%s",
-					(code->alu.inst[i].
-					 inst1 >> R300_ALU_DSTC_SHIFT) & 31,
-					flags);
-				strcat(dstc, tmp);
-			}
-
-			dsta[0] = 0;
-			if (code->alu.inst[i].inst3 & R300_ALU_DSTA_REG) {
-				sprintf(dsta, "t%i.w ",
-					(code->alu.inst[i].
-					 inst3 >> R300_ALU_DSTA_SHIFT) & 31);
-			}
-			if (code->alu.inst[i].inst3 & R300_ALU_DSTA_OUTPUT) {
-				sprintf(tmp, "o%i.w ",
-					(code->alu.inst[i].
-					 inst3 >> R300_ALU_DSTA_SHIFT) & 31);
-				strcat(dsta, tmp);
-			}
-			if (code->alu.inst[i].inst3 & R300_ALU_DSTA_DEPTH) {
-				strcat(dsta, "Z");
-			}
-
-			fprintf(stderr,
-				"%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
-				"       w: %3s %3s %3s -> %-20s (%08x)\n", i,
-				srcc[0], srcc[1], srcc[2], dstc,
-				code->alu.inst[i].inst1, srca[0], srca[1],
-				srca[2], dsta, code->alu.inst[i].inst3);
-
-			for (j = 0; j < 3; ++j) {
-				int regc = code->alu.inst[i].inst0 >> (j * 7);
-				int rega = code->alu.inst[i].inst2 >> (j * 7);
-				int d;
-				char buf[20];
-
-				d = regc & 31;
-				if (d < 12) {
-					switch (d % 4) {
-					case R300_ALU_ARGC_SRC0C_XYZ:
-						sprintf(buf, "%s.xyz",
-							srcc[d / 4]);
-						break;
-					case R300_ALU_ARGC_SRC0C_XXX:
-						sprintf(buf, "%s.xxx",
-							srcc[d / 4]);
-						break;
-					case R300_ALU_ARGC_SRC0C_YYY:
-						sprintf(buf, "%s.yyy",
-							srcc[d / 4]);
-						break;
-					case R300_ALU_ARGC_SRC0C_ZZZ:
-						sprintf(buf, "%s.zzz",
-							srcc[d / 4]);
-						break;
-					}
-				} else if (d < 15) {
-					sprintf(buf, "%s.www", srca[d - 12]);
-				} else if (d == 20) {
-					sprintf(buf, "0.0");
-				} else if (d == 21) {
-					sprintf(buf, "1.0");
-				} else if (d == 22) {
-					sprintf(buf, "0.5");
-				} else if (d >= 23 && d < 32) {
-					d -= 23;
-					switch (d / 3) {
-					case 0:
-						sprintf(buf, "%s.yzx",
-							srcc[d % 3]);
-						break;
-					case 1:
-						sprintf(buf, "%s.zxy",
-							srcc[d % 3]);
-						break;
-					case 2:
-						sprintf(buf, "%s.Wzy",
-							srcc[d % 3]);
-						break;
-					}
-				} else {
-					sprintf(buf, "%i", d);
-				}
-
-				sprintf(argc[j], "%s%s%s%s",
-					(regc & 32) ? "-" : "",
-					(regc & 64) ? "|" : "",
-					buf, (regc & 64) ? "|" : "");
-
-				d = rega & 31;
-				if (d < 9) {
-					sprintf(buf, "%s.%c", srcc[d / 3],
-						'x' + (char)(d % 3));
-				} else if (d < 12) {
-					sprintf(buf, "%s.w", srca[d - 9]);
-				} else if (d == 16) {
-					sprintf(buf, "0.0");
-				} else if (d == 17) {
-					sprintf(buf, "1.0");
-				} else if (d == 18) {
-					sprintf(buf, "0.5");
-				} else {
-					sprintf(buf, "%i", d);
-				}
-
-				sprintf(arga[j], "%s%s%s%s",
-					(rega & 32) ? "-" : "",
-					(rega & 64) ? "|" : "",
-					buf, (rega & 64) ? "|" : "");
-			}
-
-			fprintf(stderr, "     xyz: %8s %8s %8s    op: %08x\n"
-				"       w: %8s %8s %8s    op: %08x\n",
-				argc[0], argc[1], argc[2],
-				code->alu.inst[i].inst0, arga[0], arga[1],
-				arga[2], code->alu.inst[i].inst2);
-		}
-	}
-}
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.h b/src/mesa/drivers/dri/r300/r300_fragprog.h
deleted file mode 100644
index 94fb554fb3..0000000000
--- a/src/mesa/drivers/dri/r300/r300_fragprog.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (C) 2005 Ben Skeggs.
- *
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial
- * portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-/*
- * Authors:
- *   Ben Skeggs <darktama@iinet.net.au>
- *   Jerome Glisse <j.glisse@gmail.com>
- */
-#ifndef __R300_FRAGPROG_H_
-#define __R300_FRAGPROG_H_
-
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/program.h"
-#include "shader/prog_instruction.h"
-
-#include "r300_context.h"
-#include "radeon_program.h"
-
-#define DRI_CONF_FP_OPTIMIZATION_SPEED   0
-#define DRI_CONF_FP_OPTIMIZATION_QUALITY 1
-
-#if 1
-
-/**
- * Fragment program helper macros
- */
-
-/* Produce unshifted source selectors */
-#define FP_TMP(idx) (idx)
-#define FP_CONST(idx) ((idx) | (1 << 5))
-
-/* Produce source/dest selector dword */
-#define FP_SELC_MASK_NO		0
-#define FP_SELC_MASK_X		1
-#define FP_SELC_MASK_Y		2
-#define FP_SELC_MASK_XY		3
-#define FP_SELC_MASK_Z		4
-#define FP_SELC_MASK_XZ		5
-#define FP_SELC_MASK_YZ		6
-#define FP_SELC_MASK_XYZ	7
-
-#define FP_SELC(destidx,regmask,outmask,src0,src1,src2) \
-	(((destidx) << R300_ALU_DSTC_SHIFT) |		\
-	 (FP_SELC_MASK_##regmask << 23) |		\
-	 (FP_SELC_MASK_##outmask << 26) |		\
-	 ((src0) << R300_ALU_SRC0C_SHIFT) |		\
-	 ((src1) << R300_ALU_SRC1C_SHIFT) |		\
-	 ((src2) << R300_ALU_SRC2C_SHIFT))
-
-#define FP_SELA_MASK_NO		0
-#define FP_SELA_MASK_W		1
-
-#define FP_SELA(destidx,regmask,outmask,src0,src1,src2) \
-	(((destidx) << R300_ALU_DSTA_SHIFT) |		\
-	 (FP_SELA_MASK_##regmask << 23) |		\
-	 (FP_SELA_MASK_##outmask << 24) |		\
-	 ((src0) << R300_ALU_SRC0A_SHIFT) |		\
-	 ((src1) << R300_ALU_SRC1A_SHIFT) |		\
-	 ((src2) << R300_ALU_SRC2A_SHIFT))
-
-/* Produce unshifted argument selectors */
-#define FP_ARGC(source)	R300_ALU_ARGC_##source
-#define FP_ARGA(source) R300_ALU_ARGA_##source
-#define FP_ABS(arg) ((arg) | (1 << 6))
-#define FP_NEG(arg) ((arg) ^ (1 << 5))
-
-/* Produce instruction dword */
-#define FP_INSTRC(opcode,arg0,arg1,arg2) \
-	(R300_ALU_OUTC_##opcode | 		\
-	((arg0) << R300_ALU_ARG0C_SHIFT) |	\
-	((arg1) << R300_ALU_ARG1C_SHIFT) |	\
-	((arg2) << R300_ALU_ARG2C_SHIFT))
-
-#define FP_INSTRA(opcode,arg0,arg1,arg2) \
-	(R300_ALU_OUTA_##opcode | 		\
-	((arg0) << R300_ALU_ARG0A_SHIFT) |	\
-	((arg1) << R300_ALU_ARG1A_SHIFT) |	\
-	((arg2) << R300_ALU_ARG2A_SHIFT))
-
-#endif
-
-struct r300_fragment_program;
-
-extern void r300TranslateFragmentShader(r300ContextPtr r300,
-					struct r300_fragment_program *fp);
-
-
-/**
- * Used internally by the r300 fragment program code to store compile-time
- * only data.
- */
-struct r300_fragment_program_compiler {
-	r300ContextPtr r300;
-	struct r300_fragment_program *fp;
-	struct r300_fragment_program_code *code;
-	struct gl_program *program;
-};
-
-extern GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler);
-
-
-extern void r300FragmentProgramDump(
-	struct r300_fragment_program *fp,
-	struct r300_fragment_program_code *code);
-
-#endif
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_common.c b/src/mesa/drivers/dri/r300/r300_fragprog_common.c
new file mode 100644
index 0000000000..469c278b51
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_common.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (C) 2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ *
+ * Fragment program compiler. Perform transformations on the intermediate
+ * representation until the program is in a form where we can translate
+ * it more or less directly into machine-readable form.
+ *
+ * \author Ben Skeggs <darktama@iinet.net.au>
+ * \author Jerome Glisse <j.glisse@gmail.com>
+ */
+
+#include "r300_fragprog_common.h"
+
+#include "shader/program.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+
+#include "compiler/radeon_compiler.h"
+
+#include "r300_state.h"
+
+
+static GLuint build_dtm(GLuint depthmode)
+{
+	switch(depthmode) { /* encode the depth texture mode as 0, 1 or 2 */
+	default: /* any other mode falls through and is treated like GL_LUMINANCE */
+	case GL_LUMINANCE: return 0;
+	case GL_INTENSITY: return 1;
+	case GL_ALPHA: return 2;
+	}
+}
+
+static GLuint build_func(GLuint comparefunc)
+{
+	return comparefunc - GL_NEVER; /* compare func expressed as an offset from GL_NEVER */
+}
+
+/**
+ * Collect the external state relevant for compiling the given fragment
+ * program: depth mode and compare func of each shadow sampler's texture.
+ */
+static void build_state(
+	r300ContextPtr r300,
+	struct gl_fragment_program *fp,
+	struct r300_fragment_program_external_state *state)
+{
+	int unit;
+
+	_mesa_bzero(state, sizeof(*state)); /* zero everything: states are later compared as whole structs */
+
+	for(unit = 0; unit < 16; ++unit) {
+		if (fp->Base.ShadowSamplers & (1 << unit)) {
+			struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
+
+			state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
+			state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
+		}
+	}
+}
+
+
+/**
+ * Transform the program to support fragment.position.
+ *
+ * Pick the first texcoord input (TEX0..TEX7) the program does not read,
+ * record it in fp->wpos_attr and pass it to rc_transform_fragment_wpos().
+ * fp->wpos_attr is FRAG_ATTRIB_MAX when WPOS is unread or when no texcoord
+ * is free; the latter is reported via rc_error() and nothing is rewritten.
+ */
+static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler, struct r300_fragment_program * fp)
+{
+	int i;
+
+	fp->wpos_attr = FRAG_ATTRIB_MAX;
+	if (!(compiler->Base.Program.InputsRead & FRAG_BIT_WPOS)) {
+		return;
+	}
+	for (i = FRAG_ATTRIB_TEX0; i <= FRAG_ATTRIB_TEX7; ++i) {
+		if (!(compiler->Base.Program.InputsRead & (1 << i))) {
+			fp->wpos_attr = i;
+			break;
+		}
+	}
+	if (fp->wpos_attr == FRAG_ATTRIB_MAX) {
+		rc_error(&compiler->Base, "No free texcoord for fragment.position\n");
+		return;
+	}
+	rc_transform_fragment_wpos(&compiler->Base, FRAG_ATTRIB_WPOS, fp->wpos_attr);
+}
+
+/**
+ * Rewrite fragment.fogcoord to use the first unread texcoord slot, recorded
+ * in fp->fog_attr (FRAG_ATTRIB_MAX if fogcoord is unread or no slot is free;
+ * the latter is reported via rc_error()). Fogcoord is forced into an X001
+ * pattern here. See also the counterpart rewriting for vertex programs.
+ */
+static void rewriteFog(struct r300_fragment_program_compiler *compiler, struct r300_fragment_program * fp)
+{
+	struct prog_src_register src;
+	int i;
+
+	fp->fog_attr = FRAG_ATTRIB_MAX;
+	if (!(compiler->Base.Program.InputsRead & FRAG_BIT_FOGC)) {
+		return;
+	}
+	for (i = FRAG_ATTRIB_TEX0; i <= FRAG_ATTRIB_TEX7; ++i) {
+		if (!(compiler->Base.Program.InputsRead & (1 << i))) {
+			fp->fog_attr = i;
+			break;
+		}
+	}
+	if (fp->fog_attr == FRAG_ATTRIB_MAX) {
+		rc_error(&compiler->Base, "No free texcoord for fragment.fogcoord\n");
+		return;
+	}
+	memset(&src, 0, sizeof(src));
+	src.File = PROGRAM_INPUT;
+	src.Index = fp->fog_attr;
+	src.Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ONE);
+	rc_move_input(&compiler->Base, FRAG_ATTRIB_FOGC, src);
+}
+
+
+/**
+ * Reserve hardware temporary registers for the program inputs by calling
+ * allocate(mydata, input, hwreg) with consecutive hwreg values, in the order
+ * COL0, COL1, TEX0..TEX7, FOGC, WPOS; any other input is an rc_error().
+ * @note Done explicitly because the input order is determined by RS hardware.
+ */
+static void allocate_hw_inputs(
+	struct r300_fragment_program_compiler * c,
+	void (*allocate)(void * data, unsigned input, unsigned hwreg),
+	void * mydata)
+{
+	GLuint InputsRead = c->Base.Program.InputsRead;
+	int i;
+	GLuint hwindex = 0;
+
+	/* Primary colour */
+	if (InputsRead & FRAG_BIT_COL0)
+		allocate(mydata, FRAG_ATTRIB_COL0, hwindex++);
+	InputsRead &= ~FRAG_BIT_COL0;
+
+	/* Secondary color */
+	if (InputsRead & FRAG_BIT_COL1)
+		allocate(mydata, FRAG_ATTRIB_COL1, hwindex++);
+	InputsRead &= ~FRAG_BIT_COL1;
+
+	/* Texcoords */
+	for (i = 0; i < 8; i++) {
+		if (InputsRead & (FRAG_BIT_TEX0 << i))
+			allocate(mydata, FRAG_ATTRIB_TEX0+i, hwindex++);
+	}
+	InputsRead &= ~FRAG_BITS_TEX_ANY;
+
+	/* Fogcoords treated as a texcoord */
+	if (InputsRead & FRAG_BIT_FOGC)
+		allocate(mydata, FRAG_ATTRIB_FOGC, hwindex++);
+	InputsRead &= ~FRAG_BIT_FOGC;
+
+	/* fragment position treated as a texcoord */
+	if (InputsRead & FRAG_BIT_WPOS)
+		allocate(mydata, FRAG_ATTRIB_WPOS, hwindex++);
+	InputsRead &= ~FRAG_BIT_WPOS;
+
+	/* Anything else */
+	if (InputsRead)
+		rc_error(&c->Base, "Don't know how to handle inputs 0x%x\n", InputsRead);
+}
+
+
+static void translate_fragment_program(GLcontext *ctx, struct r300_fragment_program_cont *cont, struct r300_fragment_program *fp)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_fragment_program_compiler compiler;
+	/* Compile cont's Mesa program into fp->code for the state held in fp->state. */
+	rc_init(&compiler.Base);
+	compiler.Base.Debug = (RADEON_DEBUG & RADEON_PIXEL) ? GL_TRUE : GL_FALSE;
+
+	compiler.code = &fp->code;
+	compiler.state = fp->state;
+	compiler.is_r500 = (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) ? GL_TRUE : GL_FALSE;
+	compiler.OutputDepth = FRAG_RESULT_DEPTH;
+	compiler.OutputColor = FRAG_RESULT_COLOR;
+	compiler.AllocateHwInputs = &allocate_hw_inputs;
+
+	if (compiler.Base.Debug) {
+		fflush(stderr);
+		_mesa_printf("Fragment Program: Initial program:\n");
+		_mesa_print_program(&cont->Base.Base);
+		fflush(stderr);
+	}
+	/* Hand the Mesa program to the compiler; the rewrites below act on compiler.Base. */
+	rc_mesa_to_rc_program(&compiler.Base, &cont->Base.Base);
+	/* Move WPOS and fog reads onto texcoord inputs; the chosen slots are stored in fp. */
+	insert_WPOS_trailer(&compiler, fp);
+
+	rewriteFog(&compiler, fp);
+
+	r3xx_compile_fragment_program(&compiler);
+	fp->error = compiler.Base.Error;
+	/* Keep the compiler's final InputsRead mask on the variant. */
+	fp->InputsRead = compiler.Base.Program.InputsRead;
+
+	rc_destroy(&compiler.Base);
+}
+
+struct r300_fragment_program *r300SelectAndTranslateFragmentShader(GLcontext *ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_fragment_program_cont *fp_list;
+	struct r300_fragment_program *fp;
+	struct r300_fragment_program_external_state state;
+	/* Pick the variant of the current fragment program matching the current external state. */
+	fp_list = (struct r300_fragment_program_cont *)ctx->FragmentProgram._Current;
+	build_state(r300, ctx->FragmentProgram._Current, &state);
+	/* Reuse a variant that was already built for identical state. */
+	fp = fp_list->progs;
+	while (fp) {
+		if (_mesa_memcmp(&fp->state, &state, sizeof(state)) == 0) {
+			return r300->selected_fp = fp;
+		}
+		fp = fp->next;
+	}
+	/* None found: build a new variant and link it at the head of the list. */
+	fp = _mesa_calloc(sizeof(struct r300_fragment_program)); /* NOTE(review): result is used without a NULL check */
+
+	fp->state = state;
+
+	fp->next = fp_list->progs;
+	fp_list->progs = fp;
+
+	translate_fragment_program(ctx, fp_list, fp);
+
+	return r300->selected_fp = fp;
+}
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_common.h b/src/mesa/drivers/dri/r300/r300_fragprog_common.h
new file mode 100644
index 0000000000..3d64c08cee
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_common.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __R300_FRAGPROG_COMMON_H_
+#define __R300_FRAGPROG_COMMON_H_
+
+#include "main/mtypes.h"
+
+#include "r300_context.h"
+
+struct r300_fragment_program *r300SelectAndTranslateFragmentShader(GLcontext *ctx);
+
+#endif
diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.c b/src/mesa/drivers/dri/r300/r300_ioctl.c
index ee85e229f0..5cb04e2bb6 100644
--- a/src/mesa/drivers/dri/r300/r300_ioctl.c
+++ b/src/mesa/drivers/dri/r300/r300_ioctl.c
@@ -44,82 +44,160 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/imports.h"
 #include "main/macros.h"
 #include "main/context.h"
+#include "main/simple_list.h"
 #include "swrast/swrast.h"
 
+#include "radeon_common.h"
+#include "radeon_lock.h"
 #include "r300_context.h"
-#include "radeon_ioctl.h"
 #include "r300_ioctl.h"
 #include "r300_cmdbuf.h"
 #include "r300_state.h"
 #include "r300_vertprog.h"
 #include "radeon_reg.h"
 #include "r300_emit.h"
-#include "r300_fragprog.h"
+#include "r300_context.h"
 
 #include "vblank.h"
 
+#define R200_3D_DRAW_IMMD_2      0xC0003500
+
 #define CLEARBUFFER_COLOR	0x1
 #define CLEARBUFFER_DEPTH	0x2
 #define CLEARBUFFER_STENCIL	0x4
 
-static void r300ClearBuffer(r300ContextPtr r300, int flags, int buffer)
-{
-	GLcontext *ctx = r300->radeon.glCtx;
-	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
-	GLuint cboffset, cbpitch;
-	drm_r300_cmd_header_t *cmd2;
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
-	r300ContextPtr rmesa = r300;
+#if 1
 
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s: %s buffer (%i,%i %ix%i)\n",
-			__FUNCTION__, buffer ? "back" : "front",
-			dPriv->x, dPriv->y, dPriv->w, dPriv->h);
+/**
+ * Fragment program helper macros
+ */
 
-	if (buffer) {
-		cboffset = r300->radeon.radeonScreen->backOffset;
-		cbpitch = r300->radeon.radeonScreen->backPitch;
-	} else {
-		cboffset = r300->radeon.radeonScreen->frontOffset;
-		cbpitch = r300->radeon.radeonScreen->frontPitch;
-	}
+/* Produce unshifted source selectors */
+#define FP_TMP(idx) (idx)
+#define FP_CONST(idx) ((idx) | (1 << 5))
+
+/* Produce source/dest selector dword */
+#define FP_SELC_MASK_NO		0
+#define FP_SELC_MASK_X		1
+#define FP_SELC_MASK_Y		2
+#define FP_SELC_MASK_XY		3
+#define FP_SELC_MASK_Z		4
+#define FP_SELC_MASK_XZ		5
+#define FP_SELC_MASK_YZ		6
+#define FP_SELC_MASK_XYZ	7
+
+#define FP_SELC(destidx,regmask,outmask,src0,src1,src2) \
+	(((destidx) << R300_ALU_DSTC_SHIFT) |		\
+	 (FP_SELC_MASK_##regmask << 23) |		\
+	 (FP_SELC_MASK_##outmask << 26) |		\
+	 ((src0) << R300_ALU_SRC0C_SHIFT) |		\
+	 ((src1) << R300_ALU_SRC1C_SHIFT) |		\
+	 ((src2) << R300_ALU_SRC2C_SHIFT))
+
+#define FP_SELA_MASK_NO		0
+#define FP_SELA_MASK_W		1
+
+#define FP_SELA(destidx,regmask,outmask,src0,src1,src2) \
+	(((destidx) << R300_ALU_DSTA_SHIFT) |		\
+	 (FP_SELA_MASK_##regmask << 23) |		\
+	 (FP_SELA_MASK_##outmask << 24) |		\
+	 ((src0) << R300_ALU_SRC0A_SHIFT) |		\
+	 ((src1) << R300_ALU_SRC1A_SHIFT) |		\
+	 ((src2) << R300_ALU_SRC2A_SHIFT))
+
+/* Produce unshifted argument selectors */
+#define FP_ARGC(source)	R300_ALU_ARGC_##source
+#define FP_ARGA(source) R300_ALU_ARGA_##source
+#define FP_ABS(arg) ((arg) | (1 << 6))
+#define FP_NEG(arg) ((arg) ^ (1 << 5))
+
+/* Produce instruction dword */
+#define FP_INSTRC(opcode,arg0,arg1,arg2) \
+	(R300_ALU_OUTC_##opcode | 		\
+	((arg0) << R300_ALU_ARG0C_SHIFT) |	\
+	((arg1) << R300_ALU_ARG1C_SHIFT) |	\
+	((arg2) << R300_ALU_ARG2C_SHIFT))
+
+#define FP_INSTRA(opcode,arg0,arg1,arg2) \
+	(R300_ALU_OUTA_##opcode | 		\
+	((arg0) << R300_ALU_ARG0A_SHIFT) |	\
+	((arg1) << R300_ALU_ARG1A_SHIFT) |	\
+	((arg2) << R300_ALU_ARG2A_SHIFT))
 
-	cboffset += r300->radeon.radeonScreen->fbLocation;
+#endif
 
-	cp_wait(r300, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
-	end_3d(rmesa);
+static void r300EmitClearState(GLcontext * ctx);
 
-	R300_STATECHANGE(r300, cb);
-	reg_start(R300_RB3D_COLOROFFSET0, 0);
-	e32(cboffset);
+static void r300ClearBuffer(r300ContextPtr r300, int flags,
+			    struct radeon_renderbuffer *rrb,
+			    struct radeon_renderbuffer *rrbd)
+{
+	BATCH_LOCALS(&r300->radeon);
+	GLcontext *ctx = r300->radeon.glCtx;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
+	GLuint cbpitch = 0;
+	r300ContextPtr rmesa = r300;
 
-	if (r300->radeon.radeonScreen->cpp == 4)
-		cbpitch |= R300_COLOR_FORMAT_ARGB8888;
-	else
-		cbpitch |= R300_COLOR_FORMAT_RGB565;
+	if (RADEON_DEBUG & RADEON_IOCTL)
+		fprintf(stderr, "%s: buffer %p (%i,%i %ix%i)\n",
+			__FUNCTION__, rrb, dPriv->x, dPriv->y,
+			dPriv->w, dPriv->h);
 
-	if (r300->radeon.sarea->tiling_enabled)
-		cbpitch |= R300_COLOR_TILE_ENABLE;
+	if (rrb) {
+		cbpitch = (rrb->pitch / rrb->cpp);
+		if (rrb->cpp == 4)
+			cbpitch |= R300_COLOR_FORMAT_ARGB8888;
+		else
+			cbpitch |= R300_COLOR_FORMAT_RGB565;
 
-	reg_start(R300_RB3D_COLORPITCH0, 0);
-	e32(cbpitch);
+		if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE){
+			cbpitch |= R300_COLOR_TILE_ENABLE;
+        }
+	}
 
-	R300_STATECHANGE(r300, cmk);
-	reg_start(RB3D_COLOR_CHANNEL_MASK, 0);
+	/* TODO in bufmgr */
+	cp_wait(&r300->radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+	end_3d(&rmesa->radeon);
 
 	if (flags & CLEARBUFFER_COLOR) {
-		e32((ctx->Color.ColorMask[BCOMP] ? RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0 : 0) |
-		    (ctx->Color.ColorMask[GCOMP] ? RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0 : 0) |
-		    (ctx->Color.ColorMask[RCOMP] ? RB3D_COLOR_CHANNEL_MASK_RED_MASK0 : 0) |
-		    (ctx->Color.ColorMask[ACOMP] ? RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0 : 0));
+		assert(rrb != 0);
+		BEGIN_BATCH_NO_AUTOSTATE(6);
+		OUT_BATCH_REGSEQ(R300_RB3D_COLOROFFSET0, 1);
+		OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+		OUT_BATCH_REGVAL(R300_RB3D_COLORPITCH0, cbpitch);
+		END_BATCH();
+	}
+#if 1
+	if (flags & (CLEARBUFFER_DEPTH | CLEARBUFFER_STENCIL)) {
+		uint32_t zbpitch = (rrbd->pitch / rrbd->cpp);
+		if (rrbd->bo->flags & RADEON_BO_FLAGS_MACRO_TILE){
+			zbpitch |= R300_DEPTHMACROTILE_ENABLE;
+        }
+		if (rrbd->bo->flags & RADEON_BO_FLAGS_MICRO_TILE){
+            zbpitch |= R300_DEPTHMICROTILE_TILED;
+        }
+		BEGIN_BATCH_NO_AUTOSTATE(6);
+		OUT_BATCH_REGSEQ(R300_ZB_DEPTHOFFSET, 1);
+		OUT_BATCH_RELOC(0, rrbd->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+		OUT_BATCH_REGSEQ(R300_ZB_DEPTHPITCH, 1);
+		if (!r300->radeon.radeonScreen->kernel_mm)
+			OUT_BATCH(zbpitch);
+		else
+			OUT_BATCH_RELOC(zbpitch, rrbd->bo, zbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+		END_BATCH();
+	}
+#endif
+	BEGIN_BATCH_NO_AUTOSTATE(6);
+	OUT_BATCH_REGSEQ(RB3D_COLOR_CHANNEL_MASK, 1);
+	if (flags & CLEARBUFFER_COLOR) {
+		OUT_BATCH((ctx->Color.ColorMask[BCOMP] ? RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0 : 0) |
+			  (ctx->Color.ColorMask[GCOMP] ? RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0 : 0) |
+			  (ctx->Color.ColorMask[RCOMP] ? RB3D_COLOR_CHANNEL_MASK_RED_MASK0 : 0) |
+			  (ctx->Color.ColorMask[ACOMP] ? RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0 : 0));
 	} else {
-		e32(0x0);
+		OUT_BATCH(0);
 	}
 
-	R300_STATECHANGE(r300, zs);
-	reg_start(R300_ZB_CNTL, 2);
 
 	{
 		uint32_t t1, t2;
@@ -146,73 +224,92 @@ static void r300ClearBuffer(r300ContextPtr r300, int flags, int buffer)
 			     R300_S_FRONT_ZFAIL_OP_SHIFT);
 		}
 
-		e32(t1);
-		e32(t2);
-		e32(((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) << R300_STENCILWRITEMASK_SHIFT) |
-		    (ctx->Stencil.Clear & R300_STENCILREF_MASK));
+		OUT_BATCH_REGSEQ(R300_ZB_CNTL, 3);
+		OUT_BATCH(t1);
+		OUT_BATCH(t2);
+		OUT_BATCH(((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) <<
+                   R300_STENCILWRITEMASK_SHIFT) |
+			  (ctx->Stencil.Clear & R300_STENCILREF_MASK));
+		END_BATCH();
 	}
 
-	cmd2 = (drm_r300_cmd_header_t *) r300AllocCmdBuf(r300, 9, __FUNCTION__);
-	cmd2[0].packet3.cmd_type = R300_CMD_PACKET3;
-	cmd2[0].packet3.packet = R300_CMD_PACKET3_CLEAR;
-	cmd2[1].u = r300PackFloat32(dPriv->w / 2.0);
-	cmd2[2].u = r300PackFloat32(dPriv->h / 2.0);
-	cmd2[3].u = r300PackFloat32(ctx->Depth.Clear);
-	cmd2[4].u = r300PackFloat32(1.0);
-	cmd2[5].u = r300PackFloat32(ctx->Color.ClearColor[0]);
-	cmd2[6].u = r300PackFloat32(ctx->Color.ClearColor[1]);
-	cmd2[7].u = r300PackFloat32(ctx->Color.ClearColor[2]);
-	cmd2[8].u = r300PackFloat32(ctx->Color.ClearColor[3]);
+	if (!rmesa->radeon.radeonScreen->kernel_mm) {
+		BEGIN_BATCH_NO_AUTOSTATE(9);
+		OUT_BATCH(cmdpacket3(r300->radeon.radeonScreen, R300_CMD_PACKET3_CLEAR));
+		OUT_BATCH_FLOAT32(dPriv->w / 2.0);
+		OUT_BATCH_FLOAT32(dPriv->h / 2.0);
+		OUT_BATCH_FLOAT32(ctx->Depth.Clear);
+		OUT_BATCH_FLOAT32(1.0);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[0]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[1]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[2]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[3]);
+		END_BATCH();
+	} else {
+		OUT_BATCH(CP_PACKET3(R200_3D_DRAW_IMMD_2, 8));
+		OUT_BATCH(R300_PRIM_TYPE_POINT | R300_PRIM_WALK_RING |
+			  (1 << R300_PRIM_NUM_VERTICES_SHIFT));
+		OUT_BATCH_FLOAT32(dPriv->w / 2.0);
+		OUT_BATCH_FLOAT32(dPriv->h / 2.0);
+		OUT_BATCH_FLOAT32(ctx->Depth.Clear);
+		OUT_BATCH_FLOAT32(1.0);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[0]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[1]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[2]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[3]);
+	}
 
 	r300EmitCacheFlush(rmesa);
-	cp_wait(rmesa, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+	cp_wait(&r300->radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+
+	R300_STATECHANGE(r300, cb);
+	R300_STATECHANGE(r300, cmk);
+	R300_STATECHANGE(r300, zs);
 }
 
 static void r300EmitClearState(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	r300ContextPtr rmesa = r300;
-	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+	BATCH_LOCALS(&r300->radeon);
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
 	int i;
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
-	int has_tcl = 1;
+	int has_tcl;
 	int is_r500 = 0;
 	GLuint vap_cntl;
 
-	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
-		has_tcl = 0;
+	has_tcl = r300->options.hw_tcl_enabled;
 
-        if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-                is_r500 = 1;
+	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+		is_r500 = 1;
 
-
-	/* FIXME: the values written to R300_VAP_INPUT_ROUTE_0_0 and
-	 * R300_VAP_INPUT_ROUTE_0_1 are in fact known, however, the values are
-	 * quite complex; see the functions in r300_emit.c.
+	/* State atom dirty tracking is a little subtle here.
+	 *
+	 * On the one hand, we need to make sure base state is emitted
+	 * here if we start with an empty batch buffer, otherwise clear
+	 * works incorrectly with multiple processes. Therefore, the first
+	 * BEGIN_BATCH cannot be a BEGIN_BATCH_NO_AUTOSTATE.
+	 *
+	 * On the other hand, implicit state emission clears the state atom
+	 * dirty bits, so we have to call R300_STATECHANGE later than the
+	 * first BEGIN_BATCH.
 	 *
-	 * I believe it would be a good idea to extend the functions in
-	 * r300_emit.c so that they can be used to setup the default values for
-	 * these registers, as well as the actual values used for rendering.
+	 * The final trickiness is that, because we change state, we need
+	 * to ensure that any stored swtcl primitives are flushed properly
+	 * before we start changing state. See the R300_NEWPRIM in r300Clear
+	 * for this.
 	 */
-	R300_STATECHANGE(r300, vir[0]);
-	reg_start(R300_VAP_PROG_STREAM_CNTL_0, 0);
+	BEGIN_BATCH(31);
+	OUT_BATCH_REGSEQ(R300_VAP_PROG_STREAM_CNTL_0, 1);
 	if (!has_tcl)
-	    e32(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
+		OUT_BATCH(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
 		 ((R300_LAST_VEC | (2 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)));
 	else
-	    e32(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
+		OUT_BATCH(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
 		 ((R300_LAST_VEC | (1 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)));
 
-	/* disable fog */
-	R300_STATECHANGE(r300, fogs);
-	reg_start(R300_FG_FOG_BLEND, 0);
-	e32(0x0);
-
-	R300_STATECHANGE(r300, vir[1]);
-	reg_start(R300_VAP_PROG_STREAM_CNTL_EXT_0, 0);
-	e32(((((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) |
+	OUT_BATCH_REGVAL(R300_FG_FOG_BLEND, 0);
+	OUT_BATCH_REGVAL(R300_VAP_PROG_STREAM_CNTL_EXT_0,
+	   ((((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) |
 	       (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) |
 	       (R300_SWIZZLE_SELECT_Z << R300_SWIZZLE_SELECT_Z_SHIFT) |
 	       (R300_SWIZZLE_SELECT_W << R300_SWIZZLE_SELECT_W_SHIFT) |
@@ -226,619 +323,460 @@ static void r300EmitClearState(GLcontext * ctx)
 	      << R300_SWIZZLE1_SHIFT)));
 
 	/* R300_VAP_INPUT_CNTL_0, R300_VAP_INPUT_CNTL_1 */
-	R300_STATECHANGE(r300, vic);
-	reg_start(R300_VAP_VTX_STATE_CNTL, 1);
-	e32((R300_SEL_USER_COLOR_0 << R300_COLOR_0_ASSEMBLY_SHIFT));
-	e32(R300_INPUT_CNTL_POS | R300_INPUT_CNTL_COLOR | R300_INPUT_CNTL_TC0);
+	OUT_BATCH_REGSEQ(R300_VAP_VTX_STATE_CNTL, 2);
+	OUT_BATCH((R300_SEL_USER_COLOR_0 << R300_COLOR_0_ASSEMBLY_SHIFT));
+	OUT_BATCH(R300_INPUT_CNTL_POS | R300_INPUT_CNTL_COLOR | R300_INPUT_CNTL_TC0);
 
-	R300_STATECHANGE(r300, vte);
 	/* comes from fglrx startup of clear */
-	reg_start(R300_SE_VTE_CNTL, 1);
-	e32(R300_VTX_W0_FMT | R300_VPORT_X_SCALE_ENA |
-	    R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA |
-	    R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA |
-	    R300_VPORT_Z_OFFSET_ENA);
-	e32(0x8);
+	OUT_BATCH_REGSEQ(R300_SE_VTE_CNTL, 2);
+	OUT_BATCH(R300_VTX_W0_FMT | R300_VPORT_X_SCALE_ENA |
+		  R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA |
+		  R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA |
+		  R300_VPORT_Z_OFFSET_ENA);
+	OUT_BATCH(0x8);
 
-	reg_start(R300_VAP_PSC_SGN_NORM_CNTL, 0);
-	e32(0xaaaaaaaa);
+	OUT_BATCH_REGVAL(R300_VAP_PSC_SGN_NORM_CNTL, 0xaaaaaaaa);
 
-	R300_STATECHANGE(r300, vof);
-	reg_start(R300_VAP_OUTPUT_VTX_FMT_0, 1);
-	e32(R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT |
-	    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT);
-	e32(0x0);		/* no textures */
+	OUT_BATCH_REGSEQ(R300_VAP_OUTPUT_VTX_FMT_0, 2);
+	OUT_BATCH(R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT |
+		  R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT);
+	OUT_BATCH(0); /* no textures */
 
-	R300_STATECHANGE(r300, txe);
-	reg_start(R300_TX_ENABLE, 0);
-	e32(0x0);
+	OUT_BATCH_REGVAL(R300_TX_ENABLE, 0);
 
-	R300_STATECHANGE(r300, vpt);
-	reg_start(R300_SE_VPORT_XSCALE, 5);
-	efloat(1.0);
-	efloat(dPriv->x);
-	efloat(1.0);
-	efloat(dPriv->y);
-	efloat(1.0);
-	efloat(0.0);
+	OUT_BATCH_REGSEQ(R300_SE_VPORT_XSCALE, 6);
+	OUT_BATCH_FLOAT32(1.0);
+	OUT_BATCH_FLOAT32(dPriv->x);
+	OUT_BATCH_FLOAT32(1.0);
+	OUT_BATCH_FLOAT32(dPriv->y);
+	OUT_BATCH_FLOAT32(1.0);
+	OUT_BATCH_FLOAT32(0.0);
 
-	R300_STATECHANGE(r300, at);
-	reg_start(R300_FG_ALPHA_FUNC, 0);
-	e32(0x0);
+	OUT_BATCH_REGVAL(R300_FG_ALPHA_FUNC, 0);
+
+	OUT_BATCH_REGSEQ(R300_RB3D_CBLEND, 2);
+	OUT_BATCH(0x0);
+	OUT_BATCH(0x0);
+	END_BATCH();
 
+	R300_STATECHANGE(r300, vir[0]);
+	R300_STATECHANGE(r300, fogs);
+	R300_STATECHANGE(r300, vir[1]);
+	R300_STATECHANGE(r300, vic);
+	R300_STATECHANGE(r300, vte);
+	R300_STATECHANGE(r300, vof);
+	R300_STATECHANGE(r300, txe);
+	R300_STATECHANGE(r300, vpt);
+	R300_STATECHANGE(r300, at);
 	R300_STATECHANGE(r300, bld);
-	reg_start(R300_RB3D_CBLEND, 1);
-	e32(0x0);
-	e32(0x0);
+	R300_STATECHANGE(r300, ps);
 
 	if (has_tcl) {
-	    R300_STATECHANGE(r300, vap_clip_cntl);
-	    reg_start(R300_VAP_CLIP_CNTL, 0);
-	    e32(R300_PS_UCP_MODE_CLIP_AS_TRIFAN | R300_CLIP_DISABLE);
+		R300_STATECHANGE(r300, vap_clip_cntl);
+
+		BEGIN_BATCH_NO_AUTOSTATE(2);
+		OUT_BATCH_REGVAL(R300_VAP_CLIP_CNTL, R300_PS_UCP_MODE_CLIP_AS_TRIFAN | R300_CLIP_DISABLE);
+		END_BATCH();
         }
 
-	R300_STATECHANGE(r300, ps);
-	reg_start(R300_GA_POINT_SIZE, 0);
-	e32(((dPriv->w * 6) << R300_POINTSIZE_X_SHIFT) |
-	    ((dPriv->h * 6) << R300_POINTSIZE_Y_SHIFT));
+	BEGIN_BATCH_NO_AUTOSTATE(2);
+	OUT_BATCH_REGVAL(R300_GA_POINT_SIZE,
+		((dPriv->w * 6) << R300_POINTSIZE_X_SHIFT) |
+		((dPriv->h * 6) << R300_POINTSIZE_Y_SHIFT));
+	END_BATCH();
 
 	if (!is_r500) {
 		R300_STATECHANGE(r300, ri);
-		reg_start(R300_RS_IP_0, 7);
-		for (i = 0; i < 8; ++i) {
-			e32(R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3));
-		}
-
 		R300_STATECHANGE(r300, rc);
-		/* The second constant is needed to get glxgears display anything .. */
-		reg_start(R300_RS_COUNT, 1);
-		e32((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
-		e32(0x0);
-
 		R300_STATECHANGE(r300, rr);
-		reg_start(R300_RS_INST_0, 0);
-		e32(R300_RS_INST_COL_CN_WRITE);
+
+		BEGIN_BATCH(14);
+		OUT_BATCH_REGSEQ(R300_RS_IP_0, 8);
+		for (i = 0; i < 8; ++i)
+			OUT_BATCH(R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3));
+
+		OUT_BATCH_REGSEQ(R300_RS_COUNT, 2);
+		OUT_BATCH((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
+		OUT_BATCH(0x0);
+
+		OUT_BATCH_REGVAL(R300_RS_INST_0, R300_RS_INST_COL_CN_WRITE);
+		END_BATCH();
 	} else {
 		R300_STATECHANGE(r300, ri);
-		reg_start(R500_RS_IP_0, 7);
+		R300_STATECHANGE(r300, rc);
+		R300_STATECHANGE(r300, rr);
+
+		BEGIN_BATCH(14);
+		OUT_BATCH_REGSEQ(R500_RS_IP_0, 8);
 		for (i = 0; i < 8; ++i) {
-			e32((R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
-			    (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
-			    (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
-			    (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT));
+			OUT_BATCH((R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
+				  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
+				  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
+				  (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT));
 		}
 
-		R300_STATECHANGE(r300, rc);
-		/* The second constant is needed to get glxgears display anything .. */
-		reg_start(R300_RS_COUNT, 1);
-		e32((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
-		e32(0x0);
-
-		R300_STATECHANGE(r300, rr);
-		reg_start(R500_RS_INST_0, 0);
-		e32(R500_RS_INST_COL_CN_WRITE);
+		OUT_BATCH_REGSEQ(R300_RS_COUNT, 2);
+		OUT_BATCH((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
+		OUT_BATCH(0x0);
 
+		OUT_BATCH_REGVAL(R500_RS_INST_0, R500_RS_INST_COL_CN_WRITE);
+		END_BATCH();
 	}
 
 	if (!is_r500) {
 		R300_STATECHANGE(r300, fp);
-		reg_start(R300_US_CONFIG, 2);
-		e32(0x0);
-		e32(0x0);
-		e32(0x0);
-		reg_start(R300_US_CODE_ADDR_0, 3);
-		e32(0x0);
-		e32(0x0);
-		e32(0x0);
-		e32(R300_RGBA_OUT);
-
 		R300_STATECHANGE(r300, fpi[0]);
 		R300_STATECHANGE(r300, fpi[1]);
 		R300_STATECHANGE(r300, fpi[2]);
 		R300_STATECHANGE(r300, fpi[3]);
 
-		reg_start(R300_US_ALU_RGB_INST_0, 0);
-		e32(FP_INSTRC(MAD, FP_ARGC(SRC0C_XYZ), FP_ARGC(ONE), FP_ARGC(ZERO)));
-
-		reg_start(R300_US_ALU_RGB_ADDR_0, 0);
-		e32(FP_SELC(0, NO, XYZ, FP_TMP(0), 0, 0));
-
-		reg_start(R300_US_ALU_ALPHA_INST_0, 0);
-		e32(FP_INSTRA(MAD, FP_ARGA(SRC0A), FP_ARGA(ONE), FP_ARGA(ZERO)));
-
-		reg_start(R300_US_ALU_ALPHA_ADDR_0, 0);
-		e32(FP_SELA(0, NO, W, FP_TMP(0), 0, 0));
+		BEGIN_BATCH(17);
+		OUT_BATCH_REGSEQ(R300_US_CONFIG, 3);
+		OUT_BATCH(0x0);
+		OUT_BATCH(0x0);
+		OUT_BATCH(0x0);
+		OUT_BATCH_REGSEQ(R300_US_CODE_ADDR_0, 4);
+		OUT_BATCH(0x0);
+		OUT_BATCH(0x0);
+		OUT_BATCH(0x0);
+		OUT_BATCH(R300_RGBA_OUT);
+
+		OUT_BATCH_REGVAL(R300_US_ALU_RGB_INST_0,
+			FP_INSTRC(MAD, FP_ARGC(SRC0C_XYZ), FP_ARGC(ONE), FP_ARGC(ZERO)));
+		OUT_BATCH_REGVAL(R300_US_ALU_RGB_ADDR_0,
+			FP_SELC(0, NO, XYZ, FP_TMP(0), 0, 0));
+		OUT_BATCH_REGVAL(R300_US_ALU_ALPHA_INST_0,
+			FP_INSTRA(MAD, FP_ARGA(SRC0A), FP_ARGA(ONE), FP_ARGA(ZERO)));
+		OUT_BATCH_REGVAL(R300_US_ALU_ALPHA_ADDR_0,
+			FP_SELA(0, NO, W, FP_TMP(0), 0, 0));
+		END_BATCH();
 	} else {
- 		R300_STATECHANGE(r300, fp);
- 		reg_start(R500_US_CONFIG, 1);
- 		e32(R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
- 		e32(0x0);
- 		reg_start(R500_US_CODE_ADDR, 2);
- 		e32(R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(1));
- 		e32(R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(1));
- 		e32(R500_US_CODE_OFFSET_ADDR(0));
+		struct radeon_state_atom r500fp;
+		uint32_t _cmd[10];
 
+		R300_STATECHANGE(r300, fp);
 		R300_STATECHANGE(r300, r500fp);
-		r500fp_start_fragment(0, 6);
-
-		e32(R500_INST_TYPE_OUT |
-		    R500_INST_TEX_SEM_WAIT |
-		    R500_INST_LAST |
-		    R500_INST_RGB_OMASK_R |
-		    R500_INST_RGB_OMASK_G |
-		    R500_INST_RGB_OMASK_B |
-		    R500_INST_ALPHA_OMASK |
-		    R500_INST_RGB_CLAMP |
-		    R500_INST_ALPHA_CLAMP);
-
-		e32(R500_RGB_ADDR0(0) |
-		    R500_RGB_ADDR1(0) |
-		    R500_RGB_ADDR1_CONST |
-		    R500_RGB_ADDR2(0) |
-		    R500_RGB_ADDR2_CONST);
-
-		e32(R500_ALPHA_ADDR0(0) |
-		    R500_ALPHA_ADDR1(0) |
-		    R500_ALPHA_ADDR1_CONST |
-		    R500_ALPHA_ADDR2(0) |
-		    R500_ALPHA_ADDR2_CONST);
-
-		e32(R500_ALU_RGB_SEL_A_SRC0 |
-		    R500_ALU_RGB_R_SWIZ_A_R |
-		    R500_ALU_RGB_G_SWIZ_A_G |
-		    R500_ALU_RGB_B_SWIZ_A_B |
-		    R500_ALU_RGB_SEL_B_SRC0 |
-		    R500_ALU_RGB_R_SWIZ_B_R |
-		    R500_ALU_RGB_B_SWIZ_B_G |
-		    R500_ALU_RGB_G_SWIZ_B_B);
-
-		e32(R500_ALPHA_OP_CMP |
-		    R500_ALPHA_SWIZ_A_A |
-		    R500_ALPHA_SWIZ_B_A);
-
-		e32(R500_ALU_RGBA_OP_CMP |
-		    R500_ALU_RGBA_R_SWIZ_0 |
-		    R500_ALU_RGBA_G_SWIZ_0 |
-		    R500_ALU_RGBA_B_SWIZ_0 |
-		    R500_ALU_RGBA_A_SWIZ_0);
+
+		BEGIN_BATCH(7);
+		OUT_BATCH_REGSEQ(R500_US_CONFIG, 2);
+		OUT_BATCH(R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
+		OUT_BATCH(0x0);
+		OUT_BATCH_REGSEQ(R500_US_CODE_ADDR, 3);
+		OUT_BATCH(R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(1));
+		OUT_BATCH(R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(1));
+		OUT_BATCH(R500_US_CODE_OFFSET_ADDR(0));
+		END_BATCH();
+
+		r500fp.check = check_r500fp;
+		r500fp.cmd = _cmd;
+		r500fp.cmd[0] = cmdr500fp(r300->radeon.radeonScreen, 0, 1, 0, 0);
+		r500fp.cmd[1] = R500_INST_TYPE_OUT |
+			R500_INST_TEX_SEM_WAIT |
+			R500_INST_LAST |
+			R500_INST_RGB_OMASK_R |
+			R500_INST_RGB_OMASK_G |
+			R500_INST_RGB_OMASK_B |
+			R500_INST_ALPHA_OMASK |
+			R500_INST_RGB_CLAMP |
+			R500_INST_ALPHA_CLAMP;
+		r500fp.cmd[2] = R500_RGB_ADDR0(0) |
+			R500_RGB_ADDR1(0) |
+			R500_RGB_ADDR1_CONST |
+			R500_RGB_ADDR2(0) |
+			R500_RGB_ADDR2_CONST;
+		r500fp.cmd[3] = R500_ALPHA_ADDR0(0) |
+			R500_ALPHA_ADDR1(0) |
+			R500_ALPHA_ADDR1_CONST |
+			R500_ALPHA_ADDR2(0) |
+			R500_ALPHA_ADDR2_CONST;
+		r500fp.cmd[4] = R500_ALU_RGB_SEL_A_SRC0 |
+			R500_ALU_RGB_R_SWIZ_A_R |
+			R500_ALU_RGB_G_SWIZ_A_G |
+			R500_ALU_RGB_B_SWIZ_A_B |
+			R500_ALU_RGB_SEL_B_SRC0 |
+			R500_ALU_RGB_R_SWIZ_B_R |
+			R500_ALU_RGB_B_SWIZ_B_G |
+			R500_ALU_RGB_G_SWIZ_B_B;
+		r500fp.cmd[5] = R500_ALPHA_OP_CMP |
+			R500_ALPHA_SWIZ_A_A |
+			R500_ALPHA_SWIZ_B_A;
+		r500fp.cmd[6] = R500_ALU_RGBA_OP_CMP |
+			R500_ALU_RGBA_R_SWIZ_0 |
+			R500_ALU_RGBA_G_SWIZ_0 |
+			R500_ALU_RGBA_B_SWIZ_0 |
+			R500_ALU_RGBA_A_SWIZ_0;
+
+		r500fp.cmd[7] = 0;
+		if (r300->radeon.radeonScreen->kernel_mm) {
+			emit_r500fp(ctx, &r500fp);
+		} else {
+			int dwords = r500fp.check(ctx,&r500fp);
+			BEGIN_BATCH_NO_AUTOSTATE(dwords);
+			OUT_BATCH_TABLE(r500fp.cmd, dwords);
+			END_BATCH();
+		}
+
 	}
 
-	reg_start(R300_VAP_PVS_STATE_FLUSH_REG, 0);
-	e32(0x00000000);
+	BEGIN_BATCH(2);
+	OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
+	END_BATCH();
+
 	if (has_tcl) {
-	    vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
+		vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
 			(5 << R300_PVS_NUM_CNTLRS_SHIFT) |
 			(12 << R300_VF_MAX_VTX_NUM_SHIFT));
-	    if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-		vap_cntl |= R500_TCL_STATE_OPTIMIZATION;
-	} else
-	    vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
+		if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+			vap_cntl |= R500_TCL_STATE_OPTIMIZATION;
+	} else {
+		vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
 			(5 << R300_PVS_NUM_CNTLRS_SHIFT) |
 			(5 << R300_VF_MAX_VTX_NUM_SHIFT));
+	}
 
 	if (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV515)
-	    vap_cntl |= (2 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (2 << R300_PVS_NUM_FPUS_SHIFT);
 	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV530) ||
 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560) ||
 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570))
-	    vap_cntl |= (5 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (5 << R300_PVS_NUM_FPUS_SHIFT);
 	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV410) ||
 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420))
-	    vap_cntl |= (6 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (6 << R300_PVS_NUM_FPUS_SHIFT);
 	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R520) ||
 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580))
-	    vap_cntl |= (8 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (8 << R300_PVS_NUM_FPUS_SHIFT);
 	else
-	    vap_cntl |= (4 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (4 << R300_PVS_NUM_FPUS_SHIFT);
 
-	R300_STATECHANGE(rmesa, vap_cntl);
-	reg_start(R300_VAP_CNTL, 0);
-	e32(vap_cntl);
+	R300_STATECHANGE(r300, vap_cntl);
+
+	BEGIN_BATCH(2);
+	OUT_BATCH_REGVAL(R300_VAP_CNTL, vap_cntl);
+	END_BATCH();
 
 	if (has_tcl) {
+        struct radeon_state_atom vpu;
+        uint32_t _cmd[10];
 		R300_STATECHANGE(r300, pvs);
-		reg_start(R300_VAP_PVS_CODE_CNTL_0, 2);
-
-		e32((0 << R300_PVS_FIRST_INST_SHIFT) |
-		    (0 << R300_PVS_XYZW_VALID_INST_SHIFT) |
-		    (1 << R300_PVS_LAST_INST_SHIFT));
-		e32((0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
-		    (0 << R300_PVS_MAX_CONST_ADDR_SHIFT));
-		e32(1 << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
-
+		R300_STATECHANGE(r300, vap_flush);
 		R300_STATECHANGE(r300, vpi);
-		vsf_start_fragment(0x0, 8);
 
-		e32(PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 0, 0xf, PVS_DST_REG_OUT));
-		e32(PVS_SRC_OPERAND(0, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
-		e32(PVS_SRC_OPERAND(0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
-		e32(0x0);
+		BEGIN_BATCH(4);
+		OUT_BATCH_REGSEQ(R300_VAP_PVS_CODE_CNTL_0, 3);
+		OUT_BATCH((0 << R300_PVS_FIRST_INST_SHIFT) |
+			  (0 << R300_PVS_XYZW_VALID_INST_SHIFT) |
+			  (1 << R300_PVS_LAST_INST_SHIFT));
+		OUT_BATCH((0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
+			  (0 << R300_PVS_MAX_CONST_ADDR_SHIFT));
+		OUT_BATCH(1 << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
+		END_BATCH();
+
+		vpu.check = check_vpu;
+		vpu.cmd = _cmd;
+		vpu.cmd[0] = cmdvpu(r300->radeon.radeonScreen, 0, 2);
+
+		vpu.cmd[1] = PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE,
+                                         0, 0xf, PVS_DST_REG_OUT);
+		vpu.cmd[2] = PVS_SRC_OPERAND(0, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y,
+                                      PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W,
+                                      PVS_SRC_REG_INPUT, NEGATE_NONE);
+		vpu.cmd[3] = PVS_SRC_OPERAND(0, PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_REG_INPUT, NEGATE_NONE);
+		vpu.cmd[4] = 0x0;
+
+		vpu.cmd[5] = PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 1, 0xf,
+                                         PVS_DST_REG_OUT);
+		vpu.cmd[6] = PVS_SRC_OPERAND(1, PVS_SRC_SELECT_X,
+                                      PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z,
+                                      PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT,
+                                      NEGATE_NONE);
+		vpu.cmd[7] = PVS_SRC_OPERAND(1, PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_REG_INPUT, NEGATE_NONE);
+		vpu.cmd[8] = 0x0;
+
+		if (r300->radeon.radeonScreen->kernel_mm) {
+			int dwords = r300->hw.vap_flush.check(ctx,&r300->hw.vap_flush);
+			BEGIN_BATCH_NO_AUTOSTATE(dwords);
+			OUT_BATCH_TABLE(r300->hw.vap_flush.cmd, dwords);
+			END_BATCH();
+			emit_vpu(ctx, &vpu);
+		} else {
+			int dwords = vpu.check(ctx,&vpu);
+			BEGIN_BATCH_NO_AUTOSTATE(dwords);
+			OUT_BATCH_TABLE(vpu.cmd, dwords);
+			END_BATCH();
+		}
 
-		e32(PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 1, 0xf, PVS_DST_REG_OUT));
-		e32(PVS_SRC_OPERAND(1, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
-		e32(PVS_SRC_OPERAND(1, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
-		e32(0x0);
 	}
 }
 
-/**
- * Buffer clear
- */
-static void r300Clear(GLcontext * ctx, GLbitfield mask)
+static int r300KernelClear(GLcontext *ctx, GLuint flags)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
-	int flags = 0;
-	int bits = 0;
-	int swapped;
-
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "r300Clear\n");
-
-	{
-		LOCK_HARDWARE(&r300->radeon);
-		UNLOCK_HARDWARE(&r300->radeon);
-		if (dPriv->numClipRects == 0)
-			return;
-	}
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
+	struct radeon_framebuffer *rfb = dPriv->driverPrivate;
+	struct radeon_renderbuffer *rrb;
+	struct radeon_renderbuffer *rrbd;
+	int bits = 0, ret;
 
-	if (mask & BUFFER_BIT_FRONT_LEFT) {
-		flags |= BUFFER_BIT_FRONT_LEFT;
-		mask &= ~BUFFER_BIT_FRONT_LEFT;
-	}
+	/* Make sure it fits there. */
+	radeon_cs_space_reset_bos(r300->radeon.cmdbuf.cs);
 
-	if (mask & BUFFER_BIT_BACK_LEFT) {
-		flags |= BUFFER_BIT_BACK_LEFT;
-		mask &= ~BUFFER_BIT_BACK_LEFT;
+	if (flags & BUFFER_BIT_COLOR0) {
+		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_COLOR0);
+		radeon_cs_space_add_persistent_bo(r300->radeon.cmdbuf.cs,
+						  rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM);
 	}
 
-	if (mask & BUFFER_BIT_DEPTH) {
-		bits |= CLEARBUFFER_DEPTH;
-		mask &= ~BUFFER_BIT_DEPTH;
+	if (flags & BUFFER_BIT_FRONT_LEFT) {
+		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_FRONT_LEFT);
+		radeon_cs_space_add_persistent_bo(r300->radeon.cmdbuf.cs,
+						  rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM);
 	}
 
-	if ((mask & BUFFER_BIT_STENCIL) && r300->state.stencil.hw_stencil) {
-		bits |= CLEARBUFFER_STENCIL;
-		mask &= ~BUFFER_BIT_STENCIL;
+	if (flags & BUFFER_BIT_BACK_LEFT) {
+		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_BACK_LEFT);
+		radeon_cs_space_add_persistent_bo(r300->radeon.cmdbuf.cs,
+						  rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM);
 	}
 
-	if (mask) {
-		if (RADEON_DEBUG & DEBUG_FALLBACKS)
-			fprintf(stderr, "%s: swrast clear, mask: %x\n",
-				__FUNCTION__, mask);
-		_swrast_Clear(ctx, mask);
+	rrbd = radeon_get_renderbuffer(&rfb->base, BUFFER_DEPTH);
+	if (rrbd) {
+		radeon_cs_space_add_persistent_bo(r300->radeon.cmdbuf.cs,
+						  rrbd->bo, 0, RADEON_GEM_DOMAIN_VRAM);
 	}
 
-	swapped = r300->radeon.sarea->pfCurrentPage == 1;
+	ret = radeon_cs_space_check(r300->radeon.cmdbuf.cs);
+	if (ret)
+	  return -1;
 
-	/* Make sure it fits there. */
-	r300EnsureCmdBufSpace(r300, 421 * 3, __FUNCTION__);
+	rcommonEnsureCmdBufSpace(&r300->radeon, 421 * 3, __FUNCTION__);
 	if (flags || bits)
 		r300EmitClearState(ctx);
 
+	rrbd = radeon_get_renderbuffer(&rfb->base, BUFFER_DEPTH);
+	if (rrbd && (flags & BUFFER_BIT_DEPTH))
+		bits |= CLEARBUFFER_DEPTH;
+
+	if (rrbd && (flags & BUFFER_BIT_STENCIL))
+		bits |= CLEARBUFFER_STENCIL;
+
+	if (flags & BUFFER_BIT_COLOR0) {
+		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_COLOR0);
+		r300ClearBuffer(r300, CLEARBUFFER_COLOR, rrb, NULL);
+		bits = 0;
+	}
+
 	if (flags & BUFFER_BIT_FRONT_LEFT) {
-		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, swapped);
+		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_FRONT_LEFT);
+		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, rrb, rrbd);
 		bits = 0;
 	}
 
 	if (flags & BUFFER_BIT_BACK_LEFT) {
-		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, swapped ^ 1);
+		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_BACK_LEFT);
+		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, rrb, rrbd);
 		bits = 0;
 	}
 
 	if (bits)
-		r300ClearBuffer(r300, bits, 0);
-
-}
-
-void r300Flush(GLcontext * ctx)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+		r300ClearBuffer(r300, bits, NULL, rrbd);
 
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush( rmesa );
-
-	if (rmesa->cmdbuf.count_used > rmesa->cmdbuf.count_reemit)
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
+	COMMIT_BATCH();
+	return 0;
 }
 
-#ifdef USER_BUFFERS
-#include "r300_mem.h"
-
-void r300RefillCurrentDmaRegion(r300ContextPtr rmesa, int size)
+/**
+ * Buffer clear
+ */
+static void r300Clear(GLcontext * ctx, GLbitfield mask)
 {
-	struct r300_dma_buffer *dmabuf;
-	size = MAX2(size, RADEON_BUFFER_SIZE * 16);
-
-	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
-		fprintf(stderr, "%s\n", __FUNCTION__);
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
+	const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask);
+	GLbitfield swrast_mask = 0, tri_mask = 0;
+	int i, ret;
+	struct gl_framebuffer *fb = ctx->DrawBuffer;
 
-	if (rmesa->dma.flush) {
-		rmesa->dma.flush(rmesa);
-	}
+	if (RADEON_DEBUG & RADEON_IOCTL)
+		fprintf(stderr, "r300Clear\n");
 
-	if (rmesa->dma.current.buf) {
-#ifdef USER_BUFFERS
-		r300_mem_use(rmesa, rmesa->dma.current.buf->id);
-#endif
-		r300ReleaseDmaRegion(rmesa, &rmesa->dma.current, __FUNCTION__);
+	if (!r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+		LOCK_HARDWARE(&r300->radeon);
+		UNLOCK_HARDWARE(&r300->radeon);
+		if (dPriv->numClipRects == 0)
+			return;
 	}
-	if (rmesa->dma.nr_released_bufs > 4)
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
 
-	dmabuf = CALLOC_STRUCT(r300_dma_buffer);
-	dmabuf->buf = (void *)1;	/* hack */
-	dmabuf->refcount = 1;
-
-	dmabuf->id = r300_mem_alloc(rmesa, 4, size);
-	if (dmabuf->id == 0) {
-		LOCK_HARDWARE(&rmesa->radeon);	/* no need to validate */
-
-		r300FlushCmdBufLocked(rmesa, __FUNCTION__);
-		radeonWaitForIdleLocked(&rmesa->radeon);
+	/* Flush swtcl vertices if necessary, because we will change hardware
+	 * state during clear. See also the state-related comment in
+	 * r300EmitClearState.
+	 */
+	R300_NEWPRIM(r300);
 
-		dmabuf->id = r300_mem_alloc(rmesa, 4, size);
+	if (colorMask == ~0)
+	  tri_mask |= (mask & BUFFER_BITS_COLOR);
+	else
+	  tri_mask |= (mask & (BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_BACK_LEFT));
 
-		UNLOCK_HARDWARE(&rmesa->radeon);
 
-		if (dmabuf->id == 0) {
-			fprintf(stderr,
-				"Error: Could not get dma buffer... exiting\n");
-			_mesa_exit(-1);
-		}
+	/* HW stencil */
+	if (mask & BUFFER_BIT_STENCIL) {
+		tri_mask |= BUFFER_BIT_STENCIL;
 	}
 
-	rmesa->dma.current.buf = dmabuf;
-	rmesa->dma.current.address = r300_mem_ptr(rmesa, dmabuf->id);
-	rmesa->dma.current.end = size;
-	rmesa->dma.current.start = 0;
-	rmesa->dma.current.ptr = 0;
-}
-
-void r300ReleaseDmaRegion(r300ContextPtr rmesa,
-			  struct r300_dma_region *region, const char *caller)
-{
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s from %s\n", __FUNCTION__, caller);
-
-	if (!region->buf)
-		return;
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush(rmesa);
-
-	if (--region->buf->refcount == 0) {
-		r300_mem_free(rmesa, region->buf->id);
-		FREE(region->buf);
-		rmesa->dma.nr_released_bufs++;
+	/* HW depth */
+	if (mask & BUFFER_BIT_DEPTH) {
+    	        tri_mask |= BUFFER_BIT_DEPTH;
 	}
 
-	region->buf = 0;
-	region->start = 0;
-}
-
-/* Allocates a region from rmesa->dma.current.  If there isn't enough
- * space in current, grab a new buffer (and discard what was left of current)
- */
-void r300AllocDmaRegion(r300ContextPtr rmesa,
-			struct r300_dma_region *region,
-			int bytes, int alignment)
-{
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush(rmesa);
-
-	if (region->buf)
-		r300ReleaseDmaRegion(rmesa, region, __FUNCTION__);
-
-	alignment--;
-	rmesa->dma.current.start = rmesa->dma.current.ptr =
-	    (rmesa->dma.current.ptr + alignment) & ~alignment;
-
-	if (rmesa->dma.current.ptr + bytes > rmesa->dma.current.end)
-		r300RefillCurrentDmaRegion(rmesa, (bytes + 0x7) & ~0x7);
-
-	region->start = rmesa->dma.current.start;
-	region->ptr = rmesa->dma.current.start;
-	region->end = rmesa->dma.current.start + bytes;
-	region->address = rmesa->dma.current.address;
-	region->buf = rmesa->dma.current.buf;
-	region->buf->refcount++;
-
-	rmesa->dma.current.ptr += bytes;	/* bug - if alignment > 7 */
-	rmesa->dma.current.start =
-	    rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;
-
-	assert(rmesa->dma.current.ptr <= rmesa->dma.current.end);
-}
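+	/* If we're doing a tri pass for depth/stencil, include a likely color
+	 * buffer with it.  Buffers whose renderbuffer has no ClassID set are
+	 * dropped from the tri pass and left to the swrast clear instead. */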
 
-#else
-static void r300RefillCurrentDmaRegion(r300ContextPtr rmesa)
-{
-	struct r300_dma_buffer *dmabuf;
-	int fd = rmesa->radeon.dri.fd;
-	int index = 0;
-	int size = 0;
-	drmDMAReq dma;
-	int ret;
-
-	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	if (rmesa->dma.flush) {
-		rmesa->dma.flush(rmesa);
+	for (i = 0; i < BUFFER_COUNT; i++) {
+	  GLuint bufBit = 1 << i;
+	  if ((tri_mask) & bufBit) {
+	    if (!fb->Attachment[i].Renderbuffer->ClassID) {
+	      tri_mask &= ~bufBit;
+	      swrast_mask |= bufBit;
+	    }
+	  }
 	}
 
-	if (rmesa->dma.current.buf)
-		r300ReleaseDmaRegion(rmesa, &rmesa->dma.current, __FUNCTION__);
-
-	if (rmesa->dma.nr_released_bufs > 4)
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-
-	dma.context = rmesa->radeon.dri.hwContext;
-	dma.send_count = 0;
-	dma.send_list = NULL;
-	dma.send_sizes = NULL;
-	dma.flags = 0;
-	dma.request_count = 1;
-	dma.request_size = RADEON_BUFFER_SIZE;
-	dma.request_list = &index;
-	dma.request_sizes = &size;
-	dma.granted_count = 0;
-
-	LOCK_HARDWARE(&rmesa->radeon);	/* no need to validate */
-
-	ret = drmDMA(fd, &dma);
-
-	if (ret != 0) {
-		/* Try to release some buffers and wait until we can't get any more */
-		if (rmesa->dma.nr_released_bufs) {
-			r300FlushCmdBufLocked(rmesa, __FUNCTION__);
-		}
-
-		if (RADEON_DEBUG & DEBUG_DMA)
-			fprintf(stderr, "Waiting for buffers\n");
-
-		radeonWaitForIdleLocked(&rmesa->radeon);
-		ret = drmDMA(fd, &dma);
-
-		if (ret != 0) {
-			UNLOCK_HARDWARE(&rmesa->radeon);
-			fprintf(stderr,
-				"Error: Could not get dma buffer... exiting\n");
-			_mesa_exit(-1);
+	/* SW fallback clearing */
+	swrast_mask = mask & ~tri_mask;
+
+	ret = 0;
+	if (tri_mask) {
+		if (r300->radeon.radeonScreen->kernel_mm)
+			radeonUserClear(ctx, tri_mask);
+		else {
+			/* if the kernel clear fails due to size constraints, fall back to swrast */
+			ret = r300KernelClear(ctx, tri_mask);
+			if (ret < 0)
+				swrast_mask |= tri_mask;
 		}
 	}
 
-	UNLOCK_HARDWARE(&rmesa->radeon);
-
-	if (RADEON_DEBUG & DEBUG_DMA)
-		fprintf(stderr, "Allocated buffer %d\n", index);
-
-	dmabuf = CALLOC_STRUCT(r300_dma_buffer);
-	dmabuf->buf = &rmesa->radeon.radeonScreen->buffers->list[index];
-	dmabuf->refcount = 1;
-
-	rmesa->dma.current.buf = dmabuf;
-	rmesa->dma.current.address = dmabuf->buf->address;
-	rmesa->dma.current.end = dmabuf->buf->total;
-	rmesa->dma.current.start = 0;
-	rmesa->dma.current.ptr = 0;
-}
-
-void r300ReleaseDmaRegion(r300ContextPtr rmesa,
-			  struct r300_dma_region *region, const char *caller)
-{
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s from %s\n", __FUNCTION__, caller);
-
-	if (!region->buf)
-		return;
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush(rmesa);
-
-	if (--region->buf->refcount == 0) {
-		drm_radeon_cmd_header_t *cmd;
-
-		if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
-			fprintf(stderr, "%s -- DISCARD BUF %d\n",
-				__FUNCTION__, region->buf->buf->idx);
-		cmd =
-		    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa,
-								sizeof
-								(*cmd) / 4,
-								__FUNCTION__);
-		cmd->dma.cmd_type = R300_CMD_DMA_DISCARD;
-		cmd->dma.buf_idx = region->buf->buf->idx;
-
-		FREE(region->buf);
-		rmesa->dma.nr_released_bufs++;
+	if (swrast_mask) {
+		if (RADEON_DEBUG & RADEON_FALLBACKS)
+			fprintf(stderr, "%s: swrast clear, mask: %x\n",
+				__FUNCTION__, swrast_mask);
+		_swrast_Clear(ctx, swrast_mask);
 	}
-
-	region->buf = 0;
-	region->start = 0;
-}
-
-/* Allocates a region from rmesa->dma.current.  If there isn't enough
- * space in current, grab a new buffer (and discard what was left of current)
- */
-void r300AllocDmaRegion(r300ContextPtr rmesa,
-			struct r300_dma_region *region,
-			int bytes, int alignment)
-{
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush(rmesa);
-
-	if (region->buf)
-		r300ReleaseDmaRegion(rmesa, region, __FUNCTION__);
-
-	alignment--;
-	rmesa->dma.current.start = rmesa->dma.current.ptr =
-	    (rmesa->dma.current.ptr + alignment) & ~alignment;
-
-	if (rmesa->dma.current.ptr + bytes > rmesa->dma.current.end)
-		r300RefillCurrentDmaRegion(rmesa);
-
-	region->start = rmesa->dma.current.start;
-	region->ptr = rmesa->dma.current.start;
-	region->end = rmesa->dma.current.start + bytes;
-	region->address = rmesa->dma.current.address;
-	region->buf = rmesa->dma.current.buf;
-	region->buf->refcount++;
-
-	rmesa->dma.current.ptr += bytes;	/* bug - if alignment > 7 */
-	rmesa->dma.current.start =
-	    rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;
-
-	assert(rmesa->dma.current.ptr <= rmesa->dma.current.end);
-}
-
-#endif
-
-GLboolean r300IsGartMemory(r300ContextPtr rmesa, const GLvoid * pointer,
-			   GLint size)
-{
-	int offset =
-	    (char *)pointer -
-	    (char *)rmesa->radeon.radeonScreen->gartTextures.map;
-	int valid = (size >= 0 && offset >= 0
-		     && offset + size <
-		     rmesa->radeon.radeonScreen->gartTextures.size);
-
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "r300IsGartMemory( %p ) : %d\n", pointer,
-			valid);
-
-	return valid;
-}
-
-GLuint r300GartOffsetFromVirtual(r300ContextPtr rmesa, const GLvoid * pointer)
-{
-	int offset =
-	    (char *)pointer -
-	    (char *)rmesa->radeon.radeonScreen->gartTextures.map;
-
-	//fprintf(stderr, "offset=%08x\n", offset);
-
-	if (offset < 0
-	    || offset > rmesa->radeon.radeonScreen->gartTextures.size)
-		return ~0;
-	else
-		return rmesa->radeon.radeonScreen->gart_texture_offset + offset;
 }
 
 void r300InitIoctlFuncs(struct dd_function_table *functions)
 {
 	functions->Clear = r300Clear;
 	functions->Finish = radeonFinish;
-	functions->Flush = r300Flush;
+	functions->Flush = radeonFlush;
 }
diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.h b/src/mesa/drivers/dri/r300/r300_ioctl.h
index e1143fb6c3..3abfa71a6e 100644
--- a/src/mesa/drivers/dri/r300/r300_ioctl.h
+++ b/src/mesa/drivers/dri/r300/r300_ioctl.h
@@ -39,22 +39,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_context.h"
 #include "radeon_drm.h"
 
-extern GLboolean r300IsGartMemory(r300ContextPtr rmesa,
-				  const GLvoid * pointer, GLint size);
-
-extern GLuint r300GartOffsetFromVirtual(r300ContextPtr rmesa,
-					const GLvoid * pointer);
-
-extern void r300Flush(GLcontext * ctx);
-
-extern void r300ReleaseDmaRegion(r300ContextPtr rmesa,
-				 struct r300_dma_region *region,
-				 const char *caller);
-extern void r300AllocDmaRegion(r300ContextPtr rmesa,
-			       struct r300_dma_region *region, int bytes,
-			       int alignment);
-
 extern void r300InitIoctlFuncs(struct dd_function_table *functions);
 
-extern void r300RefillCurrentDmaRegion(r300ContextPtr rmesa, int size);
 #endif				/* __R300_IOCTL_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_mem.c b/src/mesa/drivers/dri/r300/r300_mem.c
deleted file mode 100644
index f8f9d4fcdf..0000000000
--- a/src/mesa/drivers/dri/r300/r300_mem.c
+++ /dev/null
@@ -1,385 +0,0 @@
-/*
- * Copyright (C) 2005 Aapo Tahkola.
- *
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial
- * portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-/**
- * \file
- *
- * \author Aapo Tahkola <aet@rasterburn.org>
- */
-
-#include <unistd.h>
-
-#include "r300_context.h"
-#include "r300_cmdbuf.h"
-#include "r300_ioctl.h"
-#include "r300_mem.h"
-#include "radeon_ioctl.h"
-
-#ifdef USER_BUFFERS
-
-static void resize_u_list(r300ContextPtr rmesa)
-{
-	void *temp;
-	int nsize;
-
-	temp = rmesa->rmm->u_list;
-	nsize = rmesa->rmm->u_size * 2;
-
-	rmesa->rmm->u_list = _mesa_malloc(nsize * sizeof(*rmesa->rmm->u_list));
-	_mesa_memset(rmesa->rmm->u_list, 0,
-		     nsize * sizeof(*rmesa->rmm->u_list));
-
-	if (temp) {
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-
-		_mesa_memcpy(rmesa->rmm->u_list, temp,
-			     rmesa->rmm->u_size * sizeof(*rmesa->rmm->u_list));
-		_mesa_free(temp);
-	}
-
-	rmesa->rmm->u_size = nsize;
-}
-
-void r300_mem_init(r300ContextPtr rmesa)
-{
-	rmesa->rmm = malloc(sizeof(struct r300_memory_manager));
-	memset(rmesa->rmm, 0, sizeof(struct r300_memory_manager));
-
-	rmesa->rmm->u_size = 128;
-	resize_u_list(rmesa);
-}
-
-void r300_mem_destroy(r300ContextPtr rmesa)
-{
-	_mesa_free(rmesa->rmm->u_list);
-	rmesa->rmm->u_list = NULL;
-
-	_mesa_free(rmesa->rmm);
-	rmesa->rmm = NULL;
-}
-
-void *r300_mem_ptr(r300ContextPtr rmesa, int id)
-{
-	assert(id <= rmesa->rmm->u_last);
-	return rmesa->rmm->u_list[id].ptr;
-}
-
-int r300_mem_find(r300ContextPtr rmesa, void *ptr)
-{
-	int i;
-
-	for (i = 1; i < rmesa->rmm->u_size + 1; i++)
-		if (rmesa->rmm->u_list[i].ptr &&
-		    ptr >= rmesa->rmm->u_list[i].ptr &&
-		    ptr <
-		    rmesa->rmm->u_list[i].ptr + rmesa->rmm->u_list[i].size)
-			break;
-
-	if (i < rmesa->rmm->u_size + 1)
-		return i;
-
-	fprintf(stderr, "%p failed\n", ptr);
-	return 0;
-}
-
-//#define MM_DEBUG
-int r300_mem_alloc(r300ContextPtr rmesa, int alignment, int size)
-{
-	drm_radeon_mem_alloc_t alloc;
-	int offset = 0, ret;
-	int i, free = -1;
-	int done_age;
-	drm_radeon_mem_free_t memfree;
-	int tries = 0;
-	static int bytes_wasted = 0, allocated = 0;
-
-	if (size < 4096)
-		bytes_wasted += 4096 - size;
-
-	allocated += size;
-
-#if 0
-	static int t = 0;
-	if (t != time(NULL)) {
-		t = time(NULL);
-		fprintf(stderr, "slots used %d, wasted %d kb, allocated %d\n",
-			rmesa->rmm->u_last, bytes_wasted / 1024,
-			allocated / 1024);
-	}
-#endif
-
-	memfree.region = RADEON_MEM_REGION_GART;
-
-      again:
-
-	done_age = radeonGetAge((radeonContextPtr) rmesa);
-
-	if (rmesa->rmm->u_last + 1 >= rmesa->rmm->u_size)
-		resize_u_list(rmesa);
-
-	for (i = rmesa->rmm->u_last + 1; i > 0; i--) {
-		if (rmesa->rmm->u_list[i].ptr == NULL) {
-			free = i;
-			continue;
-		}
-
-		if (rmesa->rmm->u_list[i].h_pending == 0 &&
-		    rmesa->rmm->u_list[i].pending
-		    && rmesa->rmm->u_list[i].age <= done_age) {
-			memfree.region_offset =
-			    (char *)rmesa->rmm->u_list[i].ptr -
-			    (char *)rmesa->radeon.radeonScreen->gartTextures.
-			    map;
-
-			ret =
-			    drmCommandWrite(rmesa->radeon.radeonScreen->
-					    driScreen->fd, DRM_RADEON_FREE,
-					    &memfree, sizeof(memfree));
-
-			if (ret) {
-				fprintf(stderr, "Failed to free at %p\n",
-					rmesa->rmm->u_list[i].ptr);
-				fprintf(stderr, "ret = %s\n", strerror(-ret));
-				exit(1);
-			} else {
-#ifdef MM_DEBUG
-				fprintf(stderr, "really freed %d at age %x\n",
-					i,
-					radeonGetAge((radeonContextPtr) rmesa));
-#endif
-				if (i == rmesa->rmm->u_last)
-					rmesa->rmm->u_last--;
-
-				if (rmesa->rmm->u_list[i].size < 4096)
-					bytes_wasted -=
-					    4096 - rmesa->rmm->u_list[i].size;
-
-				allocated -= rmesa->rmm->u_list[i].size;
-				rmesa->rmm->u_list[i].pending = 0;
-				rmesa->rmm->u_list[i].ptr = NULL;
-				free = i;
-			}
-		}
-	}
-	rmesa->rmm->u_head = i;
-
-	if (free == -1) {
-		WARN_ONCE("Ran out of slots!\n");
-		//usleep(100);
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-		tries++;
-		if (tries > 100) {
-			WARN_ONCE("Ran out of slots!\n");
-			exit(1);
-		}
-		goto again;
-	}
-
-	alloc.region = RADEON_MEM_REGION_GART;
-	alloc.alignment = alignment;
-	alloc.size = size;
-	alloc.region_offset = &offset;
-
-	ret =
-	    drmCommandWriteRead(rmesa->radeon.dri.fd, DRM_RADEON_ALLOC, &alloc,
-				sizeof(alloc));
-	if (ret) {
-#if 0
-		WARN_ONCE("Ran out of mem!\n");
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-		//usleep(100);
-		tries2++;
-		tries = 0;
-		if (tries2 > 100) {
-			WARN_ONCE("Ran out of GART memory!\n");
-			exit(1);
-		}
-		goto again;
-#else
-		WARN_ONCE
-		    ("Ran out of GART memory (for %d)!\nPlease consider adjusting GARTSize option.\n",
-		     size);
-		return 0;
-#endif
-	}
-
-	i = free;
-
-	if (i > rmesa->rmm->u_last)
-		rmesa->rmm->u_last = i;
-
-	rmesa->rmm->u_list[i].ptr =
-	    ((GLubyte *) rmesa->radeon.radeonScreen->gartTextures.map) + offset;
-	rmesa->rmm->u_list[i].size = size;
-	rmesa->rmm->u_list[i].age = 0;
-	//fprintf(stderr, "alloc %p at id %d\n", rmesa->rmm->u_list[i].ptr, i);
-
-#ifdef MM_DEBUG
-	fprintf(stderr, "allocated %d at age %x\n", i,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-
-	return i;
-}
-
-void r300_mem_use(r300ContextPtr rmesa, int id)
-{
-	uint64_t ull;
-#ifdef MM_DEBUG
-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-	drm_r300_cmd_header_t *cmd;
-
-	assert(id <= rmesa->rmm->u_last);
-
-	if (id == 0)
-		return;
-
-	cmd =
-	    (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa,
-						      2 + sizeof(ull) / 4,
-						      __FUNCTION__);
-	cmd[0].scratch.cmd_type = R300_CMD_SCRATCH;
-	cmd[0].scratch.reg = R300_MEM_SCRATCH;
-	cmd[0].scratch.n_bufs = 1;
-	cmd[0].scratch.flags = 0;
-	cmd++;
-
-	ull = (uint64_t) (intptr_t) & rmesa->rmm->u_list[id].age;
-	_mesa_memcpy(cmd, &ull, sizeof(ull));
-	cmd += sizeof(ull) / 4;
-
-	cmd[0].u = /*id */ 0;
-
-	LOCK_HARDWARE(&rmesa->radeon);	/* Protect from DRM. */
-	rmesa->rmm->u_list[id].h_pending++;
-	UNLOCK_HARDWARE(&rmesa->radeon);
-}
-
-unsigned long r300_mem_offset(r300ContextPtr rmesa, int id)
-{
-	unsigned long offset;
-
-	assert(id <= rmesa->rmm->u_last);
-
-	offset = (char *)rmesa->rmm->u_list[id].ptr -
-	    (char *)rmesa->radeon.radeonScreen->gartTextures.map;
-	offset += rmesa->radeon.radeonScreen->gart_texture_offset;
-
-	return offset;
-}
-
-void *r300_mem_map(r300ContextPtr rmesa, int id, int access)
-{
-#ifdef MM_DEBUG
-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-	void *ptr;
-	int tries = 0;
-
-	assert(id <= rmesa->rmm->u_last);
-
-	if (access == R300_MEM_R) {
-
-		if (rmesa->rmm->u_list[id].mapped == 1)
-			WARN_ONCE("buffer %d already mapped\n", id);
-
-		rmesa->rmm->u_list[id].mapped = 1;
-		ptr = r300_mem_ptr(rmesa, id);
-
-		return ptr;
-	}
-
-	if (rmesa->rmm->u_list[id].h_pending)
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-
-	if (rmesa->rmm->u_list[id].h_pending) {
-		return NULL;
-	}
-
-	while (rmesa->rmm->u_list[id].age >
-	       radeonGetAge((radeonContextPtr) rmesa) && tries++ < 1000)
-		usleep(10);
-
-	if (tries >= 1000) {
-		fprintf(stderr, "Idling failed (%x vs %x)\n",
-			rmesa->rmm->u_list[id].age,
-			radeonGetAge((radeonContextPtr) rmesa));
-		return NULL;
-	}
-
-	if (rmesa->rmm->u_list[id].mapped == 1)
-		WARN_ONCE("buffer %d already mapped\n", id);
-
-	rmesa->rmm->u_list[id].mapped = 1;
-	ptr = r300_mem_ptr(rmesa, id);
-
-	return ptr;
-}
-
-void r300_mem_unmap(r300ContextPtr rmesa, int id)
-{
-#ifdef MM_DEBUG
-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-
-	assert(id <= rmesa->rmm->u_last);
-
-	if (rmesa->rmm->u_list[id].mapped == 0)
-		WARN_ONCE("buffer %d not mapped\n", id);
-
-	rmesa->rmm->u_list[id].mapped = 0;
-}
-
-void r300_mem_free(r300ContextPtr rmesa, int id)
-{
-#ifdef MM_DEBUG
-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-
-	assert(id <= rmesa->rmm->u_last);
-
-	if (id == 0)
-		return;
-
-	if (rmesa->rmm->u_list[id].ptr == NULL) {
-		WARN_ONCE("Not allocated!\n");
-		return;
-	}
-
-	if (rmesa->rmm->u_list[id].pending) {
-		WARN_ONCE("%p already pended!\n", rmesa->rmm->u_list[id].ptr);
-		return;
-	}
-
-	rmesa->rmm->u_list[id].pending = 1;
-}
-#endif
diff --git a/src/mesa/drivers/dri/r300/r300_mem.h b/src/mesa/drivers/dri/r300/r300_mem.h
deleted file mode 100644
index 625a7f6d8d..0000000000
--- a/src/mesa/drivers/dri/r300/r300_mem.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef __R300_MEM_H__
-#define __R300_MEM_H__
-
-//#define R300_MEM_PDL 0
-#define R300_MEM_UL 1
-
-#define R300_MEM_R 1
-#define R300_MEM_W 2
-#define R300_MEM_RW (R300_MEM_R | R300_MEM_W)
-
-#define R300_MEM_SCRATCH 2
-
-struct r300_memory_manager {
-	struct {
-		void *ptr;
-		uint32_t size;
-		uint32_t age;
-		uint32_t h_pending;
-		int pending;
-		int mapped;
-	} *u_list;
-	int u_head, u_size, u_last;
-
-};
-
-extern void r300_mem_init(r300ContextPtr rmesa);
-extern void r300_mem_destroy(r300ContextPtr rmesa);
-extern void *r300_mem_ptr(r300ContextPtr rmesa, int id);
-extern int r300_mem_find(r300ContextPtr rmesa, void *ptr);
-extern int r300_mem_alloc(r300ContextPtr rmesa, int alignment, int size);
-extern void r300_mem_use(r300ContextPtr rmesa, int id);
-extern unsigned long r300_mem_offset(r300ContextPtr rmesa, int id);
-extern void *r300_mem_map(r300ContextPtr rmesa, int id, int access);
-extern void r300_mem_unmap(r300ContextPtr rmesa, int id);
-extern void r300_mem_free(r300ContextPtr rmesa, int id);
-
-#endif
diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h
index 7c6485ef60..b9ccd098dc 100644
--- a/src/mesa/drivers/dri/r300/r300_reg.h
+++ b/src/mesa/drivers/dri/r300/r300_reg.h
@@ -656,7 +656,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #	define R300_GB_FOG_SELECT_C3A           (3 << 0)
 #	define R300_GB_FOG_SELECT_1_1_W         (4 << 0)
 #	define R300_GB_FOG_SELECT_Z		(5 << 0)
-#	define R300_GB_DEPTH_SELECT_Z		(0 << 3
+#	define R300_GB_DEPTH_SELECT_Z		(0 << 3)
 #	define R300_GB_DEPTH_SELECT_1_1_W	(1 << 3)
 #	define R300_GB_W_SELECT_1_W		(0 << 4)
 #	define R300_GB_W_SELECT_1		(1 << 4)
@@ -730,8 +730,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R500_RS_IP_TEX_PTR_Q_SHIFT 			18
 #define R500_RS_IP_COL_PTR_SHIFT 			24
 #define R500_RS_IP_COL_FMT_SHIFT 			27
-#	define R500_RS_COL_PTR(x)		        (x << 24)
-#       define R500_RS_COL_FMT(x)                       (x << 27)
+#	define R500_RS_COL_PTR(x)		        ((x) << 24)
+#       define R500_RS_COL_FMT(x)                       ((x) << 27)
 /* gap */
 #define R500_RS_IP_OFFSET_DIS 				(0 << 31)
 #define R500_RS_IP_OFFSET_EN 				(1 << 31)
@@ -1022,15 +1022,13 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 	R300_GA_COLOR_CONTROL_RGB0_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA0_SHADING_GOURAUD | \
 	R300_GA_COLOR_CONTROL_RGB1_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA1_SHADING_GOURAUD | \
 	R300_GA_COLOR_CONTROL_RGB2_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA2_SHADING_GOURAUD | \
-	R300_GA_COLOR_CONTROL_RGB3_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD | \
-	R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST )
+	R300_GA_COLOR_CONTROL_RGB3_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD)
 /** TODO: might be candidate for removal, the GOURAUD stuff also looks buggy to me */
 #	define R300_RE_SHADE_MODEL_FLAT     ( \
 	R300_GA_COLOR_CONTROL_RGB0_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA0_SHADING_FLAT | \
 	R300_GA_COLOR_CONTROL_RGB1_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA1_SHADING_GOURAUD | \
 	R300_GA_COLOR_CONTROL_RGB2_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA2_SHADING_FLAT | \
-	R300_GA_COLOR_CONTROL_RGB3_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD | \
-	R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST )
+	R300_GA_COLOR_CONTROL_RGB3_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD)
 
 /* Specifies red & green components of fill color -- S312 format -- Backwards comp. */
 #define R300_GA_SOLID_RG                         0x427c
@@ -1128,6 +1126,13 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /* SU Depth Offset value */
 #define R300_SU_DEPTH_OFFSET                0x42c4
 
+#define R300_SU_REG_DEST		    0x42c8
+#	define R300_RASTER_PIPE_SELECT_0	(1 << 0)
+#	define R300_RASTER_PIPE_SELECT_1	(1 << 1)
+#	define R300_RASTER_PIPE_SELECT_2	(1 << 2)
+#	define R300_RASTER_PIPE_SELECT_3	(1 << 3)
+#	define R300_RASTER_PIPE_SELECT_ALL	0xf
+
 
 /* BEGIN: Rasterization / Interpolators - many guesses */
 
@@ -1172,9 +1177,9 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_RS_IP_3				        0x431C
 #       define R300_RS_INTERP_SRC_SHIFT          2 /* TODO: check for removal */
 #       define R300_RS_INTERP_SRC_MASK           (7 << 2) /* TODO: check for removal */
-#	define R300_RS_TEX_PTR(x)		        (x << 0)
-#	define R300_RS_COL_PTR(x)		        (x << 6)
-#	define R300_RS_COL_FMT(x)		        (x << 9)
+#	define R300_RS_TEX_PTR(x)		        ((x) << 0)
+#	define R300_RS_COL_PTR(x)		        ((x) << 6)
+#	define R300_RS_COL_FMT(x)		        ((x) << 9)
 #	define R300_RS_COL_FMT_RGBA		        0
 #	define R300_RS_COL_FMT_RGB0		        1
 #	define R300_RS_COL_FMT_RGB1		        2
@@ -1184,10 +1189,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #	define R300_RS_COL_FMT_111A		        8
 #	define R300_RS_COL_FMT_1110		        9
 #	define R300_RS_COL_FMT_1111		        10
-#	define R300_RS_SEL_S(x)		                (x << 13)
-#	define R300_RS_SEL_T(x)		                (x << 16)
-#	define R300_RS_SEL_R(x)		                (x << 19)
-#	define R300_RS_SEL_Q(x)		                (x << 22)
+#	define R300_RS_SEL_S(x)		                ((x) << 13)
+#	define R300_RS_SEL_T(x)		                ((x) << 16)
+#	define R300_RS_SEL_R(x)		                ((x) << 19)
+#	define R300_RS_SEL_Q(x)		                ((x) << 22)
 #	define R300_RS_SEL_C0		                0
 #	define R300_RS_SEL_C1		                1
 #	define R300_RS_SEL_C2		                2
@@ -1224,6 +1229,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R500_RS_INST_COL_ADDR_SHIFT			18
 #define R500_RS_INST_TEX_ADJ				(1 << 25)
 #define R500_RS_INST_W_CN				(1 << 26)
+#define R500_RS_INST_TEX_ID(x)				((x) << R500_RS_INST_TEX_ID_SHIFT)
+#define R500_RS_INST_TEX_ADDR(x)			((x) << R500_RS_INST_TEX_ADDR_SHIFT)
+#define R500_RS_INST_COL_ID(x)				((x) << R500_RS_INST_COL_ID_SHIFT)
+#define R500_RS_INST_COL_ADDR(x)			((x) << R500_RS_INST_COL_ADDR_SHIFT)
 
 /* These DWORDs control how vertex data is routed into fragment program
  * registers, after interpolators.
@@ -1239,9 +1248,11 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #	define R300_RS_INST_TEX_ID(x)  		((x) << 0)
 #	define R300_RS_INST_TEX_CN_WRITE 	(1 << 3)
 #	define R300_RS_INST_TEX_ADDR_SHIFT 	6
+#	define R300_RS_INST_TEX_ADDR(x)		((x) << R300_RS_INST_TEX_ADDR_SHIFT)
 #	define R300_RS_INST_COL_ID(x)		((x) << 11)
 #	define R300_RS_INST_COL_CN_WRITE	(1 << 14)
 #	define R300_RS_INST_COL_ADDR_SHIFT	17
+#	define R300_RS_INST_COL_ADDR(x)		((x) << R300_RS_INST_COL_ADDR_SHIFT)
 #	define R300_RS_INST_TEX_ADJ		(1 << 22)
 #	define R300_RS_COL_BIAS_UNUSED_SHIFT    23
 
@@ -1461,6 +1472,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #	define R300_TX_FORMAT_3D		   (1 << 25)
 #	define R300_TX_FORMAT_CUBIC_MAP		   (2 << 25)
 
+#	define R300_TX_FORMAT_GAMMA			(1 << 21)
+
 	/* gap */
 	/* Floating point formats */
 	/* Note - hardware supports both 16 and 32 bit floating point */
@@ -1525,6 +1538,13 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #	define R500_SEL_FILTER4_TC3		 (3 << 18)
 
 #define R300_TX_OFFSET_0                    0x4540
+#define R300_TX_OFFSET_1                    0x4544
+#define R300_TX_OFFSET_2                    0x4548
+#define R300_TX_OFFSET_3                    0x454C
+#define R300_TX_OFFSET_4                    0x4550
+#define R300_TX_OFFSET_5                    0x4554
+#define R300_TX_OFFSET_6                    0x4558
+#define R300_TX_OFFSET_7                    0x455C
 	/* BEGIN: Guess from R200 */
 #       define R300_TXO_ENDIAN_NO_SWAP           (0 << 0)
 #       define R300_TXO_ENDIAN_BYTE_SWAP         (1 << 0)
@@ -1999,6 +2019,11 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R500_FG_ALPHA_VALUE                0x4be0
 #	define R500_FG_ALPHA_VALUE_MASK 0x0000ffff
 
+#define RV530_FG_ZBREG_DEST                 0x4be8
+#	define RV530_FG_ZBREG_DEST_PIPE_SELECT_0             (1 << 0)
+#	define RV530_FG_ZBREG_DEST_PIPE_SELECT_1             (1 << 1)
+#	define RV530_FG_ZBREG_DEST_PIPE_SELECT_ALL           (3 << 0)
+
 /* gap */
 
 /* Fragment program parameters in 7.16 floating point */
@@ -2288,6 +2313,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #	define R300_Z_WRITE_ENABLE		 (1 << 2)
 #	define R300_Z_SIGNED_COMPARE		 (1 << 3)
 #	define R300_STENCIL_FRONT_BACK		 (1 << 4)
+#	define R400_ZSIGNED_MAGNITUDE		 (1 << 5)
+#	define R500_STENCIL_REFMASK_FRONT_BACK	 (1 << 6)
 
 #define R300_ZB_ZSTENCILCNTL                   0x4f04
 	/* functions */
@@ -2419,6 +2446,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /* Z Buffer Clear Value */
 #define R300_ZB_DEPTHCLEARVALUE                  0x4f28
 
+#define R300_ZB_ZMASK_OFFSET                     0x4f30
+#define R300_ZB_ZMASK_PITCH                      0x4f34
+#define R300_ZB_ZMASK_WRINDEX                    0x4f38
+#define R300_ZB_ZMASK_DWORD                      0x4f3c
+#define R300_ZB_ZMASK_RDINDEX                    0x4f40
+
 /* Hierarchical Z Memory Offset */
 #define R300_ZB_HIZ_OFFSET                       0x4f44
 
@@ -2646,6 +2679,24 @@ enum {
 	PVS_SRC_ADDR_MODE_1_SHIFT	= 32,
 };
 
+/* Pack a PVS opcode/destination dword: each argument is masked to its field, then shifted into place. */
+#define PVS_OP_DST_OPERAND(opcode, math_inst, macro_inst, reg_index, reg_writemask, reg_class)	\
+	 ((((opcode) & PVS_DST_OPCODE_MASK) << PVS_DST_OPCODE_SHIFT)	\
+	 | (((math_inst) & PVS_DST_MATH_INST_MASK) << PVS_DST_MATH_INST_SHIFT)	\
+	 | (((macro_inst) & PVS_DST_MACRO_INST_MASK) << PVS_DST_MACRO_INST_SHIFT)	\
+	 | (((reg_index) & PVS_DST_OFFSET_MASK) << PVS_DST_OFFSET_SHIFT)	\
+	 | (((reg_writemask) & 0xf) << PVS_DST_WE_X_SHIFT)	/* X Y Z W */	\
+	 | (((reg_class) & PVS_DST_REG_TYPE_MASK) << PVS_DST_REG_TYPE_SHIFT))
+/* Pack a PVS source operand dword: each argument is masked to its field, then shifted into place. */
+#define PVS_SRC_OPERAND(in_reg_index, comp_x, comp_y, comp_z, comp_w, reg_class, negate)	\
+	((((in_reg_index) & PVS_SRC_OFFSET_MASK) << PVS_SRC_OFFSET_SHIFT)			\
+	 | (((comp_x) & PVS_SRC_SWIZZLE_X_MASK) << PVS_SRC_SWIZZLE_X_SHIFT)			\
+	 | (((comp_y) & PVS_SRC_SWIZZLE_Y_MASK) << PVS_SRC_SWIZZLE_Y_SHIFT)			\
+	 | (((comp_z) & PVS_SRC_SWIZZLE_Z_MASK) << PVS_SRC_SWIZZLE_Z_SHIFT)			\
+	 | (((comp_w) & PVS_SRC_SWIZZLE_W_MASK) << PVS_SRC_SWIZZLE_W_SHIFT)			\
+	 | (((negate) & 0xf) << PVS_SRC_MODIFIER_X_SHIFT)	/* X Y Z W */				\
+	 | (((reg_class) & PVS_SRC_REG_TYPE_MASK) << PVS_SRC_REG_TYPE_SHIFT))
+
 /*\}*/
 
 /* BEGIN: Packet 3 commands */
@@ -2705,7 +2756,7 @@ enum {
 #   define R500_ALPHA_OP_COS				13
 #   define R500_ALPHA_OP_MDH				14
 #   define R500_ALPHA_OP_MDV				15
-#   define R500_ALPHA_ADDRD(x)				(x << 4)
+#   define R500_ALPHA_ADDRD(x)				((x) << 4)
 #   define R500_ALPHA_ADDRD_REL				(1 << 11)
 #  define R500_ALPHA_SEL_A_SHIFT			12
 #   define R500_ALPHA_SEL_A_SRC0			(0 << 12)
@@ -2749,16 +2800,16 @@ enum {
 #   define R500_ALPHA_OMOD_DIV_4			(5 << 26)
 #   define R500_ALPHA_OMOD_DIV_8			(6 << 26)
 #   define R500_ALPHA_OMOD_DISABLE			(7 << 26)
-#   define R500_ALPHA_TARGET(x)				(x << 29)
+#   define R500_ALPHA_TARGET(x)				((x) << 29)
 #   define R500_ALPHA_W_OMASK				(1 << 31)
 #define R500_US_ALU_ALPHA_ADDR_0			0x9800
-#   define R500_ALPHA_ADDR0(x)				(x << 0)
+#   define R500_ALPHA_ADDR0(x)				((x) << 0)
 #   define R500_ALPHA_ADDR0_CONST			(1 << 8)
 #   define R500_ALPHA_ADDR0_REL				(1 << 9)
-#   define R500_ALPHA_ADDR1(x)				(x << 10)
+#   define R500_ALPHA_ADDR1(x)				((x) << 10)
 #   define R500_ALPHA_ADDR1_CONST			(1 << 18)
 #   define R500_ALPHA_ADDR1_REL				(1 << 19)
-#   define R500_ALPHA_ADDR2(x)				(x << 20)
+#   define R500_ALPHA_ADDR2(x)				((x) << 20)
 #   define R500_ALPHA_ADDR2_CONST			(1 << 28)
 #   define R500_ALPHA_ADDR2_REL				(1 << 29)
 #   define R500_ALPHA_SRCP_OP_1_MINUS_2A0		(0 << 30)
@@ -2779,7 +2830,7 @@ enum {
 #   define R500_ALU_RGBA_OP_SOP				(10 << 0)
 #   define R500_ALU_RGBA_OP_MDH				(11 << 0)
 #   define R500_ALU_RGBA_OP_MDV				(12 << 0)
-#   define R500_ALU_RGBA_ADDRD(x)			(x << 4)
+#   define R500_ALU_RGBA_ADDRD(x)			((x) << 4)
 #   define R500_ALU_RGBA_ADDRD_REL			(1 << 11)
 #  define R500_ALU_RGBA_SEL_C_SHIFT			12
 #   define R500_ALU_RGBA_SEL_C_SRC0			(0 << 12)
@@ -2906,16 +2957,16 @@ enum {
 #   define R500_ALU_RGB_OMOD_DIV_4			(5 << 26)
 #   define R500_ALU_RGB_OMOD_DIV_8			(6 << 26)
 #   define R500_ALU_RGB_OMOD_DISABLE			(7 << 26)
-#   define R500_ALU_RGB_TARGET(x)			(x << 29)
+#   define R500_ALU_RGB_TARGET(x)			((x) << 29)
 #   define R500_ALU_RGB_WMASK				(1 << 31)
 #define R500_US_ALU_RGB_ADDR_0				0x9000
-#   define R500_RGB_ADDR0(x)				(x << 0)
+#   define R500_RGB_ADDR0(x)				((x) << 0)
 #   define R500_RGB_ADDR0_CONST				(1 << 8)
 #   define R500_RGB_ADDR0_REL				(1 << 9)
-#   define R500_RGB_ADDR1(x)				(x << 10)
+#   define R500_RGB_ADDR1(x)				((x) << 10)
 #   define R500_RGB_ADDR1_CONST				(1 << 18)
 #   define R500_RGB_ADDR1_REL				(1 << 19)
-#   define R500_RGB_ADDR2(x)				(x << 20)
+#   define R500_RGB_ADDR2(x)				((x) << 20)
 #   define R500_RGB_ADDR2_CONST				(1 << 28)
 #   define R500_RGB_ADDR2_REL				(1 << 29)
 #   define R500_RGB_SRCP_OP_1_MINUS_2RGB0		(0 << 30)
@@ -2970,19 +3021,19 @@ enum {
 
 /* note that these are 8 bit lengths, despite the offsets, at least for R500 */
 #define R500_US_CODE_ADDR				0x4630
-#   define R500_US_CODE_START_ADDR(x)			(x << 0)
-#   define R500_US_CODE_END_ADDR(x)			(x << 16)
+#   define R500_US_CODE_START_ADDR(x)			((x) << 0)
+#   define R500_US_CODE_END_ADDR(x)			((x) << 16)
 #define R500_US_CODE_OFFSET				0x4638
-#   define R500_US_CODE_OFFSET_ADDR(x)			(x << 0)
+#   define R500_US_CODE_OFFSET_ADDR(x)			((x) << 0)
 #define R500_US_CODE_RANGE				0x4634
-#   define R500_US_CODE_RANGE_ADDR(x)			(x << 0)
-#   define R500_US_CODE_RANGE_SIZE(x)			(x << 16)
+#   define R500_US_CODE_RANGE_ADDR(x)			((x) << 0)
+#   define R500_US_CODE_RANGE_SIZE(x)			((x) << 16)
 #define R500_US_CONFIG					0x4600
 #   define R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO		(1 << 1)
 #define R500_US_FC_ADDR_0				0xa000
-#   define R500_FC_BOOL_ADDR(x)				(x << 0)
-#   define R500_FC_INT_ADDR(x)				(x << 8)
-#   define R500_FC_JUMP_ADDR(x)				(x << 16)
+#   define R500_FC_BOOL_ADDR(x)				((x) << 0)
+#   define R500_FC_INT_ADDR(x)				((x) << 8)
+#   define R500_FC_JUMP_ADDR(x)				((x) << 16)
 #   define R500_FC_JUMP_GLOBAL				(1 << 31)
 #define R500_US_FC_BOOL_CONST				0x4620
 #   define R500_FC_KBOOL(x)				(x)
@@ -3003,8 +3054,8 @@ enum {
 #   define R500_FC_A_OP_NONE				(0 << 6)
 #   define R500_FC_A_OP_POP				(1 << 6)
 #   define R500_FC_A_OP_PUSH				(2 << 6)
-#   define R500_FC_JUMP_FUNC(x)				(x << 8)
-#   define R500_FC_B_POP_CNT(x)				(x << 16)
+#   define R500_FC_JUMP_FUNC(x)				((x) << 8)
+#   define R500_FC_B_POP_CNT(x)				((x) << 16)
 #   define R500_FC_B_OP0_NONE				(0 << 24)
 #   define R500_FC_B_OP0_DECR				(1 << 24)
 #   define R500_FC_B_OP0_INCR				(2 << 24)
@@ -3013,14 +3064,14 @@ enum {
 #   define R500_FC_B_OP1_INCR				(2 << 26)
 #   define R500_FC_IGNORE_UNCOVERED			(1 << 28)
 #define R500_US_FC_INT_CONST_0				0x4c00
-#   define R500_FC_INT_CONST_KR(x)			(x << 0)
-#   define R500_FC_INT_CONST_KG(x)			(x << 8)
-#   define R500_FC_INT_CONST_KB(x)			(x << 16)
+#   define R500_FC_INT_CONST_KR(x)			((x) << 0)
+#   define R500_FC_INT_CONST_KG(x)			((x) << 8)
+#   define R500_FC_INT_CONST_KB(x)			((x) << 16)
 /* _0 through _15 */
 #define R500_US_FORMAT0_0				0x4640
-#   define R500_FORMAT_TXWIDTH(x)			(x << 0)
-#   define R500_FORMAT_TXHEIGHT(x)			(x << 11)
-#   define R500_FORMAT_TXDEPTH(x)			(x << 22)
+#   define R500_FORMAT_TXWIDTH(x)			((x) << 0)
+#   define R500_FORMAT_TXHEIGHT(x)			((x) << 11)
+#   define R500_FORMAT_TXDEPTH(x)			((x) << 22)
 /* _0 through _3 */
 #define R500_US_OUT_FMT_0				0x46a4
 #   define R500_OUT_FMT_C4_8				(0 << 0)
@@ -3061,12 +3112,12 @@ enum {
 #   define R500_C3_SEL_R				(1 << 14)
 #   define R500_C3_SEL_G				(2 << 14)
 #   define R500_C3_SEL_B				(3 << 14)
-#   define R500_OUT_SIGN(x)				(x << 16)
+#   define R500_OUT_SIGN(x)				((x) << 16)
 #   define R500_ROUND_ADJ				(1 << 20)
 #define R500_US_PIXSIZE					0x4604
 #   define R500_PIX_SIZE(x)				(x)
 #define R500_US_TEX_ADDR_0				0x9800
-#   define R500_TEX_SRC_ADDR(x)				(x << 0)
+#   define R500_TEX_SRC_ADDR(x)				((x) << 0)
 #   define R500_TEX_SRC_ADDR_REL			(1 << 7)
 #   define R500_TEX_SRC_S_SWIZ_R			(0 << 8)
 #   define R500_TEX_SRC_S_SWIZ_G			(1 << 8)
@@ -3084,7 +3135,7 @@ enum {
 #   define R500_TEX_SRC_Q_SWIZ_G			(1 << 14)
 #   define R500_TEX_SRC_Q_SWIZ_B			(2 << 14)
 #   define R500_TEX_SRC_Q_SWIZ_A			(3 << 14)
-#   define R500_TEX_DST_ADDR(x)				(x << 16)
+#   define R500_TEX_DST_ADDR(x)				((x) << 16)
 #   define R500_TEX_DST_ADDR_REL			(1 << 23)
 #   define R500_TEX_DST_R_SWIZ_R			(0 << 24)
 #   define R500_TEX_DST_R_SWIZ_G			(1 << 24)
@@ -3103,7 +3154,7 @@ enum {
 #   define R500_TEX_DST_A_SWIZ_B			(2 << 30)
 #   define R500_TEX_DST_A_SWIZ_A			(3 << 30)
 #define R500_US_TEX_ADDR_DXDY_0				0xa000
-#   define R500_DX_ADDR(x)				(x << 0)
+#   define R500_DX_ADDR(x)				((x) << 0)
 #   define R500_DX_ADDR_REL				(1 << 7)
 #   define R500_DX_S_SWIZ_R				(0 << 8)
 #   define R500_DX_S_SWIZ_G				(1 << 8)
@@ -3121,7 +3172,7 @@ enum {
 #   define R500_DX_Q_SWIZ_G				(1 << 14)
 #   define R500_DX_Q_SWIZ_B				(2 << 14)
 #   define R500_DX_Q_SWIZ_A				(3 << 14)
-#   define R500_DY_ADDR(x)				(x << 16)
+#   define R500_DY_ADDR(x)				((x) << 16)
 #   define R500_DY_ADDR_REL				(1 << 17)
 #   define R500_DY_S_SWIZ_R				(0 << 24)
 #   define R500_DY_S_SWIZ_G				(1 << 24)
@@ -3140,7 +3191,7 @@ enum {
 #   define R500_DY_Q_SWIZ_B				(2 << 30)
 #   define R500_DY_Q_SWIZ_A				(3 << 30)
 #define R500_US_TEX_INST_0				0x9000
-#   define R500_TEX_ID(x)				(x << 16)
+#   define R500_TEX_ID(x)				((x) << 16)
 #   define R500_TEX_INST_NOP				(0 << 22)
 #   define R500_TEX_INST_LD				(1 << 22)
 #   define R500_TEX_INST_TEXKILL			(2 << 22)
@@ -3159,6 +3210,9 @@ enum {
 #   define R300_W_SRC_RAS				(1 << 2)
 
 
+/* Packet0 field ordering to write all values to the same reg */
+#define RADEON_ONE_REG_WR        (1 << 15)
+
 /* Draw a primitive from vertex data in arrays loaded via 3D_LOAD_VBPNTR.
  * Two parameter dwords:
  * 0. VAP_VTX_FMT: The first parameter is not written to hardware
diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c
index f9266e44c1..b5ddfdc9f8 100644
--- a/src/mesa/drivers/dri/r300/r300_render.c
+++ b/src/mesa/drivers/dri/r300/r300_render.c
@@ -50,6 +50,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
  * no bugs...
  */
 
+#include "r300_render.h"
+
 #include "main/glheader.h"
 #include "main/state.h"
 #include "main/imports.h"
@@ -62,20 +64,19 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "swrast/swrast.h"
 #include "swrast_setup/swrast_setup.h"
 #include "vbo/vbo.h"
+#include "vbo/vbo_split.h"
 #include "tnl/tnl.h"
 #include "tnl/t_vp_build.h"
 #include "radeon_reg.h"
 #include "radeon_macros.h"
-#include "radeon_ioctl.h"
-#include "radeon_state.h"
 #include "r300_context.h"
 #include "r300_ioctl.h"
 #include "r300_state.h"
 #include "r300_reg.h"
 #include "r300_tex.h"
 #include "r300_emit.h"
-#include "r300_fragprog.h"
-extern int future_hw_tcl_on;
+#include "r300_fragprog_common.h"
+#include "r300_swtcl.h"
 
 /**
  * \brief Convert a OpenGL primitive type into a R300 primitive type.
@@ -172,96 +173,167 @@ int r300NumVerts(r300ContextPtr rmesa, int num_verts, int prim)
 	return num_verts - verts_off;
 }
 
-static void r300EmitElts(GLcontext * ctx, void *elts, unsigned long n_elts)
+static void r300FireEB(r300ContextPtr rmesa, int vertex_count, int type, int offset)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct r300_dma_region *rvb = &rmesa->state.elt_dma;
-	void *out;
-
-	if (r300IsGartMemory(rmesa, elts, n_elts * 4)) {
-		rvb->address = rmesa->radeon.radeonScreen->gartTextures.map;
-		rvb->start = ((char *)elts) - rvb->address;
-		rvb->aos_offset =
-		    rmesa->radeon.radeonScreen->gart_texture_offset +
-		    rvb->start;
-		return;
-	} else if (r300IsGartMemory(rmesa, elts, 1)) {
-		WARN_ONCE("Pointer not within GART memory!\n");
-		_mesa_exit(-1);
+	BATCH_LOCALS(&rmesa->radeon);
+	int size;
+
+	/* offset is in indices */
+	BEGIN_BATCH(10);
+	OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_INDX_2, 0);
+	if (rmesa->ind_buf.is_32bit) {
+		/* convert to bytes */
+		offset *= 4;
+		size = vertex_count;
+		OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_INDICES |
+		  (vertex_count << 16) | type |
+		  R300_VAP_VF_CNTL__INDEX_SIZE_32bit);
+	} else {
+		/* convert to bytes */
+		offset *= 2;
+		size = (vertex_count + 1) >> 1;
+		OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_INDICES |
+		   (vertex_count << 16) | type);
 	}
 
-	r300AllocDmaRegion(rmesa, rvb, n_elts * 4, 4);
-	rvb->aos_offset = GET_START(rvb);
-
-	out = rvb->address + rvb->start;
-	memcpy(out, elts, n_elts * 4);
-}
-
-static void r300FireEB(r300ContextPtr rmesa, unsigned long addr,
-		       int vertex_count, int type)
-{
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
-
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_DRAW_INDX_2, 0), 0);
-	e32(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (vertex_count << 16) | type | R300_VAP_VF_CNTL__INDEX_SIZE_32bit);
-
-	start_packet3(CP_PACKET3(R300_PACKET3_INDX_BUFFER, 2), 2);
-	e32(R300_INDX_BUFFER_ONE_REG_WR | (0 << R300_INDX_BUFFER_SKIP_SHIFT) |
-	    (R300_VAP_PORT_IDX0 >> 2));
-	e32(addr);
-	e32(vertex_count);
+	if (!rmesa->radeon.radeonScreen->kernel_mm) {
+		OUT_BATCH_PACKET3(R300_PACKET3_INDX_BUFFER, 2);
+		OUT_BATCH(R300_INDX_BUFFER_ONE_REG_WR | (0 << R300_INDX_BUFFER_SKIP_SHIFT) |
+				 (R300_VAP_PORT_IDX0 >> 2));
+		OUT_BATCH_RELOC(0, rmesa->ind_buf.bo, rmesa->ind_buf.bo_offset + offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
+		OUT_BATCH(size);
+	} else {
+		OUT_BATCH_PACKET3(R300_PACKET3_INDX_BUFFER, 2);
+		OUT_BATCH(R300_INDX_BUFFER_ONE_REG_WR | (0 << R300_INDX_BUFFER_SKIP_SHIFT) |
+				 (R300_VAP_PORT_IDX0 >> 2));
+		OUT_BATCH(rmesa->ind_buf.bo_offset + offset);
+		OUT_BATCH(size);
+		radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+				      rmesa->ind_buf.bo, RADEON_GEM_DOMAIN_GTT, 0, 0);
+	}
+	END_BATCH();
 }
 
 static void r300EmitAOS(r300ContextPtr rmesa, GLuint nr, GLuint offset)
 {
+	BATCH_LOCALS(&rmesa->radeon);
+	uint32_t voffset;
 	int sz = 1 + (nr >> 1) * 3 + (nr & 1) * 2;
 	int i;
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
 
-	if (RADEON_DEBUG & DEBUG_VERTS)
+	if (RADEON_DEBUG & RADEON_VERTS)
 		fprintf(stderr, "%s: nr=%d, ofs=0x%08x\n", __FUNCTION__, nr,
 			offset);
 
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1), sz - 1);
-	e32(nr);
+	if (!rmesa->radeon.radeonScreen->kernel_mm) {
+		BEGIN_BATCH(sz+2+(nr * 2));
+		OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1);
+		OUT_BATCH(nr);
+
+		for (i = 0; i + 1 < nr; i += 2) {
+			OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
+				  (rmesa->radeon.tcl.aos[i].stride << 8) |
+				  (rmesa->radeon.tcl.aos[i + 1].components << 16) |
+				  (rmesa->radeon.tcl.aos[i + 1].stride << 24));
+
+			voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+			OUT_BATCH_RELOC(voffset,
+					rmesa->radeon.tcl.aos[i].bo,
+					voffset,
+					RADEON_GEM_DOMAIN_GTT,
+					0, 0);
+			voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+			  offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+			OUT_BATCH_RELOC(voffset,
+					rmesa->radeon.tcl.aos[i+1].bo,
+					voffset,
+					RADEON_GEM_DOMAIN_GTT,
+					0, 0);
+		}
 
-	for (i = 0; i + 1 < nr; i += 2) {
-		e32((rmesa->state.aos[i].aos_size << 0) |
-		    (rmesa->state.aos[i].aos_stride << 8) |
-		    (rmesa->state.aos[i + 1].aos_size << 16) |
-		    (rmesa->state.aos[i + 1].aos_stride << 24));
+		if (nr & 1) {
+			OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
+				  (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
+			voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+			OUT_BATCH_RELOC(voffset,
+					rmesa->radeon.tcl.aos[nr - 1].bo,
+					voffset,
+					RADEON_GEM_DOMAIN_GTT,
+					0, 0);
+		}
+		END_BATCH();
+	} else {
 
-		e32(rmesa->state.aos[i].aos_offset + offset * 4 * rmesa->state.aos[i].aos_stride);
-		e32(rmesa->state.aos[i + 1].aos_offset + offset * 4 * rmesa->state.aos[i + 1].aos_stride);
-	}
+		BEGIN_BATCH(sz+2+(nr * 2));
+		OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1);
+		OUT_BATCH(nr);
+
+		for (i = 0; i + 1 < nr; i += 2) {
+			OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
+				  (rmesa->radeon.tcl.aos[i].stride << 8) |
+				  (rmesa->radeon.tcl.aos[i + 1].components << 16) |
+				  (rmesa->radeon.tcl.aos[i + 1].stride << 24));
+
+			voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+			OUT_BATCH(voffset);
+			voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+			OUT_BATCH(voffset);
+		}
 
-	if (nr & 1) {
-		e32((rmesa->state.aos[nr - 1].aos_size << 0) |
-		    (rmesa->state.aos[nr - 1].aos_stride << 8));
-		e32(rmesa->state.aos[nr - 1].aos_offset + offset * 4 * rmesa->state.aos[nr - 1].aos_stride);
+		if (nr & 1) {
+			OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
+			  (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
+			voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+			OUT_BATCH(voffset);
+		}
+		for (i = 0; i + 1 < nr; i += 2) {
+			voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+					      rmesa->radeon.tcl.aos[i+0].bo,
+					      RADEON_GEM_DOMAIN_GTT,
+					      0, 0);
+			voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+					      rmesa->radeon.tcl.aos[i+1].bo,
+					      RADEON_GEM_DOMAIN_GTT,
+					      0, 0);
+		}
+		if (nr & 1) {
+			voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+					      rmesa->radeon.tcl.aos[nr-1].bo,
+					      RADEON_GEM_DOMAIN_GTT,
+					      0, 0);
+		}
+		END_BATCH();
 	}
+
 }
 
 static void r300FireAOS(r300ContextPtr rmesa, int vertex_count, int type)
 {
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
+	BATCH_LOCALS(&rmesa->radeon);
 
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0), 0);
-	e32(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (vertex_count << 16) | type);
+        r300_emit_scissor(rmesa->radeon.glCtx);
+	BEGIN_BATCH(3);
+	OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
+	OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (vertex_count << 16) | type); /* vertex count is packed from bit 16 up, OR'd with the primitive type */
+	END_BATCH();
 }
 
-static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
-				   int start, int end, int prim)
+void r300RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim)
 {
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&rmesa->radeon);
 	int type, num_verts;
-	TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *vb = &tnl->vb;
 
 	type = r300PrimitiveType(rmesa, prim);
 	num_verts = r300NumVerts(rmesa, end - start, prim);
@@ -269,190 +341,161 @@ static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
 	if (type < 0 || num_verts <= 0)
 		return;
 
-	if (vb->Elts) {
-		if (num_verts > 65535) {
-			/* not implemented yet */
-			WARN_ONCE("Too many elts\n");
+	if (rmesa->ind_buf.bo) {
+		GLuint first, incr, offset = 0;
+
+		if (!split_prim_inplace(prim & PRIM_MODE_MASK, &first, &incr) &&
+			num_verts > 65500) {
+			WARN_ONCE("Fixme: can't handle spliting prim %d\n", prim);
 			return;
 		}
-		/* Note: The following is incorrect, but it's the best I can do
-		 * without a major refactoring of how DMA memory is handled.
-		 * The problem: Ensuring that both vertex arrays *and* index
-		 * arrays are at the right position, and then ensuring that
-		 * the LOAD_VBPNTR, DRAW_INDX and INDX_BUFFER packets are emitted
-		 * at once.
-		 *
-		 * So why is the following incorrect? Well, it seems like
-		 * allocating the index array might actually evict the vertex
-		 * arrays. *sigh*
-		 */
-		r300EmitElts(ctx, vb->Elts, num_verts);
-		r300EmitAOS(rmesa, rmesa->state.aos_count, start);
-		r300FireEB(rmesa, rmesa->state.elt_dma.aos_offset, num_verts, type);
-	} else {
-		r300EmitAOS(rmesa, rmesa->state.aos_count, start);
-		r300FireAOS(rmesa, num_verts, type);
-	}
-}
 
-static GLboolean r300RunRender(GLcontext * ctx,
-			       struct tnl_pipeline_stage *stage)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	int i;
-	TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *vb = &tnl->vb;
-
-
-	if (RADEON_DEBUG & DEBUG_PRIMS)
-		fprintf(stderr, "%s\n", __FUNCTION__);
 
-	r300UpdateShaders(rmesa);
-	if (r300EmitArrays(ctx))
-		return GL_TRUE;
-
-	r300UpdateShaderStates(rmesa);
-
-	r300EmitCacheFlush(rmesa);
-	r300EmitState(rmesa);
-
-	for (i = 0; i < vb->PrimitiveCount; i++) {
-		GLuint prim = _tnl_translate_prim(&vb->Primitive[i]);
-		GLuint start = vb->Primitive[i].start;
-		GLuint end = vb->Primitive[i].start + vb->Primitive[i].count;
-		r300RunRenderPrimitive(rmesa, ctx, start, end, prim);
-	}
-
-	r300EmitCacheFlush(rmesa);
+		r300EmitAOS(rmesa, rmesa->radeon.tcl.aos_count, 0);
+		if (rmesa->radeon.radeonScreen->kernel_mm) {
+			BEGIN_BATCH_NO_AUTOSTATE(2);
+			OUT_BATCH_REGSEQ(R300_VAP_VF_MAX_VTX_INDX, 1);
+			OUT_BATCH(rmesa->radeon.tcl.aos[0].count);
+			END_BATCH();
+		}
 
-#ifdef USER_BUFFERS
-	r300UseArrays(ctx);
-#endif
+		r300_emit_scissor(rmesa->radeon.glCtx);
+		while (num_verts > 0) {
+			int nr;
+			int align;
+
+			nr = MIN2(num_verts, 65535);
+			nr -= (nr - first) % incr;
+
+			/* get alignment for IB correct */
+			if (nr != num_verts) {
+				do {
+				    align = nr * (rmesa->ind_buf.is_32bit ? 4 : 2);
+				    if (align % 4)
+					nr -= incr;
+				} while(align % 4);
+				if (nr <= 0) {
+					WARN_ONCE("did the impossible happen? we never aligned nr to dword\n");
+					return;
+				}
+					
+			}
+			r300FireEB(rmesa, nr, type, offset);
 
-	r300ReleaseArrays(ctx);
+			num_verts -= nr;
+			offset += nr;
+		}
 
-	return GL_FALSE;
-}
+	} else {
+		GLuint first, incr, offset = 0;
 
-#define FALLBACK_IF(expr)						\
-	do {								\
-		if (expr) {						\
-			if (1 || RADEON_DEBUG & DEBUG_FALLBACKS)	\
-				WARN_ONCE("Software fallback:%s\n",	\
-					  #expr);			\
-			return R300_FALLBACK_RAST;			\
-		}							\
-	} while(0)
-
-static int r300Fallback(GLcontext * ctx)
-{
-	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	/* Do we need to use new-style shaders?
-	 * Also is there a better way to do this? */
-	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-		struct r500_fragment_program *fp = (struct r500_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
-		if (fp) {
-			if (!fp->translated) {
-				r500TranslateFragmentShader(r300, fp);
-				FALLBACK_IF(!fp->translated);
-			}
+		if (!split_prim_inplace(prim & PRIM_MODE_MASK, &first, &incr) &&
+			num_verts > 65535) {
+			WARN_ONCE("Fixme: can't handle spliting prim %d\n", prim);
+			return;
 		}
-	} else {
-		struct r300_fragment_program *fp = (struct r300_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
-		if (fp) {
-			if (!fp->translated) {
-				r300TranslateFragmentShader(r300, fp);
-				FALLBACK_IF(!fp->translated);
-			}
+		r300_emit_scissor(rmesa->radeon.glCtx);
+		while (num_verts > 0) {
+			int nr;
+			nr = MIN2(num_verts, 65535);
+			nr -= (nr - first) % incr;
+			r300EmitAOS(rmesa, rmesa->radeon.tcl.aos_count, start + offset);
+			r300FireAOS(rmesa, nr, type);
+			num_verts -= nr;
+			offset += nr;
 		}
 	}
-
-	FALLBACK_IF(ctx->RenderMode != GL_RENDER);
-
-	FALLBACK_IF(ctx->Stencil._TestTwoSide
-		    && (ctx->Stencil.Ref[0] != ctx->Stencil.Ref[1]
-			|| ctx->Stencil.ValueMask[0] !=
-			ctx->Stencil.ValueMask[1]
-			|| ctx->Stencil.WriteMask[0] !=
-			ctx->Stencil.WriteMask[1]));
-
-	if (ctx->Extensions.NV_point_sprite || ctx->Extensions.ARB_point_sprite)
-		FALLBACK_IF(ctx->Point.PointSprite);
-
-	if (!r300->disable_lowimpact_fallback) {
-		FALLBACK_IF(ctx->Polygon.StippleFlag);
-		FALLBACK_IF(ctx->Multisample._Enabled);
-		FALLBACK_IF(ctx->Line.StippleFlag);
-		FALLBACK_IF(ctx->Line.SmoothFlag);
-		FALLBACK_IF(ctx->Point.SmoothFlag);
-	}
-
-	return R300_FALLBACK_NONE;
+	COMMIT_BATCH();
 }
 
-static GLboolean r300RunNonTCLRender(GLcontext * ctx,
-				     struct tnl_pipeline_stage *stage)
+static const char *getFallbackString(uint32_t bit)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-
-	if (RADEON_DEBUG & DEBUG_PRIMS)
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	if (r300Fallback(ctx) >= R300_FALLBACK_RAST)
-		return GL_TRUE;
-
-	if (!(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
- 	        return GL_TRUE;
-
-	return r300RunRender(ctx, stage);
+	switch (bit) { /* matches a single R300_FALLBACK_* value; returns a static description string */
+		case R300_FALLBACK_VERTEX_PROGRAM :
+			return "vertex program";
+		case R300_FALLBACK_LINE_SMOOTH:
+			return "smooth lines";
+		case R300_FALLBACK_POINT_SMOOTH:
+			return "smooth points";
+		case R300_FALLBACK_POLYGON_SMOOTH:
+			return "smooth polygons";
+		case R300_FALLBACK_LINE_STIPPLE:
+			return "line stipple";
+		case R300_FALLBACK_POLYGON_STIPPLE:
+			return "polygon stipple";
+		case R300_FALLBACK_STENCIL_TWOSIDE:
+			return "two-sided stencil";
+		case R300_FALLBACK_RENDER_MODE:
+			return "render mode != GL_RENDER";
+		case R300_FALLBACK_FRAGMENT_PROGRAM:
+			return "fragment program";
+		case R300_FALLBACK_AOS_LIMIT:
+			return "aos limit";
+		case R300_FALLBACK_INVALID_BUFFERS:
+			return "invalid buffers";
+		default: /* several bits OR'd together, or a bit with no case above */
+			return "unknown";
+	}
 }
 
-static GLboolean r300RunTCLRender(GLcontext * ctx,
-				  struct tnl_pipeline_stage *stage)
+void r300SwitchFallback(GLcontext *ctx, uint32_t bit, GLboolean mode)
 {
+	TNLcontext *tnl = TNL_CONTEXT(ctx);
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct r300_vertex_program *vp;
+	uint32_t old_fallback = rmesa->fallback;
+	static uint32_t fallback_warn = 0;
+	/* mode != 0 sets `bit` in rmesa->fallback, mode == 0 clears it; old_fallback is the mask before this call. */
+	if (mode) {
+		if ((fallback_warn & bit) == 0) {
+			if (RADEON_DEBUG & RADEON_FALLBACKS)
+				_mesa_fprintf(stderr, "WARNING! Falling back to software for %s\n", getFallbackString(bit));
+			fallback_warn |= bit;
+		}
+		rmesa->fallback |= bit;
+
+		/* update only if we change from no tcl fallbacks to some tcl fallbacks */
+		if (rmesa->options.hw_tcl_enabled) {
+			if (((old_fallback & R300_TCL_FALLBACK_MASK) == 0) &&
+				((bit & R300_TCL_FALLBACK_MASK) > 0)) {
+				R300_STATECHANGE(rmesa, vap_cntl_status);
+				rmesa->hw.vap_cntl_status.cmd[1] |= R300_VAP_TCL_BYPASS;
+			}
+		}
 
-	hw_tcl_on = future_hw_tcl_on;
+		/* update only if we change from no raster fallbacks to some raster fallbacks */
+		if (((old_fallback & R300_RASTER_FALLBACK_MASK) == 0) &&
+			((bit & R300_RASTER_FALLBACK_MASK) > 0)) {
 
-	if (RADEON_DEBUG & DEBUG_PRIMS)
-		fprintf(stderr, "%s\n", __FUNCTION__);
+			radeon_firevertices(&rmesa->radeon);
+			rmesa->radeon.swtcl.RenderIndex = ~0;
+			_swsetup_Wakeup( ctx );
+		}
+	} else {
+		rmesa->fallback &= ~bit;
 
-	if (hw_tcl_on == GL_FALSE)
-		return GL_TRUE;
+		/* update only if `bit` was the last tcl fallback set (must test the TCL mask, not the raster one) */
+		if (rmesa->options.hw_tcl_enabled) {
+			if ((old_fallback & R300_TCL_FALLBACK_MASK) == bit) {
+				R300_STATECHANGE(rmesa, vap_cntl_status);
+				rmesa->hw.vap_cntl_status.cmd[1] &= ~R300_VAP_TCL_BYPASS;
+			}
+		}
 
-	if (r300Fallback(ctx) >= R300_FALLBACK_TCL) {
-		hw_tcl_on = GL_FALSE;
-		return GL_TRUE;
-	}
+		/* update only if `bit` was the last raster fallback set */
+		if ((old_fallback & R300_RASTER_FALLBACK_MASK) == bit) {
+			_swrast_flush( ctx );
 
-	r300UpdateShaders(rmesa);
+			tnl->Driver.Render.Start = r300RenderStart;
+			tnl->Driver.Render.Finish = r300RenderFinish;
+			tnl->Driver.Render.PrimitiveNotify = r300RenderPrimitive;
+			tnl->Driver.Render.ResetLineStipple = r300ResetLineStipple;
+			tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
+			tnl->Driver.Render.CopyPV = _tnl_copy_pv;
+			tnl->Driver.Render.Interp = _tnl_interp;
 
-	vp = (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
-	if (vp->native == GL_FALSE) {
-		hw_tcl_on = GL_FALSE;
-		return GL_TRUE;
+			_tnl_invalidate_vertex_state( ctx, ~0 );
+			_tnl_invalidate_vertices( ctx, ~0 );
+		}
 	}
 
-	return r300RunRender(ctx, stage);
 }
-
-const struct tnl_pipeline_stage _r300_render_stage = {
-	"r300 Hardware Rasterization",
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	r300RunNonTCLRender
-};
-
-const struct tnl_pipeline_stage _r300_tcl_stage = {
-	"r300 Hardware Transform, Clipping and Lighting",
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	r300RunTCLRender
-};
diff --git a/src/mesa/drivers/dri/r300/r300_render.h b/src/mesa/drivers/dri/r300/r300_render.h
new file mode 100644
index 0000000000..ec785474a6
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_render.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __R300_RENDER_H__
+#define __R300_RENDER_H__
+
+#include "main/mtypes.h"
+
+#define R300_FALLBACK_VERTEX_PROGRAM    (1 << 0)
+#define R300_TCL_FALLBACK_MASK           0x0000ffff /* low 16 bits: TCL fallbacks */
+
+#define R300_FALLBACK_LINE_SMOOTH       (1 << 16)
+#define R300_FALLBACK_POINT_SMOOTH      (1 << 17)
+#define R300_FALLBACK_POLYGON_SMOOTH    (1 << 18)
+#define R300_FALLBACK_LINE_STIPPLE      (1 << 19)
+#define R300_FALLBACK_POLYGON_STIPPLE   (1 << 20)
+#define R300_FALLBACK_STENCIL_TWOSIDE   (1 << 21)
+#define R300_FALLBACK_RENDER_MODE       (1 << 22)
+#define R300_FALLBACK_FRAGMENT_PROGRAM  (1 << 23)
+#define R300_FALLBACK_AOS_LIMIT         (1 << 30)
+#define R300_FALLBACK_INVALID_BUFFERS   (1u << 31) /* unsigned: 1 << 31 overflows a 32-bit int */
+#define R300_RASTER_FALLBACK_MASK        0xffff0000 /* high 16 bits: rasterization fallbacks */
+
+#define MASK_XYZW (R300_WRITE_ENA_X | R300_WRITE_ENA_Y | R300_WRITE_ENA_Z | R300_WRITE_ENA_W)
+#define MASK_X R300_WRITE_ENA_X
+#define MASK_Y R300_WRITE_ENA_Y
+#define MASK_Z R300_WRITE_ENA_Z
+#define MASK_W R300_WRITE_ENA_W
+
+#if SWIZZLE_X != R300_INPUT_ROUTE_SELECT_X || \
+    SWIZZLE_Y != R300_INPUT_ROUTE_SELECT_Y || \
+    SWIZZLE_Z != R300_INPUT_ROUTE_SELECT_Z || \
+    SWIZZLE_W != R300_INPUT_ROUTE_SELECT_W || \
+    SWIZZLE_ZERO != R300_INPUT_ROUTE_SELECT_ZERO || \
+    SWIZZLE_ONE != R300_INPUT_ROUTE_SELECT_ONE
+#error Cannot change these!
+#endif
+
+extern const struct tnl_pipeline_stage _r300_render_stage;
+
+extern void r300SwitchFallback(GLcontext *ctx, uint32_t bit, GLboolean mode);
+
+extern void r300RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim);
+
+#endif
diff --git a/src/mesa/drivers/dri/r300/r300_shader.c b/src/mesa/drivers/dri/r300/r300_shader.c
index f30fd986e0..a4f9db13ec 100644
--- a/src/mesa/drivers/dri/r300/r300_shader.c
+++ b/src/mesa/drivers/dri/r300/r300_shader.c
@@ -1,47 +1,79 @@
+/*
+ * Copyright 2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
 
 #include "main/glheader.h"
 
 #include "shader/program.h"
 #include "tnl/tnl.h"
 #include "r300_context.h"
-#include "r300_fragprog.h"
+#include "r300_fragprog_common.h"
+
+static void freeFragProgCache(GLcontext *ctx, struct r300_fragment_program_cont *cache)
+{
+	struct r300_fragment_program *cur, *following;
+
+	/* Walk cache->progs, saving ->next before each node is freed. */
+	for (cur = cache->progs; cur != NULL; cur = following) {
+		following = cur->next;
+		rc_constants_destroy(&cur->code.constants);
+		_mesa_free(cur);
+	}
+}
+
+static void freeVertProgCache(GLcontext *ctx, struct r300_vertex_program_cont *cache)
+{
+	struct r300_vertex_program *cur, *following;
+
+	/* For each entry of cache->progs: destroy constants, drop its Base reference, free the node. */
+	for (cur = cache->progs; cur != NULL; cur = following) {
+		following = cur->next;
+		rc_constants_destroy(&cur->code.constants);
+		_mesa_reference_vertprog(ctx, &cur->Base, NULL);
+		_mesa_free(cur);
+	}
+}
 
 static struct gl_program *r300NewProgram(GLcontext * ctx, GLenum target,
 					 GLuint id)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	struct r300_vertex_program_cont *vp;
-	struct r300_fragment_program *r300_fp;
-	struct r500_fragment_program *r500_fp;
+	struct r300_fragment_program_cont *fp;
 
 	switch (target) {
 	case GL_VERTEX_STATE_PROGRAM_NV:
 	case GL_VERTEX_PROGRAM_ARB:
 		vp = CALLOC_STRUCT(r300_vertex_program_cont);
-		return _mesa_init_vertex_program(ctx, &vp->mesa_program,
-						 target, id);
-	case GL_FRAGMENT_PROGRAM_ARB:
-		if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-			r500_fp = CALLOC_STRUCT(r500_fragment_program);
-			r500_fp->ctx = ctx;
-			return _mesa_init_fragment_program(ctx, &r500_fp->mesa_program,
-							   target, id);
-		} else {
-			r300_fp = CALLOC_STRUCT(r300_fragment_program);
-			return _mesa_init_fragment_program(ctx, &r300_fp->mesa_program,
-							   target, id);
-		}
+		return _mesa_init_vertex_program(ctx, &vp->mesa_program, target, id);
 
 	case GL_FRAGMENT_PROGRAM_NV:
-		if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-			r500_fp = CALLOC_STRUCT(r500_fragment_program);
-			return _mesa_init_fragment_program(ctx, &r500_fp->mesa_program,
-							   target, id);
-		} else {
-			r300_fp = CALLOC_STRUCT(r300_fragment_program);
-			return _mesa_init_fragment_program(ctx, &r300_fp->mesa_program,
-							   target, id);
-		}
+	case GL_FRAGMENT_PROGRAM_ARB:
+		fp = CALLOC_STRUCT(r300_fragment_program_cont);
+		return _mesa_init_fragment_program(ctx, &fp->Base, target, id);
+
 	default:
 		_mesa_problem(ctx, "Bad target in r300NewProgram");
 	}
@@ -51,26 +83,35 @@ static struct gl_program *r300NewProgram(GLcontext * ctx, GLenum target,
 
 static void r300DeleteProgram(GLcontext * ctx, struct gl_program *prog)
 {
+	struct r300_vertex_program_cont *vp = (struct r300_vertex_program_cont *)prog;
+	struct r300_fragment_program_cont *fp = (struct r300_fragment_program_cont *)prog;
+	/* Only the cast matching prog->Target is used; the driver-side progs list is freed before the Mesa object. */
+	switch (prog->Target) {
+		case GL_VERTEX_PROGRAM_ARB:
+			freeVertProgCache(ctx, vp);
+			break;
+		case GL_FRAGMENT_PROGRAM_ARB:
+			freeFragProgCache(ctx, fp);
+			break;
+	}
+	/* NOTE(review): GL_VERTEX_STATE_PROGRAM_NV / GL_FRAGMENT_PROGRAM_NV (accepted by r300NewProgram) skip the cache free - confirm their lists stay empty. */
 	_mesa_delete_program(ctx, prog);
 }
 
 static void
 r300ProgramStringNotify(GLcontext * ctx, GLenum target, struct gl_program *prog)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct r300_vertex_program_cont *vp = (void *)prog;
-	struct r300_fragment_program *r300_fp = (struct r300_fragment_program *)prog;
-	struct r500_fragment_program *r500_fp = (struct r500_fragment_program *)prog;
+	struct r300_vertex_program_cont *vp = (struct r300_vertex_program_cont *)prog;
+	struct r300_fragment_program_cont *fp = (struct r300_fragment_program_cont *)prog;
 
 	switch (target) {
 	case GL_VERTEX_PROGRAM_ARB:
+		freeVertProgCache(ctx, vp);
 		vp->progs = NULL;
 		break;
 	case GL_FRAGMENT_PROGRAM_ARB:
-		if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-			r500_fp->translated = GL_FALSE;
-		else
-			r300_fp->translated = GL_FALSE;
+		freeFragProgCache(ctx, fp);
+		fp->progs = NULL;
 		break;
 	}
 
@@ -81,7 +122,15 @@ r300ProgramStringNotify(GLcontext * ctx, GLenum target, struct gl_program *prog)
 static GLboolean
 r300IsProgramNative(GLcontext * ctx, GLenum target, struct gl_program *prog)
 {
-	return GL_TRUE;
+	if (target == GL_FRAGMENT_PROGRAM_ARB) {
+		struct r300_fragment_program *fp = r300SelectAndTranslateFragmentShader(ctx);
+		/* Native iff no error flag is set. NOTE(review): `prog` is not consulted - confirm the selected shader is the one queried. */
+		return !fp->error;
+	} else {
+		struct r300_vertex_program *vp = r300SelectAndTranslateVertexShader(ctx);
+		/* Every target other than GL_FRAGMENT_PROGRAM_ARB takes this vertex-shader path. */
+		return !vp->error;
+	}
 }
 
 void r300InitShaderFuncs(struct dd_function_table *functions)
diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c
index a63dbac343..3060f49aaf 100644
--- a/src/mesa/drivers/dri/r300/r300_state.c
+++ b/src/mesa/drivers/dri/r300/r300_state.c
@@ -42,6 +42,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/macros.h"
 #include "main/context.h"
 #include "main/dd.h"
+#include "main/framebuffer.h"
 #include "main/simple_list.h"
 #include "main/api_arrayelt.h"
 #include "main/texformat.h"
@@ -52,22 +53,20 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "shader/prog_statevars.h"
 #include "vbo/vbo.h"
 #include "tnl/tnl.h"
+#include "tnl/t_vp_build.h"
 
-#include "radeon_ioctl.h"
-#include "radeon_state.h"
 #include "r300_context.h"
 #include "r300_ioctl.h"
 #include "r300_state.h"
 #include "r300_reg.h"
 #include "r300_emit.h"
-#include "r300_fragprog.h"
 #include "r300_tex.h"
+#include "r300_fragprog_common.h"
+#include "r300_render.h"
+#include "r300_vertprog.h"
 
 #include "drirenderbuffer.h"
 
-extern int future_hw_tcl_on;
-extern void _tnl_UpdateFixedFunctionProgram(GLcontext * ctx);
-
 static void r300BlendColor(GLcontext * ctx, const GLfloat cf[4])
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
@@ -366,12 +365,13 @@ static void r300ClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
 	GLint *ip;
 
 	/* no VAP UCP on non-TCL chipsets */
-	if (!(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+	if (!rmesa->options.hw_tcl_enabled)
 			return;
 
 	p = (GLint) plane - (GLint) GL_CLIP_PLANE0;
 	ip = (GLint *)ctx->Transform._ClipUserPlane[p];
 
+	R300_STATECHANGE( rmesa, vap_flush );
 	R300_STATECHANGE( rmesa, vpucp[p] );
 	rmesa->hw.vpucp[p].cmd[R300_VPUCP_X] = ip[0];
 	rmesa->hw.vpucp[p].cmd[R300_VPUCP_Y] = ip[1];
@@ -385,7 +385,7 @@ static void r300SetClipPlaneState(GLcontext * ctx, GLenum cap, GLboolean state)
 	GLuint p;
 
 	/* no VAP UCP on non-TCL chipsets */
-	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+	if (!r300->options.hw_tcl_enabled)
 		return;
 
 	p = cap - GL_CLIP_PLANE0;
@@ -433,6 +433,10 @@ static void r300UpdateCulling(GLcontext * ctx)
 		break;
 	}
 
+	/* Winding is inverted when rendering to FBO */
+	if (ctx->DrawBuffer && ctx->DrawBuffer->Name)
+		val ^= R300_FRONT_FACE_CW;
+
 	R300_STATECHANGE(r300, cul);
 	r300->hw.cul.cmd[R300_CUL_CULL] = val;
 }
@@ -453,26 +457,22 @@ static GLboolean current_fragment_program_writes_depth(GLcontext* ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 
-	if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
-		struct r300_fragment_program *fp = (struct r300_fragment_program *)
-			(char *)ctx->FragmentProgram._Current;
-		return (fp && fp->WritesDepth);
-	} else {
-		struct r500_fragment_program* fp =
-			(struct r500_fragment_program*)(char*)
-			ctx->FragmentProgram._Current;
-		return (fp && fp->writes_depth);
-	}
+	return ctx->FragmentProgram._Current && r300->selected_fp->code.writes_depth;
 }
 
 static void r300SetEarlyZState(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	GLuint topZ = R300_ZTOP_ENABLE;
+	GLuint w_fmt, fgdepthsrc;
 
 	if (ctx->Color.AlphaEnabled && ctx->Color.AlphaFunc != GL_ALWAYS)
 		topZ = R300_ZTOP_DISABLE;
-	if (current_fragment_program_writes_depth(ctx))
+	else if (current_fragment_program_writes_depth(ctx))
+		topZ = R300_ZTOP_DISABLE;
+	else if (ctx->FragmentProgram._Current && ctx->FragmentProgram._Current->UsesKill)
+		topZ = R300_ZTOP_DISABLE;
+	else if (r300->radeon.query.current)
 		topZ = R300_ZTOP_DISABLE;
 
 	if (topZ != r300->hw.zstencil_format.cmd[2]) {
@@ -483,6 +483,26 @@ static void r300SetEarlyZState(GLcontext * ctx)
 		R300_STATECHANGE(r300, zstencil_format);
 		r300->hw.zstencil_format.cmd[2] = topZ;
 	}
+
+	/* w_fmt value is set to get best performance
+	* see p.130 R5xx 3D acceleration guide v1.3 */
+	if (current_fragment_program_writes_depth(ctx)) {
+		fgdepthsrc = R300_FG_DEPTH_SRC_SHADER;
+		w_fmt = R300_W_FMT_W24 | R300_W_SRC_US;
+	} else {
+		fgdepthsrc = R300_FG_DEPTH_SRC_SCAN;
+		w_fmt = R300_W_FMT_W0 | R300_W_SRC_US;
+	}
+
+	if (w_fmt != r300->hw.us_out_fmt.cmd[5]) {
+		R300_STATECHANGE(r300, us_out_fmt);
+		r300->hw.us_out_fmt.cmd[5] = w_fmt;
+	}
+
+	if (fgdepthsrc != r300->hw.fg_depth_src.cmd[1]) {
+		R300_STATECHANGE(r300, fg_depth_src);
+		r300->hw.fg_depth_src.cmd[1] = fgdepthsrc;
+	}
 }
 
 static void r300SetAlphaState(GLcontext * ctx)
@@ -533,8 +553,6 @@ static void r300SetAlphaState(GLcontext * ctx)
 	R300_STATECHANGE(r300, at);
 	r300->hw.at.cmd[R300_AT_ALPHA_TEST] = pp_misc;
 	r300->hw.at.cmd[R300_AT_UNKNOWN] = 0;
-
-	r300SetEarlyZState(ctx);
 }
 
 static void r300AlphaFunc(GLcontext * ctx, GLenum func, GLfloat ref)
@@ -572,7 +590,9 @@ static void r300SetDepthState(GLcontext * ctx)
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 
 	R300_STATECHANGE(r300, zs);
-	r300->hw.zs.cmd[R300_ZS_CNTL_0] &= R300_STENCIL_ENABLE|R300_STENCIL_FRONT_BACK;
+	r300->hw.zs.cmd[R300_ZS_CNTL_0] &= (R300_STENCIL_ENABLE |
+					    R300_STENCIL_FRONT_BACK |
+					    R500_STENCIL_REFMASK_FRONT_BACK);
 	r300->hw.zs.cmd[R300_ZS_CNTL_1] &= ~(R300_ZS_MASK << R300_Z_FUNC_SHIFT);
 
 	if (ctx->Depth.Test) {
@@ -582,15 +602,40 @@ static void r300SetDepthState(GLcontext * ctx)
 		r300->hw.zs.cmd[R300_ZS_CNTL_1] |=
 		    translate_func(ctx->Depth.Func) << R300_Z_FUNC_SHIFT;
 	}
+}
 
-	r300SetEarlyZState(ctx);
+static void r300CatchStencilFallback(GLcontext *ctx)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	const unsigned back = ctx->Stencil._BackFace;
+
+	if (rmesa->radeon.radeonScreen->kernel_mm &&
+	    (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)) {
+		r300SwitchFallback(ctx, R300_FALLBACK_STENCIL_TWOSIDE, GL_FALSE);
+	} else if (ctx->Stencil._Enabled &&
+		   (ctx->Stencil.Ref[0] != ctx->Stencil.Ref[back]
+		    || ctx->Stencil.ValueMask[0] != ctx->Stencil.ValueMask[back]
+		    || ctx->Stencil.WriteMask[0] != ctx->Stencil.WriteMask[back])) {
+		r300SwitchFallback(ctx, R300_FALLBACK_STENCIL_TWOSIDE, GL_TRUE);
+	} else {
+		r300SwitchFallback(ctx, R300_FALLBACK_STENCIL_TWOSIDE, GL_FALSE);
+	}
 }
 
 static void r300SetStencilState(GLcontext * ctx, GLboolean state)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	GLboolean hw_stencil = GL_FALSE;
+
+	r300CatchStencilFallback(ctx);
 
-	if (r300->state.stencil.hw_stencil) {
+	if (ctx->DrawBuffer) {
+		struct radeon_renderbuffer *rrbStencil
+			= radeon_get_renderbuffer(ctx->DrawBuffer, BUFFER_STENCIL);
+		hw_stencil = (rrbStencil && rrbStencil->bo);
+	}
+
+	if (hw_stencil) {
 		R300_STATECHANGE(r300, zs);
 		if (state) {
 			r300->hw.zs.cmd[R300_ZS_CNTL_0] |=
@@ -599,10 +644,6 @@ static void r300SetStencilState(GLcontext * ctx, GLboolean state)
 			r300->hw.zs.cmd[R300_ZS_CNTL_0] &=
 			    ~R300_STENCIL_ENABLE;
 		}
-	} else {
-#if R200_MERGED
-		FALLBACK(&r300->radeon, RADEON_FALLBACK_STENCIL, state);
-#endif
 	}
 }
 
@@ -730,139 +771,17 @@ static void r300ColorMask(GLcontext * ctx,
 }
 
 /* =============================================================
- * Fog
- */
-static void r300Fogfv(GLcontext * ctx, GLenum pname, const GLfloat * param)
-{
-	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	union {
-		int i;
-		float f;
-	} fogScale, fogStart;
-
-	(void)param;
-
-	fogScale.i = r300->hw.fogp.cmd[R300_FOGP_SCALE];
-	fogStart.i = r300->hw.fogp.cmd[R300_FOGP_START];
-
-	switch (pname) {
-	case GL_FOG_MODE:
-		switch (ctx->Fog.Mode) {
-		case GL_LINEAR:
-			R300_STATECHANGE(r300, fogs);
-			r300->hw.fogs.cmd[R300_FOGS_STATE] =
-			    (r300->hw.fogs.
-			     cmd[R300_FOGS_STATE] & ~R300_FG_FOG_BLEND_FN_MASK) |
-			    R300_FG_FOG_BLEND_FN_LINEAR;
-
-			if (ctx->Fog.Start == ctx->Fog.End) {
-				fogScale.f = -1.0;
-				fogStart.f = 1.0;
-			} else {
-				fogScale.f =
-				    1.0 / (ctx->Fog.End - ctx->Fog.Start);
-				fogStart.f =
-				    -ctx->Fog.Start / (ctx->Fog.End -
-						       ctx->Fog.Start);
-			}
-			break;
-		case GL_EXP:
-			R300_STATECHANGE(r300, fogs);
-			r300->hw.fogs.cmd[R300_FOGS_STATE] =
-			    (r300->hw.fogs.
-			     cmd[R300_FOGS_STATE] & ~R300_FG_FOG_BLEND_FN_MASK) |
-			    R300_FG_FOG_BLEND_FN_EXP;
-			fogScale.f = 0.0933 * ctx->Fog.Density;
-			fogStart.f = 0.0;
-			break;
-		case GL_EXP2:
-			R300_STATECHANGE(r300, fogs);
-			r300->hw.fogs.cmd[R300_FOGS_STATE] =
-			    (r300->hw.fogs.
-			     cmd[R300_FOGS_STATE] & ~R300_FG_FOG_BLEND_FN_MASK) |
-			    R300_FG_FOG_BLEND_FN_EXP2;
-			fogScale.f = 0.3 * ctx->Fog.Density;
-			fogStart.f = 0.0;
-		default:
-			return;
-		}
-		break;
-	case GL_FOG_DENSITY:
-		switch (ctx->Fog.Mode) {
-		case GL_EXP:
-			fogScale.f = 0.0933 * ctx->Fog.Density;
-			fogStart.f = 0.0;
-			break;
-		case GL_EXP2:
-			fogScale.f = 0.3 * ctx->Fog.Density;
-			fogStart.f = 0.0;
-		default:
-			break;
-		}
-		break;
-	case GL_FOG_START:
-	case GL_FOG_END:
-		if (ctx->Fog.Mode == GL_LINEAR) {
-			if (ctx->Fog.Start == ctx->Fog.End) {
-				fogScale.f = -1.0;
-				fogStart.f = 1.0;
-			} else {
-				fogScale.f =
-				    1.0 / (ctx->Fog.End - ctx->Fog.Start);
-				fogStart.f =
-				    -ctx->Fog.Start / (ctx->Fog.End -
-						       ctx->Fog.Start);
-			}
-		}
-		break;
-	case GL_FOG_COLOR:
-		R300_STATECHANGE(r300, fogc);
-		r300->hw.fogc.cmd[R300_FOGC_R] =
-		    (GLuint) (ctx->Fog.Color[0] * 1023.0F) & 0x3FF;
-		r300->hw.fogc.cmd[R300_FOGC_G] =
-		    (GLuint) (ctx->Fog.Color[1] * 1023.0F) & 0x3FF;
-		r300->hw.fogc.cmd[R300_FOGC_B] =
-		    (GLuint) (ctx->Fog.Color[2] * 1023.0F) & 0x3FF;
-		break;
-	case GL_FOG_COORD_SRC:
-		break;
-	default:
-		return;
-	}
-
-	if (fogScale.i != r300->hw.fogp.cmd[R300_FOGP_SCALE] ||
-	    fogStart.i != r300->hw.fogp.cmd[R300_FOGP_START]) {
-		R300_STATECHANGE(r300, fogp);
-		r300->hw.fogp.cmd[R300_FOGP_SCALE] = fogScale.i;
-		r300->hw.fogp.cmd[R300_FOGP_START] = fogStart.i;
-	}
-}
-
-static void r300SetFogState(GLcontext * ctx, GLboolean state)
-{
-	r300ContextPtr r300 = R300_CONTEXT(ctx);
-
-	R300_STATECHANGE(r300, fogs);
-	if (state) {
-		r300->hw.fogs.cmd[R300_FOGS_STATE] |= R300_FG_FOG_BLEND_ENABLE;
-
-		r300Fogfv(ctx, GL_FOG_MODE, NULL);
-		r300Fogfv(ctx, GL_FOG_DENSITY, &ctx->Fog.Density);
-		r300Fogfv(ctx, GL_FOG_START, &ctx->Fog.Start);
-		r300Fogfv(ctx, GL_FOG_END, &ctx->Fog.End);
-		r300Fogfv(ctx, GL_FOG_COLOR, ctx->Fog.Color);
-	} else {
-		r300->hw.fogs.cmd[R300_FOGS_STATE] &= ~R300_FG_FOG_BLEND_ENABLE;
-	}
-}
-
-/* =============================================================
  * Point state
  */
 static void r300PointSize(GLcontext * ctx, GLfloat size)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-        /* same size limits for AA, non-AA points */
+
+	/* We need to clamp to the user-defined range here, because
+	 * HW clamping is applied only to per-vertex point sizes. */
+	size = CLAMP(size, ctx->Point.MinSize, ctx->Point.MaxSize);
+
+	/* same size limits for AA, non-AA points */
 	size = CLAMP(size, ctx->Const.MinPointSize, ctx->Const.MaxPointSize);
 
 	R300_STATECHANGE(r300, ps);
@@ -955,35 +874,33 @@ static void r300ShadeModel(GLcontext * ctx, GLenum mode)
 
 	R300_STATECHANGE(rmesa, shade);
 	rmesa->hw.shade.cmd[1] = 0x00000002;
+	R300_STATECHANGE(rmesa, shade2);
 	switch (mode) {
 	case GL_FLAT:
-		rmesa->hw.shade.cmd[2] = R300_RE_SHADE_MODEL_FLAT;
+		rmesa->hw.shade2.cmd[1] = R300_RE_SHADE_MODEL_FLAT;
 		break;
 	case GL_SMOOTH:
-		rmesa->hw.shade.cmd[2] = R300_RE_SHADE_MODEL_SMOOTH;
+		rmesa->hw.shade2.cmd[1] = R300_RE_SHADE_MODEL_SMOOTH;
 		break;
 	default:
 		return;
 	}
-	rmesa->hw.shade.cmd[3] = 0x00000000;
-	rmesa->hw.shade.cmd[4] = 0x00000000;
+	rmesa->hw.shade2.cmd[2] = 0x00000000;
+	rmesa->hw.shade2.cmd[3] = 0x00000000;
 }
 
 static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
 				    GLenum func, GLint ref, GLuint mask)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	GLuint refmask =
-	    (((ctx->Stencil.
-	       Ref[0] & 0xff) << R300_STENCILREF_SHIFT) | ((ctx->
-							    Stencil.
-							    ValueMask
-							    [0] &
-							    0xff)
-							   <<
-							   R300_STENCILMASK_SHIFT));
-
+	GLuint refmask;
 	GLuint flag;
+	const unsigned back = ctx->Stencil._BackFace;
+
+	r300CatchStencilFallback(ctx);
+
+	refmask = ((ctx->Stencil.Ref[0] & 0xff) << R300_STENCILREF_SHIFT)
+	     | ((ctx->Stencil.ValueMask[0] & 0xff) << R300_STENCILMASK_SHIFT);
 
 	R300_STATECHANGE(rmesa, zs);
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_STENCIL_FRONT_BACK;
@@ -1000,17 +917,31 @@ static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
 	    (flag << R300_S_FRONT_FUNC_SHIFT);
 
-	if (ctx->Stencil._TestTwoSide)
-		flag = translate_func(ctx->Stencil.Function[1]);
+	flag = translate_func(ctx->Stencil.Function[back]);
 
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
 	    (flag << R300_S_BACK_FUNC_SHIFT);
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_2] |= refmask;
+
+	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+		rmesa->hw.zs.cmd[R300_ZS_CNTL_0] |= R500_STENCIL_REFMASK_FRONT_BACK;
+		R300_STATECHANGE(rmesa, zsb);
+		refmask = ((ctx->Stencil.Ref[back] & 0xff) << R300_STENCILREF_SHIFT)
+			| ((ctx->Stencil.ValueMask[back] & 0xff) << R300_STENCILMASK_SHIFT);
+
+		rmesa->hw.zsb.cmd[R300_ZSB_CNTL_0] &=
+			~((R300_STENCILREF_MASK << R300_STENCILREF_SHIFT) |
+			  (R300_STENCILREF_MASK << R300_STENCILMASK_SHIFT));
+		rmesa->hw.zsb.cmd[R300_ZSB_CNTL_0] |= refmask;
+	}
 }
 
 static void r300StencilMaskSeparate(GLcontext * ctx, GLenum face, GLuint mask)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	const unsigned back = ctx->Stencil._BackFace;
+
+	r300CatchStencilFallback(ctx);
 
 	R300_STATECHANGE(rmesa, zs);
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_2] &=
@@ -1020,12 +951,22 @@ static void r300StencilMaskSeparate(GLcontext * ctx, GLenum face, GLuint mask)
 	    (ctx->Stencil.
 	     WriteMask[0] & R300_STENCILREF_MASK) <<
 	     R300_STENCILWRITEMASK_SHIFT;
+	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+		R300_STATECHANGE(rmesa, zsb);
+		rmesa->hw.zsb.cmd[R300_ZSB_CNTL_0] |=
+			(ctx->Stencil.
+			 WriteMask[back] & R300_STENCILREF_MASK) <<
+			R300_STENCILWRITEMASK_SHIFT;
+	}
 }
 
 static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
 				  GLenum fail, GLenum zfail, GLenum zpass)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	const unsigned back = ctx->Stencil._BackFace;
+
+	r300CatchStencilFallback(ctx);
 
 	R300_STATECHANGE(rmesa, zs);
 	/* It is easier to mask what's left.. */
@@ -1042,51 +983,45 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
 	    | (translate_stencil_op(ctx->Stencil.ZPassFunc[0]) <<
 	       R300_S_FRONT_ZPASS_OP_SHIFT);
 
-	if (ctx->Stencil._TestTwoSide) {
-		rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
-		    (translate_stencil_op(ctx->Stencil.FailFunc[1]) <<
-		     R300_S_BACK_SFAIL_OP_SHIFT)
-		    | (translate_stencil_op(ctx->Stencil.ZFailFunc[1]) <<
-		       R300_S_BACK_ZFAIL_OP_SHIFT)
-		    | (translate_stencil_op(ctx->Stencil.ZPassFunc[1]) <<
-		       R300_S_BACK_ZPASS_OP_SHIFT);
-	} else {
-		rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
-		    (translate_stencil_op(ctx->Stencil.FailFunc[0]) <<
-		     R300_S_BACK_SFAIL_OP_SHIFT)
-		    | (translate_stencil_op(ctx->Stencil.ZFailFunc[0]) <<
-		       R300_S_BACK_ZFAIL_OP_SHIFT)
-		    | (translate_stencil_op(ctx->Stencil.ZPassFunc[0]) <<
-		       R300_S_BACK_ZPASS_OP_SHIFT);
-	}
+	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+	    (translate_stencil_op(ctx->Stencil.FailFunc[back]) <<
+	     R300_S_BACK_SFAIL_OP_SHIFT)
+	    | (translate_stencil_op(ctx->Stencil.ZFailFunc[back]) <<
+	       R300_S_BACK_ZFAIL_OP_SHIFT)
+	    | (translate_stencil_op(ctx->Stencil.ZPassFunc[back]) <<
+	       R300_S_BACK_ZPASS_OP_SHIFT);
 }
 
 /* =============================================================
  * Window position and viewport transformation
  */
 
-/*
- * To correctly position primitives:
- */
-#define SUBPIXEL_X 0.125
-#define SUBPIXEL_Y 0.125
-
 static void r300UpdateWindow(GLcontext * ctx)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
 	GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
 	GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
 	const GLfloat *v = ctx->Viewport._WindowMap.m;
+	const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+	const GLboolean render_to_fbo = (ctx->DrawBuffer->Name != 0);
+	GLfloat y_scale, y_bias;
+
+	if (render_to_fbo) {
+		y_scale = 1.0;
+		y_bias = 0;
+	} else {
+		y_scale = -1.0;
+		y_bias = yoffset;
+	}
 
 	GLfloat sx = v[MAT_SX];
-	GLfloat tx = v[MAT_TX] + xoffset + SUBPIXEL_X;
-	GLfloat sy = -v[MAT_SY];
-	GLfloat ty = (-v[MAT_TY]) + yoffset + SUBPIXEL_Y;
-	GLfloat sz = v[MAT_SZ] * rmesa->state.depth.scale;
-	GLfloat tz = v[MAT_TZ] * rmesa->state.depth.scale;
+	GLfloat tx = v[MAT_TX] + xoffset;
+	GLfloat sy = v[MAT_SY] * y_scale;
+	GLfloat ty = (v[MAT_TY] * y_scale) + y_bias;
+	GLfloat sz = v[MAT_SZ] * depthScale;
+	GLfloat tz = v[MAT_TZ] * depthScale;
 
-	R300_FIREVERTICES(rmesa);
 	R300_STATECHANGE(rmesa, vpt);
 
 	rmesa->hw.vpt.cmd[R300_VPT_XSCALE] = r300PackFloat32(sx);
@@ -1105,6 +1040,8 @@ static void r300Viewport(GLcontext * ctx, GLint x, GLint y,
 	 * values, or keep the originals hanging around.
 	 */
 	r300UpdateWindow(ctx);
+
+	radeon_viewport(ctx, x, y, width, height);
 }
 
 static void r300DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
@@ -1115,13 +1052,13 @@ static void r300DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
 void r300UpdateViewportOffset(GLcontext * ctx)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = ((radeonContextPtr) rmesa)->dri.drawable;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
 	GLfloat xoffset = (GLfloat) dPriv->x;
 	GLfloat yoffset = (GLfloat) dPriv->y + dPriv->h;
 	const GLfloat *v = ctx->Viewport._WindowMap.m;
 
-	GLfloat tx = v[MAT_TX] + xoffset + SUBPIXEL_X;
-	GLfloat ty = (-v[MAT_TY]) + yoffset + SUBPIXEL_Y;
+	GLfloat tx = v[MAT_TX] + xoffset;
+	GLfloat ty = (-v[MAT_TY]) + yoffset;
 
 	if (rmesa->hw.vpt.cmd[R300_VPT_XOFFSET] != r300PackFloat32(tx) ||
 	    rmesa->hw.vpt.cmd[R300_VPT_YOFFSET] != r300PackFloat32(ty)) {
@@ -1138,138 +1075,26 @@ void r300UpdateViewportOffset(GLcontext * ctx)
 }
 
 /**
- * Tell the card where to render (offset, pitch).
- * Effected by glDrawBuffer, etc
- */
-void r300UpdateDrawBuffer(GLcontext * ctx)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	r300ContextPtr r300 = rmesa;
-	struct gl_framebuffer *fb = ctx->DrawBuffer;
-	driRenderbuffer *drb;
-
-	if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
-		/* draw to front */
-		drb =
-		    (driRenderbuffer *) fb->Attachment[BUFFER_FRONT_LEFT].
-		    Renderbuffer;
-	} else if (fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
-		/* draw to back */
-		drb =
-		    (driRenderbuffer *) fb->Attachment[BUFFER_BACK_LEFT].
-		    Renderbuffer;
-	} else {
-		/* drawing to multiple buffers, or none */
-		return;
-	}
-
-	assert(drb);
-	assert(drb->flippedPitch);
-
-	R300_STATECHANGE(rmesa, cb);
-
-	r300->hw.cb.cmd[R300_CB_OFFSET] = drb->flippedOffset +	//r300->radeon.state.color.drawOffset +
-	    r300->radeon.radeonScreen->fbLocation;
-	r300->hw.cb.cmd[R300_CB_PITCH] = drb->flippedPitch;	//r300->radeon.state.color.drawPitch;
-
-	if (r300->radeon.radeonScreen->cpp == 4)
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
-	else
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
-
-	if (r300->radeon.sarea->tiling_enabled)
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
-#if 0
-	R200_STATECHANGE(rmesa, ctx);
-
-	/* Note: we used the (possibly) page-flipped values */
-	rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET]
-	    = ((drb->flippedOffset + rmesa->r200Screen->fbLocation)
-	       & R200_COLOROFFSET_MASK);
-	rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = drb->flippedPitch;
-
-	if (rmesa->sarea->tiling_enabled) {
-		rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |=
-		    R200_COLOR_TILE_ENABLE;
-	}
-#endif
-}
-
-static void
-r300FetchStateParameter(GLcontext * ctx,
-			const gl_state_index state[STATE_LENGTH],
-			GLfloat * value)
-{
-	r300ContextPtr r300 = R300_CONTEXT(ctx);
-
-	switch (state[0]) {
-	case STATE_INTERNAL:
-		switch (state[1]) {
-		case STATE_R300_WINDOW_DIMENSION:
-			value[0] = r300->radeon.dri.drawable->w * 0.5f;	/* width*0.5 */
-			value[1] = r300->radeon.dri.drawable->h * 0.5f;	/* height*0.5 */
-			value[2] = 0.5F;	/* for moving range [-1 1] -> [0 1] */
-			value[3] = 1.0F;	/* not used */
-			break;
-
-		case STATE_R300_TEXRECT_FACTOR:{
-				struct gl_texture_object *t =
-				    ctx->Texture.Unit[state[2]].CurrentRect;
-
-				if (t && t->Image[0][t->BaseLevel]) {
-					struct gl_texture_image *image =
-					    t->Image[0][t->BaseLevel];
-					value[0] = 1.0 / image->Width2;
-					value[1] = 1.0 / image->Height2;
-				} else {
-					value[0] = 1.0;
-					value[1] = 1.0;
-				}
-				value[2] = 1.0;
-				value[3] = 1.0;
-				break;
-			}
-
-		default:
-			break;
-		}
-		break;
-
-	default:
-		break;
-	}
-}
-
-/**
  * Update R300's own internal state parameters.
  * For now just STATE_R300_WINDOW_DIMENSION
  */
-void r300UpdateStateParameters(GLcontext * ctx, GLuint new_state)
+static void r300UpdateStateParameters(GLcontext * ctx, GLuint new_state)
 {
-	struct r300_fragment_program *fp;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	struct gl_program_parameter_list *paramList;
-	GLuint i;
 
-	if (!(new_state & (_NEW_BUFFERS | _NEW_PROGRAM)))
+	if (!(new_state & (_NEW_BUFFERS | _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS)))
 		return;
 
-	fp = (struct r300_fragment_program *)ctx->FragmentProgram._Current;
-	if (!fp)
+	if (!ctx->FragmentProgram._Current || !rmesa->selected_fp)
 		return;
 
-	paramList = fp->mesa_program.Base.Parameters;
+	paramList = ctx->FragmentProgram._Current->Base.Parameters;
 
 	if (!paramList)
 		return;
 
-	for (i = 0; i < paramList->NumParameters; i++) {
-		if (paramList->Parameters[i].Type == PROGRAM_STATE_VAR) {
-			r300FetchStateParameter(ctx,
-						paramList->Parameters[i].
-						StateIndexes,
-						paramList->ParameterValues[i]);
-		}
-	}
+	_mesa_load_state_parameters(ctx, paramList);
 }
 
 /* =============================================================
@@ -1376,9 +1201,7 @@ static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	int i;
-	struct r300_fragment_program *fp = (struct r300_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
-	struct r300_fragment_program_code *code = &fp->code;
+	struct r300_fragment_program_code *code = &r300->selected_fp->code.code.r300;
 
 	R300_STATECHANGE(r300, fpt);
 
@@ -1412,15 +1235,15 @@ static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
 	}
 
 	r300->hw.fpt.cmd[R300_FPT_CMD_0] =
-		cmdpacket0(R300_US_TEX_INST_0, code->tex.length);
+		cmdpacket0(r300->radeon.radeonScreen,
+                   R300_US_TEX_INST_0, code->tex.length);
 }
 
 static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
 {
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	int i;
-	struct r500_fragment_program *fp = (struct r500_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
-	struct r500_fragment_program_code *code = &fp->code;
+	struct r500_fragment_program_code *code = &r300->selected_fp->code.code.r500;
 
 	/* find all the texture instructions and relocate the texture units */
 	for (i = 0; i < code->inst_end + 1; i++) {
@@ -1460,16 +1283,15 @@ static GLuint translate_lod_bias(GLfloat bias)
 	return (((GLuint)b) << R300_LOD_BIAS_SHIFT) & R300_LOD_BIAS_MASK;
 }
 
+
 static void r300SetupTextures(GLcontext * ctx)
 {
 	int i, mtu;
-	struct r300_tex_obj *t;
+	struct radeon_tex_obj *t;
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	int hw_tmu = 0;
 	int last_hw_tmu = -1;	/* -1 translates into no setup costs for fields */
 	int tmu_mappings[R300_MAX_TEXTURE_UNITS] = { -1, };
-	struct r300_fragment_program *fp = (struct r300_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
 
 	R300_STATECHANGE(r300, txe);
 	R300_STATECHANGE(r300, tex.filter);
@@ -1484,7 +1306,7 @@ static void r300SetupTextures(GLcontext * ctx)
 	r300->hw.txe.cmd[R300_TXE_ENABLE] = 0x0;
 
 	mtu = r300->radeon.glCtx->Const.MaxTextureUnits;
-	if (RADEON_DEBUG & DEBUG_STATE)
+	if (RADEON_DEBUG & RADEON_STATE)
 		fprintf(stderr, "mtu=%d\n", mtu);
 
 	if (mtu > R300_MAX_TEXTURE_UNITS) {
@@ -1497,24 +1319,19 @@ static void r300SetupTextures(GLcontext * ctx)
 	/* We cannot let disabled tmu offsets pass DRM */
 	for (i = 0; i < mtu; i++) {
 		if (ctx->Texture.Unit[i]._ReallyEnabled) {
-
-#if 0				/* Enables old behaviour */
-			hw_tmu = i;
-#endif
 			tmu_mappings[i] = hw_tmu;
 
-			t = r300->state.texture.unit[i].texobj;
-			/* XXX questionable fix for bug 9170: */
+			t = radeon_tex_obj(ctx->Texture.Unit[i]._Current);
 			if (!t)
 				continue;
 
-			if ((t->format & 0xffffff00) == 0xffffff00) {
+			if ((t->pp_txformat & 0xffffff00) == 0xffffff00) {
 				WARN_ONCE
 				    ("unknown texture format (entry %x) encountered. Help me !\n",
-				     t->format & 0xff);
+				     t->pp_txformat & 0xff);
 			}
 
-			if (RADEON_DEBUG & DEBUG_STATE)
+			if (RADEON_DEBUG & RADEON_STATE)
 				fprintf(stderr,
 					"Activating texture unit %d\n", i);
 
@@ -1522,29 +1339,28 @@ static void r300SetupTextures(GLcontext * ctx)
 
 			r300->hw.tex.filter.cmd[R300_TEX_VALUE_0 +
 						hw_tmu] =
-			    gen_fixed_filter(t->filter) | (hw_tmu << 28);
+			    gen_fixed_filter(t->pp_txfilter) | (hw_tmu << 28);
 			/* Note: There is a LOD bias per texture unit and a LOD bias
 			 * per texture object. We add them here to get the correct behaviour.
 			 * (The per-texture object LOD bias was introduced in OpenGL 1.4
 			 * and is not present in the EXT_texture_object extension).
 			 */
 			r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] =
-				t->filter_1 |
-				translate_lod_bias(ctx->Texture.Unit[i].LodBias + t->base.tObj->LodBias);
+				t->pp_txfilter_1 |
+				translate_lod_bias(ctx->Texture.Unit[i].LodBias + t->base.LodBias);
 			r300->hw.tex.size.cmd[R300_TEX_VALUE_0 + hw_tmu] =
-			    t->size;
+			    t->pp_txsize;
 			r300->hw.tex.format.cmd[R300_TEX_VALUE_0 +
-						hw_tmu] = t->format;
+						hw_tmu] = t->pp_txformat;
 			r300->hw.tex.pitch.cmd[R300_TEX_VALUE_0 + hw_tmu] =
-			    t->pitch_reg;
-			r300->hw.tex.offset.cmd[R300_TEX_VALUE_0 +
-						hw_tmu] = t->offset;
+			  t->pp_txpitch;
+			r300->hw.textures[hw_tmu] = t;
 
-			if (t->offset & R300_TXO_MACRO_TILE) {
+			if (t->tile_bits & R300_TXO_MACRO_TILE) {
 				WARN_ONCE("macro tiling enabled!\n");
 			}
 
-			if (t->offset & R300_TXO_MICRO_TILE) {
+			if (t->tile_bits & R300_TXO_MICRO_TILE) {
 				WARN_ONCE("micro tiling enabled!\n");
 			}
 
@@ -1560,40 +1376,48 @@ static void r300SetupTextures(GLcontext * ctx)
 		}
 	}
 
+	/* R3xx and R4xx chips require that the texture unit corresponding to
+	 * KIL instructions is really enabled.
+	 *
+	 * We do some fakery here and in the state atom emit logic to enable
+	 * the texture without tripping up the CS checker in the kernel.
+	 */
+	if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
+		if (ctx->FragmentProgram._Current->UsesKill && last_hw_tmu < 0) {
+			last_hw_tmu++;
+
+			r300->hw.txe.cmd[R300_TXE_ENABLE] |= 1;
+
+			r300->hw.tex.border_color.cmd[R300_TEX_VALUE_0] = 0;
+			r300->hw.tex.chroma_key.cmd[R300_TEX_VALUE_0] = 0;
+			r300->hw.tex.filter.cmd[R300_TEX_VALUE_0] = 0;
+			r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0] = 0;
+			r300->hw.tex.size.cmd[R300_TEX_VALUE_0] = 0; /* 1x1 texture */
+			r300->hw.tex.format.cmd[R300_TEX_VALUE_0] = 0; /* A8 format */
+			r300->hw.tex.pitch.cmd[R300_TEX_VALUE_0] = 0;
+		}
+	}
+
 	r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FILTER0_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER0_0, last_hw_tmu + 1);
 	r300->hw.tex.filter_1.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FILTER1_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER1_0, last_hw_tmu + 1);
 	r300->hw.tex.size.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_SIZE_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_SIZE_0, last_hw_tmu + 1);
 	r300->hw.tex.format.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FORMAT_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT_0, last_hw_tmu + 1);
 	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FORMAT2_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT2_0, last_hw_tmu + 1);
 	r300->hw.tex.offset.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_OFFSET_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_OFFSET_0, last_hw_tmu + 1);
 	r300->hw.tex.chroma_key.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_CHROMA_KEY_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_CHROMA_KEY_0, last_hw_tmu + 1);
 	r300->hw.tex.border_color.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_BORDER_COLOR_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_BORDER_COLOR_0, last_hw_tmu + 1);
 
-	if (!fp)		/* should only happenen once, just after context is created */
-		return;
-
-	if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
-		if (fp->mesa_program.UsesKill && last_hw_tmu < 0) {
-			// The KILL operation requires the first texture unit
-			// to be enabled.
-			r300->hw.txe.cmd[R300_TXE_ENABLE] |= 1;
-			r300->hw.tex.filter.cmd[R300_TEX_VALUE_0] = 0;
-			r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
-				cmdpacket0(R300_TX_FILTER0_0, 1);
-		}
-		r300SetupFragmentShaderTextures(ctx, tmu_mappings);
-	} else
-		r500SetupFragmentShaderTextures(ctx, tmu_mappings);
+	r300->vtbl.SetupFragmentShaderTextures(ctx, tmu_mappings);
 
-	if (RADEON_DEBUG & DEBUG_STATE)
+	if (RADEON_DEBUG & RADEON_STATE)
 		fprintf(stderr, "TX_ENABLE: %08x  last_hw_tmu=%d\n",
 			r300->hw.txe.cmd[R300_TXE_ENABLE], last_hw_tmu);
 }
@@ -1610,124 +1434,44 @@ union r300_outputs_written {
 static void r300SetupRSUnit(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	/* I'm still unsure if these are needed */
-	GLuint interp_col[8];
-        TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *VB = &tnl->vb;
 	union r300_outputs_written OutputsWritten;
 	GLuint InputsRead;
 	int fp_reg, high_rr;
-	int col_interp_nr;
-	int rs_tex_count = 0, rs_col_count = 0;
-	int i, count;
+	int col_ip, tex_ip;
+	int rs_tex_count = 0;
+	int i, col_fmt, hw_tcl_on;
 
-	memset(interp_col, 0, sizeof(interp_col));
+	hw_tcl_on = r300->options.hw_tcl_enabled;
 
 	if (hw_tcl_on)
-		OutputsWritten.vp_outputs = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
+		OutputsWritten.vp_outputs = r300->selected_vp->code.OutputsWritten;
 	else
-		RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->state.render_inputs_bitset);
+		RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->render_inputs_bitset);
 
-	if (ctx->FragmentProgram._Current)
-		InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
-	else {
-		fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
-		return;		/* This should only ever happen once.. */
-	}
+	InputsRead = r300->selected_fp->InputsRead;
 
 	R300_STATECHANGE(r300, ri);
 	R300_STATECHANGE(r300, rc);
 	R300_STATECHANGE(r300, rr);
 
-	fp_reg = col_interp_nr = high_rr = 0;
-
-	r300->hw.rr.cmd[R300_RR_INST_1] = 0;
-
-	if (InputsRead & FRAG_BIT_WPOS) {
-		for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
-			if (!(InputsRead & (FRAG_BIT_TEX0 << i)))
-				break;
-
-		if (i == ctx->Const.MaxTextureUnits) {
-			fprintf(stderr, "\tno free texcoord found...\n");
-			_mesa_exit(-1);
-		}
-
-		InputsRead |= (FRAG_BIT_TEX0 << i);
-		InputsRead &= ~FRAG_BIT_WPOS;
-	}
-
-	if (InputsRead & FRAG_BIT_COL0) {
-		count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
-		interp_col[0] |= R300_RS_COL_PTR(rs_col_count);
-		if (count == 3)
-			interp_col[0] |= R300_RS_COL_FMT(R300_RS_COL_FMT_RGB1);
-		rs_col_count += count;
-	}
-	else
-		interp_col[0] = R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
-
-	if (InputsRead & FRAG_BIT_COL1) {
-		count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
-		if (count == 3)
-			interp_col[1] |= R300_RS_COL_FMT(R300_RS_COL_FMT_RGB0);
-		interp_col[1] |= R300_RS_COL_PTR(1);
-		rs_col_count += count;
-	}
-
-	if (InputsRead & FRAG_BIT_FOGC) {
-		/* XXX FIX THIS
-		 * Just turn off the bit for now.
-		 * Need to do something similar to the color/texcoord inputs.
-		 */
-		InputsRead &= ~FRAG_BIT_FOGC;
-	}
+	fp_reg = col_ip = tex_ip = col_fmt = 0;
 
-	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-		int swiz;
+	r300->hw.rc.cmd[1] = 0;
+	r300->hw.rc.cmd[2] = 0;
+	for (i=0; i<R300_RR_CMDSIZE-1; ++i)
+		r300->hw.rr.cmd[R300_RR_INST_0 + i] = 0;
 
-		/* with TCL we always seem to route 4 components */
-		if (hw_tcl_on)
-		  count = 4;
-		else
-		  count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
+	for (i=0; i<R300_RI_CMDSIZE-1; ++i)
+		r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = 0;
 
-		r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = interp_col[i] | rs_tex_count;
-		switch(count) {
-		case 4: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3); break;
-		case 3: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(R300_RS_SEL_K1); break;
-		default:
-		case 1:
-		case 2: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(R300_RS_SEL_K0) | R300_RS_SEL_Q(R300_RS_SEL_K1); break;
-		};
-
-		r300->hw.ri.cmd[R300_RI_INTERP_0 + i] |= swiz;
-
-		r300->hw.rr.cmd[R300_RR_INST_0 + fp_reg] = 0;
-		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
-
-			rs_tex_count += count;
-
-			//assert(r300->state.texture.tc_count != 0);
-			r300->hw.rr.cmd[R300_RR_INST_0 + fp_reg] |= R300_RS_INST_TEX_CN_WRITE | i	/* source INTERP */
-			    | (fp_reg << R300_RS_INST_TEX_ADDR_SHIFT);
-			high_rr = fp_reg;
-
-			/* Passing invalid data here can lock the GPU. */
-			if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i))) {
-				InputsRead &= ~(FRAG_BIT_TEX0 << i);
-				fp_reg++;
-			} else {
-				WARN_ONCE("fragprog wants coords for tex%d, vp doesn't provide them!\n", i);
-			}
-		}
-	}
 
 	if (InputsRead & FRAG_BIT_COL0) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
-			r300->hw.rr.cmd[R300_RR_INST_0] |= R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_CN_WRITE | (fp_reg++ << R300_RS_INST_COL_ADDR_SHIFT);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
+			r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R300_RS_INST_COL_ID(col_ip) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_COL0;
-			col_interp_nr++;
+			++col_ip;
+			++fp_reg;
 		} else {
 			WARN_ONCE("fragprog wants col0, vp doesn't provide it\n");
 		}
@@ -1735,29 +1479,47 @@ static void r300SetupRSUnit(GLcontext * ctx)
 
 	if (InputsRead & FRAG_BIT_COL1) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
-			r300->hw.rr.cmd[R300_RR_INST_1] |= R300_RS_INST_COL_ID(1) | R300_RS_INST_COL_CN_WRITE | (fp_reg++ << R300_RS_INST_COL_ADDR_SHIFT);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
+			r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R300_RS_INST_COL_ID(col_ip) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_COL1;
-			if (high_rr < 1)
-				high_rr = 1;
-			col_interp_nr++;
+			++col_ip;
+			++fp_reg;
 		} else {
 			WARN_ONCE("fragprog wants col1, vp doesn't provide it\n");
 		}
 	}
 
-	/* Need at least one. This might still lock as the values are undefined... */
-	if (rs_tex_count == 0 && col_interp_nr == 0) {
-		r300->hw.rr.cmd[R300_RR_INST_0] |= R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_CN_WRITE | (fp_reg++ << R300_RS_INST_COL_ADDR_SHIFT);
-		col_interp_nr++;
+	/* We always route 4 texcoord components */
+	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+		if (! ( InputsRead & FRAG_BIT_TEX(i) ) )
+		    continue;
+
+		if (!R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i))) {
+		    WARN_ONCE("fragprog wants coords for tex%d, vp doesn't provide them!\n", i);
+		    continue;
+		}
+
+		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3) | R300_RS_TEX_PTR(rs_tex_count);
+		r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
+		InputsRead &= ~(FRAG_BIT_TEX0 << i);
+		rs_tex_count += 4;
+		++tex_ip;
+		++fp_reg;
 	}
 
-	r300->hw.rc.cmd[1] = 0 | (rs_tex_count << R300_IT_COUNT_SHIFT)
-	  | (col_interp_nr << R300_IC_COUNT_SHIFT)
-	  | R300_HIRES_EN;
+	/* Set up a default color if no color or texcoord input was routed */
+	if (rs_tex_count == 0 && col_ip == 0) {
+		r300->hw.rr.cmd[R300_RR_INST_0] = R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_ADDR(0);
+		r300->hw.ri.cmd[R300_RI_INTERP_0] = R300_RS_COL_PTR(0) | R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
+		++col_ip;
+	}
+
+	high_rr = (col_ip > tex_ip) ? col_ip : tex_ip;
+	r300->hw.rc.cmd[1] |= (rs_tex_count << R300_IT_COUNT_SHIFT) | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
+	r300->hw.rc.cmd[2] |= high_rr - 1;
 
-	assert(high_rr >= 0);
-	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R300_RS_INST_0, high_rr + 1);
-	r300->hw.rc.cmd[2] = high_rr;
+	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_INST_0, high_rr);
+	r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_IP_0, high_rr);
 
 	if (InputsRead)
 		WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
@@ -1766,128 +1528,44 @@ static void r300SetupRSUnit(GLcontext * ctx)
 static void r500SetupRSUnit(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	/* I'm still unsure if these are needed */
-	GLuint interp_col[8];
 	union r300_outputs_written OutputsWritten;
-        TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *VB = &tnl->vb;
 	GLuint InputsRead;
 	int fp_reg, high_rr;
-	int rs_col_count = 0;
-	int in_texcoords, col_interp_nr;
-	int i, count;
+	int col_ip, tex_ip;
+	int rs_tex_count = 0;
+	int i, col_fmt, hw_tcl_on;
+
+	hw_tcl_on = r300->options.hw_tcl_enabled;
 
-	memset(interp_col, 0, sizeof(interp_col));
 	if (hw_tcl_on)
-		OutputsWritten.vp_outputs = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
+		OutputsWritten.vp_outputs = r300->selected_vp->code.OutputsWritten;
 	else
-		RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->state.render_inputs_bitset);
+		RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->render_inputs_bitset);
 
-	if (ctx->FragmentProgram._Current)
-		InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
-	else {
-		fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
-		return;		/* This should only ever happen once.. */
-	}
+	InputsRead = r300->selected_fp->InputsRead;
 
 	R300_STATECHANGE(r300, ri);
 	R300_STATECHANGE(r300, rc);
 	R300_STATECHANGE(r300, rr);
 
-	fp_reg = col_interp_nr = high_rr = in_texcoords = 0;
-
-	r300->hw.rr.cmd[R300_RR_INST_1] = 0;
+	fp_reg = col_ip = tex_ip = col_fmt = 0;
 
-	if (InputsRead & FRAG_BIT_WPOS) {
-		for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
-			if (!(InputsRead & (FRAG_BIT_TEX0 << i)))
-				break;
+	r300->hw.rc.cmd[1] = 0;
+	r300->hw.rc.cmd[2] = 0;
+	for (i=0; i<R300_RR_CMDSIZE-1; ++i)
+		r300->hw.rr.cmd[R300_RR_INST_0 + i] = 0;
 
-		if (i == ctx->Const.MaxTextureUnits) {
-			fprintf(stderr, "\tno free texcoord found...\n");
-			_mesa_exit(-1);
-		}
+	for (i=0; i<R500_RI_CMDSIZE-1; ++i)
+		r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = 0;
 
-		InputsRead |= (FRAG_BIT_TEX0 << i);
-		InputsRead &= ~FRAG_BIT_WPOS;
-	}
-
-	if (InputsRead & FRAG_BIT_COL0) {
-		count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
-		interp_col[0] |= R500_RS_COL_PTR(rs_col_count);
-		if (count == 3)
-			interp_col[0] |= R500_RS_COL_FMT(R300_RS_COL_FMT_RGB1);
-		rs_col_count += count;
-	}
-	else
-		interp_col[0] = R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
-
-	if (InputsRead & FRAG_BIT_COL1) {
-		count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
-		interp_col[1] |= R500_RS_COL_PTR(1);
-		if (count == 3)
-			interp_col[1] |= R500_RS_COL_FMT(R300_RS_COL_FMT_RGB0);
-		rs_col_count += count;
-	}
-
-	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-		GLuint swiz = 0;
-
-		/* with TCL we always seem to route 4 components */
-		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
-
-		  if (hw_tcl_on)
-		    count = 4;
-		  else
-		    count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
-
-		  /* always have on texcoord */
-		  swiz |= in_texcoords++ << R500_RS_IP_TEX_PTR_S_SHIFT;
-		  if (count >= 2)
-		    swiz |= in_texcoords++ << R500_RS_IP_TEX_PTR_T_SHIFT;
-		  else
-		    swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT;
-
-		  if (count >= 3)
-		    swiz |= in_texcoords++ << R500_RS_IP_TEX_PTR_R_SHIFT;
-		  else
-		    swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
-
-		  if (count == 4)
-		    swiz |= in_texcoords++ << R500_RS_IP_TEX_PTR_Q_SHIFT;
-		  else
-		    swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
-
-		} else
-		   swiz = (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
-		          (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
-		          (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
-		          (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT);
-
-		r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = interp_col[i] | swiz;
-
-		r300->hw.rr.cmd[R300_RR_INST_0 + fp_reg] = 0;
-		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
-			//assert(r300->state.texture.tc_count != 0);
-			r300->hw.rr.cmd[R300_RR_INST_0 + fp_reg] |= R500_RS_INST_TEX_CN_WRITE | i	/* source INTERP */
-			    | (fp_reg << R500_RS_INST_TEX_ADDR_SHIFT);
-			high_rr = fp_reg;
-
-			/* Passing invalid data here can lock the GPU. */
-			if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i))) {
-				InputsRead &= ~(FRAG_BIT_TEX0 << i);
-				fp_reg++;
-			} else {
-				WARN_ONCE("fragprog wants coords for tex%d, vp doesn't provide them!\n", i);
-			}
-		}
-	}
 
 	if (InputsRead & FRAG_BIT_COL0) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
-			r300->hw.rr.cmd[R300_RR_INST_0] |= R500_RS_INST_COL_CN_WRITE | (fp_reg++ << R500_RS_INST_COL_ADDR_SHIFT);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
+			r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R500_RS_INST_COL_ID(col_ip) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_COL0;
-			col_interp_nr++;
+			++col_ip;
+			++fp_reg;
 		} else {
 			WARN_ONCE("fragprog wants col0, vp doesn't provide it\n");
 		}
@@ -1895,86 +1573,59 @@ static void r500SetupRSUnit(GLcontext * ctx)
 
 	if (InputsRead & FRAG_BIT_COL1) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
-			r300->hw.rr.cmd[R300_RR_INST_1] |= (1 << 12) | R500_RS_INST_COL_CN_WRITE |  (fp_reg++ << R500_RS_INST_COL_ADDR_SHIFT);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
+			r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R500_RS_INST_COL_ID(col_ip) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_COL1;
-			if (high_rr < 1)
-				high_rr = 1;
-			col_interp_nr++;
+			++col_ip;
+			++fp_reg;
 		} else {
 			WARN_ONCE("fragprog wants col1, vp doesn't provide it\n");
 		}
 	}
 
-	/* Need at least one. This might still lock as the values are undefined... */
-	if (in_texcoords == 0 && col_interp_nr == 0) {
-		r300->hw.rr.cmd[R300_RR_INST_0] |= 0 | R500_RS_INST_COL_CN_WRITE | (fp_reg++ << R500_RS_INST_COL_ADDR_SHIFT);
-		col_interp_nr++;
-	}
-
-	r300->hw.rc.cmd[1] = 0 | (in_texcoords << R300_IT_COUNT_SHIFT)
-	  | (col_interp_nr << R300_IC_COUNT_SHIFT)
-	  | R300_HIRES_EN;
-
-	assert(high_rr >= 0);
-	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R500_RS_INST_0, high_rr + 1);
-	r300->hw.rc.cmd[2] = 0xC0 | high_rr;
-
-	if (InputsRead)
-		WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
-}
-
-
-
+	/* We always route 4 texcoord components */
+	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+		if (! ( InputsRead & FRAG_BIT_TEX(i) ) )
+		    continue;
 
-#define bump_vpu_count(ptr, new_count)   do{\
-	drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr));\
-	int _nc=(new_count)/4; \
-	assert(_nc < 256); \
-	if(_nc>_p->vpu.count)_p->vpu.count=_nc;\
-	}while(0)
+		if (!R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i))) {
+		    WARN_ONCE("fragprog wants coords for tex%d, vp doesn't provide them!\n", i);
+		    continue;
+		}
 
-static INLINE void r300SetupVertexProgramFragment(r300ContextPtr r300, int dest, struct r300_vertex_shader_fragment *vsf)
-{
-	int i;
+		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
+			((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
+			((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
+			((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
 
-	if (vsf->length == 0)
-		return;
+		r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
+		InputsRead &= ~(FRAG_BIT_TEX0 << i);
+		rs_tex_count += 4;
+		++tex_ip;
+		++fp_reg;
+	}
 
-	if (vsf->length & 0x3) {
-		fprintf(stderr, "VERTEX_SHADER_FRAGMENT must have length divisible by 4\n");
-		_mesa_exit(-1);
+	/* Set up default color if no color or tex was set */
+	if (rs_tex_count == 0 && col_ip == 0) {
+		r300->hw.rr.cmd[R300_RR_INST_0] = R500_RS_INST_COL_ID(0) | R500_RS_INST_COL_ADDR(0);
+		r300->hw.ri.cmd[R300_RI_INTERP_0] = R500_RS_COL_PTR(0) | R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
+		++col_ip;
 	}
 
-	switch ((dest >> 8) & 0xf) {
-	case 0:
-		R300_STATECHANGE(r300, vpi);
-		for (i = 0; i < vsf->length; i++)
-			r300->hw.vpi.cmd[R300_VPI_INSTR_0 + i + 4 * (dest & 0xff)] = (vsf->body.d[i]);
-		bump_vpu_count(r300->hw.vpi.cmd, vsf->length + 4 * (dest & 0xff));
-		break;
+	high_rr = (col_ip > tex_ip) ? col_ip : tex_ip;
+	r300->hw.rc.cmd[1] = (rs_tex_count << R300_IT_COUNT_SHIFT) | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
+	r300->hw.rc.cmd[2] = 0xC0 | (high_rr - 1);
 
-	case 2:
-		R300_STATECHANGE(r300, vpp);
-		for (i = 0; i < vsf->length; i++)
-			r300->hw.vpp.cmd[R300_VPP_PARAM_0 + i + 4 * (dest & 0xff)] = (vsf->body.d[i]);
-		bump_vpu_count(r300->hw.vpp.cmd, vsf->length + 4 * (dest & 0xff));
-		break;
-	case 4:
-		R300_STATECHANGE(r300, vps);
-		for (i = 0; i < vsf->length; i++)
-			r300->hw.vps.cmd[1 + i + 4 * (dest & 0xff)] = (vsf->body.d[i]);
-		bump_vpu_count(r300->hw.vps.cmd, vsf->length + 4 * (dest & 0xff));
-		break;
-	default:
-		fprintf(stderr, "%s:%s don't know how to handle dest %04x\n", __FILE__, __FUNCTION__, dest);
-		_mesa_exit(-1);
-	}
+	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_INST_0, high_rr);
+	r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_IP_0, high_rr);
+
+	if (InputsRead)
+		WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
 }
 
 #define MIN3(a, b, c)	((a) < (b) ? MIN2(a, c) : MIN2(b, c))
 
-
-static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
+void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
 			GLuint output_count, GLuint temp_count)
 {
     int vtx_mem_size;
@@ -1998,7 +1649,7 @@ static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
     pvs_num_cntrls = MIN2(6, vtx_mem_size/temp_count);
 
     R300_STATECHANGE(rmesa, vap_cntl);
-    if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+    if (rmesa->options.hw_tcl_enabled) {
 	rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] =
 	    (pvs_num_slots << R300_PVS_NUM_SLOTS_SHIFT) |
 	    (pvs_num_cntrls << R300_PVS_NUM_CNTLRS_SHIFT) |
@@ -2028,114 +1679,6 @@ static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
 
 }
 
-static void r300SetupDefaultVertexProgram(r300ContextPtr rmesa)
-{
-	struct r300_vertex_shader_state *prog = &(rmesa->state.vertex_shader);
-	GLuint o_reg = 0;
-	GLuint i_reg = 0;
-	int i;
-	int inst_count = 0;
-	int param_count = 0;
-	int program_end = 0;
-
-	for (i = VERT_ATTRIB_POS; i < VERT_ATTRIB_MAX; i++) {
-		if (rmesa->state.sw_tcl_inputs[i] != -1) {
-			prog->program.body.i[program_end + 0] = PVS_OP_DST_OPERAND(VE_MULTIPLY, GL_FALSE, GL_FALSE, o_reg++, VSF_FLAG_ALL, PVS_DST_REG_OUT);
-			prog->program.body.i[program_end + 1] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
-			prog->program.body.i[program_end + 2] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
-			prog->program.body.i[program_end + 3] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
-			program_end += 4;
-			i_reg++;
-		}
-	}
-
-	prog->program.length = program_end;
-
-	r300SetupVertexProgramFragment(rmesa, R300_PVS_CODE_START,
-				       &(prog->program));
-	inst_count = (prog->program.length / 4) - 1;
-
-	r300VapCntl(rmesa, i_reg, o_reg, 0);
-
-	R300_STATECHANGE(rmesa, pvs);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] =
-	    (0 << R300_PVS_FIRST_INST_SHIFT) |
-	    (inst_count << R300_PVS_XYZW_VALID_INST_SHIFT) |
-	    (inst_count << R300_PVS_LAST_INST_SHIFT);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] =
-	    (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
-	    (param_count << R300_PVS_MAX_CONST_ADDR_SHIFT);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] =
-	    (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
-}
-
-static int bit_count (int x)
-{
-    x = ((x & 0xaaaaaaaaU) >> 1) + (x & 0x55555555U);
-    x = ((x & 0xccccccccU) >> 2) + (x & 0x33333333U);
-    x = (x >> 16) + (x & 0xffff);
-    x = ((x & 0xf0f0) >> 4) + (x & 0x0f0f);
-    return (x >> 8) + (x & 0x00ff);
-}
-
-static void r300SetupRealVertexProgram(r300ContextPtr rmesa)
-{
-	GLcontext *ctx = rmesa->radeon.glCtx;
-	struct r300_vertex_program *prog = (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
-	int inst_count = 0;
-	int param_count = 0;
-
-	/* FIXME: r300SetupVertexProgramFragment */
-	R300_STATECHANGE(rmesa, vpp);
-	param_count =
-	    r300VertexProgUpdateParams(ctx,
-				       (struct r300_vertex_program_cont *)
-				       ctx->VertexProgram._Current,
-				       (float *)&rmesa->hw.vpp.
-				       cmd[R300_VPP_PARAM_0]);
-	bump_vpu_count(rmesa->hw.vpp.cmd, param_count);
-	param_count /= 4;
-
-	r300SetupVertexProgramFragment(rmesa, R300_PVS_CODE_START, &(prog->program));
-	inst_count = (prog->program.length / 4) - 1;
-
-	r300VapCntl(rmesa, bit_count(prog->key.InputsRead),
-		    bit_count(prog->key.OutputsWritten), prog->num_temporaries);
-
-	R300_STATECHANGE(rmesa, pvs);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] =
-	  (0 << R300_PVS_FIRST_INST_SHIFT) |
-	  (inst_count << R300_PVS_XYZW_VALID_INST_SHIFT) |
-	  (inst_count << R300_PVS_LAST_INST_SHIFT);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] =
-	  (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
-	  (param_count << R300_PVS_MAX_CONST_ADDR_SHIFT);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] =
-	  (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
-}
-
-static void r300SetupVertexProgram(r300ContextPtr rmesa)
-{
-	GLcontext *ctx = rmesa->radeon.glCtx;
-
-	/* Reset state, in case we don't use something */
-	((drm_r300_cmd_header_t *) rmesa->hw.vpp.cmd)->vpu.count = 0;
-	((drm_r300_cmd_header_t *) rmesa->hw.vpi.cmd)->vpu.count = 0;
-	((drm_r300_cmd_header_t *) rmesa->hw.vps.cmd)->vpu.count = 0;
-
-	/* Not sure why this doesnt work...
-	   0x400 area might have something to do with pixel shaders as it appears right after pfs programming.
-	   0x406 is set to { 0.0, 0.0, 1.0, 0.0 } most of the time but should change with smooth points and in other rare cases. */
-	//setup_vertex_shader_fragment(rmesa, 0x406, &unk4);
-	if (hw_tcl_on && ((struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx))->translated) {
-		r300SetupRealVertexProgram(rmesa);
-	} else {
-		/* FIXME: This needs to be replaced by vertex shader generation code. */
-		r300SetupDefaultVertexProgram(rmesa);
-	}
-
-}
-
 /**
  * Enable/Disable states.
  *
@@ -2143,20 +1686,13 @@ static void r300SetupVertexProgram(r300ContextPtr rmesa)
  */
 static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
 {
-	if (RADEON_DEBUG & DEBUG_STATE)
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	if (RADEON_DEBUG & RADEON_STATE)
 		fprintf(stderr, "%s( %s = %s )\n", __FUNCTION__,
 			_mesa_lookup_enum_by_nr(cap),
 			state ? "GL_TRUE" : "GL_FALSE");
 
 	switch (cap) {
-	case GL_TEXTURE_1D:
-	case GL_TEXTURE_2D:
-	case GL_TEXTURE_3D:
-		/* empty */
-		break;
-	case GL_FOG:
-		r300SetFogState(ctx, state);
-		break;
 	case GL_ALPHA_TEST:
 		r300SetAlphaState(ctx);
 		break;
@@ -2174,22 +1710,46 @@ static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
 	case GL_CLIP_PLANE5:
 		r300SetClipPlaneState(ctx, cap, state);
 		break;
+	case GL_CULL_FACE:
+		r300UpdateCulling(ctx);
+		break;
 	case GL_DEPTH_TEST:
 		r300SetDepthState(ctx);
 		break;
-	case GL_STENCIL_TEST:
-		r300SetStencilState(ctx, state);
+	case GL_LINE_SMOOTH:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_LINE_SMOOTH, ctx->Line.SmoothFlag);
 		break;
-	case GL_CULL_FACE:
-		r300UpdateCulling(ctx);
+	case GL_LINE_STIPPLE:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_LINE_STIPPLE, ctx->Line.StippleFlag);
+		break;
+	case GL_POINT_SMOOTH:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_POINT_SMOOTH, ctx->Point.SmoothFlag);
+		break;
+	case GL_POLYGON_SMOOTH:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_POLYGON_SMOOTH, ctx->Polygon.SmoothFlag);
+		break;
+	case GL_POLYGON_STIPPLE:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_POLYGON_STIPPLE, ctx->Polygon.StippleFlag);
 		break;
 	case GL_POLYGON_OFFSET_POINT:
 	case GL_POLYGON_OFFSET_LINE:
 	case GL_POLYGON_OFFSET_FILL:
 		r300SetPolygonOffsetState(ctx, state);
 		break;
+	case GL_SCISSOR_TEST:
+		radeon_firevertices(&rmesa->radeon);
+		rmesa->radeon.state.scissor.enabled = state;
+		radeonUpdateScissor( ctx );
+		break;
+	case GL_STENCIL_TEST:
+		r300SetStencilState(ctx, state);
+		break;
 	default:
-		radeonEnable(ctx, cap, state);
 		break;
 	}
 }
@@ -2200,15 +1760,14 @@ static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
 static void r300ResetHwState(r300ContextPtr r300)
 {
 	GLcontext *ctx = r300->radeon.glCtx;
-	int has_tcl = 1;
+	int has_tcl;
 
-	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
-		has_tcl = 0;
+	has_tcl = r300->options.hw_tcl_enabled;
 
-	if (RADEON_DEBUG & DEBUG_STATE)
+	if (RADEON_DEBUG & RADEON_STATE)
 		fprintf(stderr, "%s\n", __FUNCTION__);
 
-	r300UpdateWindow(ctx);
+	radeon_firevertices(&r300->radeon);
 
 	r300ColorMask(ctx,
 		      ctx->Color.ColorMask[RCOMP],
@@ -2220,7 +1779,7 @@ static void r300ResetHwState(r300ContextPtr r300)
 	r300DepthFunc(ctx, ctx->Depth.Func);
 
 	/* stencil */
-	r300Enable(ctx, GL_STENCIL_TEST, ctx->Stencil.Enabled);
+	r300Enable(ctx, GL_STENCIL_TEST, ctx->Stencil._Enabled);
 	r300StencilMaskSeparate(ctx, 0, ctx->Stencil.WriteMask[0]);
 	r300StencilFuncSeparate(ctx, 0, ctx->Stencil.Function[0],
 				ctx->Stencil.Ref[0], ctx->Stencil.ValueMask[0]);
@@ -2230,8 +1789,6 @@ static void r300ResetHwState(r300ContextPtr r300)
 
 	r300UpdateCulling(ctx);
 
-	r300UpdateTextureState(ctx);
-
 	r300SetBlendState(ctx);
 	r300SetLogicOpState(ctx);
 
@@ -2309,11 +1866,9 @@ static void r300ResetHwState(r300ContextPtr r300)
 		break;
 	}
 
-	/* XXX: set to 0 when fog is disabled? */
-	r300->hw.gb_misc.cmd[R300_GB_MISC_SELECT] = R300_GB_FOG_SELECT_1_1_W;
-
 	/* XXX: Enable anti-aliasing? */
-	r300->hw.gb_misc.cmd[R300_GB_MISC_AA_CONFIG] = GB_AA_CONFIG_AA_DISABLE;
+	r300->hw.gb_misc2.cmd[R300_GB_MISC2_AA_CONFIG] = GB_AA_CONFIG_AA_DISABLE;
+	r300->hw.gb_misc2.cmd[R300_GB_MISC2_SELECT] = 0;
 
 	r300->hw.ga_point_s0.cmd[1] = r300PackFloat32(0.0);
 	r300->hw.ga_point_s0.cmd[2] = r300PackFloat32(0.0);
@@ -2362,36 +1917,16 @@ static void r300ResetHwState(r300ContextPtr r300)
 	  R500_C0_SEL_B | R500_C1_SEL_G | R500_C2_SEL_R | R500_C3_SEL_A;
 	r300->hw.us_out_fmt.cmd[4] = R500_OUT_FMT_UNUSED |
 	  R500_C0_SEL_B | R500_C1_SEL_G | R500_C2_SEL_R | R500_C3_SEL_A;
-	r300->hw.us_out_fmt.cmd[5] = R300_W_FMT_W24;
+	r300->hw.us_out_fmt.cmd[5] = R300_W_FMT_W0 | R300_W_SRC_US;
 
-	r300Enable(ctx, GL_FOG, ctx->Fog.Enabled);
-	r300Fogfv(ctx, GL_FOG_MODE, NULL);
-	r300Fogfv(ctx, GL_FOG_DENSITY, &ctx->Fog.Density);
-	r300Fogfv(ctx, GL_FOG_START, &ctx->Fog.Start);
-	r300Fogfv(ctx, GL_FOG_END, &ctx->Fog.End);
-	r300Fogfv(ctx, GL_FOG_COLOR, ctx->Fog.Color);
-	r300Fogfv(ctx, GL_FOG_COORDINATE_SOURCE_EXT, NULL);
-
-	r300->hw.fg_depth_src.cmd[1] = 0;
+	/* disable fog unit */
+	r300->hw.fogs.cmd[R300_FOGS_STATE] = 0;
+	r300->hw.fg_depth_src.cmd[1] = R300_FG_DEPTH_SRC_SCAN;
 
 	r300->hw.rb3d_cctl.cmd[1] = 0;
 
 	r300BlendColor(ctx, ctx->Color.BlendColor);
 
-	/* Again, r300ClearBuffer uses this */
-	r300->hw.cb.cmd[R300_CB_OFFSET] =
-	    r300->radeon.state.color.drawOffset +
-	    r300->radeon.radeonScreen->fbLocation;
-	r300->hw.cb.cmd[R300_CB_PITCH] = r300->radeon.state.color.drawPitch;
-
-	if (r300->radeon.radeonScreen->cpp == 4)
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
-	else
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
-
-	if (r300->radeon.sarea->tiling_enabled)
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
-
 	r300->hw.rb3d_dither_ctl.cmd[1] = 0;
 	r300->hw.rb3d_dither_ctl.cmd[2] = 0;
 	r300->hw.rb3d_dither_ctl.cmd[3] = 0;
@@ -2404,44 +1939,18 @@ static void r300ResetHwState(r300ContextPtr r300)
 
 	r300->hw.rb3d_aaresolve_ctl.cmd[1] = 0;
 
-	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[1] = 0x00000000;
-	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[2] = 0xffffffff;
-
-	r300->hw.zb.cmd[R300_ZB_OFFSET] =
-	    r300->radeon.radeonScreen->depthOffset +
-	    r300->radeon.radeonScreen->fbLocation;
-	r300->hw.zb.cmd[R300_ZB_PITCH] = r300->radeon.radeonScreen->depthPitch;
-
-	if (r300->radeon.sarea->tiling_enabled) {
-		/* XXX: Turn off when clearing buffers ? */
-		r300->hw.zb.cmd[R300_ZB_PITCH] |= R300_DEPTHMACROTILE_ENABLE;
-
-		if (ctx->Visual.depthBits == 24)
-			r300->hw.zb.cmd[R300_ZB_PITCH] |=
-			    R300_DEPTHMICROTILE_TILED;
-	}
+    r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[1] = 0x00000000;
+    r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[2] = 0xffffffff;
 
 	r300->hw.zb_depthclearvalue.cmd[1] = 0;
 
-	switch (ctx->Visual.depthBits) {
-	case 16:
-		r300->hw.zstencil_format.cmd[1] = R300_DEPTHFORMAT_16BIT_INT_Z;
-		break;
-	case 24:
-		r300->hw.zstencil_format.cmd[1] = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
-		break;
-	default:
-		fprintf(stderr, "Error: Unsupported depth %d... exiting\n", ctx->Visual.depthBits);
-		_mesa_exit(-1);
-	}
-
 	r300->hw.zstencil_format.cmd[2] = R300_ZTOP_DISABLE;
 	r300->hw.zstencil_format.cmd[3] = 0x00000003;
 	r300->hw.zstencil_format.cmd[4] = 0x00000000;
 	r300SetEarlyZState(ctx);
 
-	r300->hw.unk4F30.cmd[1] = 0;
-	r300->hw.unk4F30.cmd[2] = 0;
+	r300->hw.zb_zmask.cmd[1] = 0;
+	r300->hw.zb_zmask.cmd[2] = 0;
 
 	r300->hw.zb_hiz_offset.cmd[1] = 0;
 
@@ -2455,135 +1964,151 @@ static void r300ResetHwState(r300ContextPtr r300)
 		r300->hw.vps.cmd[R300_VPS_ZERO_3] = 0;
 	}
 
-	r300->hw.all_dirty = GL_TRUE;
+	r300->radeon.hw.all_dirty = GL_TRUE;
 }
 
 void r300UpdateShaders(r300ContextPtr rmesa)
 {
-	GLcontext *ctx;
-	struct r300_vertex_program *vp;
-	int i;
+	GLcontext *ctx = rmesa->radeon.glCtx;
 
-	ctx = rmesa->radeon.glCtx;
+	/* should only happen once, just after context is created */
+	/* TODO: shouldn't we fall back to sw here? */
+	if (!ctx->FragmentProgram._Current) {
+		_mesa_fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
+		return;
+	}
 
-	if (rmesa->NewGLState && hw_tcl_on) {
-		rmesa->NewGLState = 0;
+	{
+		struct r300_fragment_program *fp;
 
-		for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
-			rmesa->temp_attrib[i] =
-			    TNL_CONTEXT(ctx)->vb.AttribPtr[i];
-			TNL_CONTEXT(ctx)->vb.AttribPtr[i] =
-			    &rmesa->dummy_attrib[i];
-		}
+		fp = r300SelectAndTranslateFragmentShader(ctx);
+
+		r300SwitchFallback(ctx, R300_FALLBACK_FRAGMENT_PROGRAM, fp->error);
+	}
 
-		_tnl_UpdateFixedFunctionProgram(ctx);
+	if (rmesa->options.hw_tcl_enabled) {
+		struct r300_vertex_program *vp;
 
-		for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
-			TNL_CONTEXT(ctx)->vb.AttribPtr[i] =
-			    rmesa->temp_attrib[i];
-		}
+		if (rmesa->radeon.NewGLState) {
+			int i;
+			for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
+				rmesa->temp_attrib[i] =
+				    TNL_CONTEXT(ctx)->vb.AttribPtr[i];
+				TNL_CONTEXT(ctx)->vb.AttribPtr[i] =
+				    &rmesa->dummy_attrib[i];
+			}
 
-		r300SelectVertexShader(rmesa);
-		vp = (struct r300_vertex_program *)
-		    CURRENT_VERTEX_SHADER(ctx);
-		/*if (vp->translated == GL_FALSE)
-		   r300TranslateVertexShader(vp); */
-		if (vp->translated == GL_FALSE) {
-			fprintf(stderr, "Failing back to sw-tcl\n");
-			hw_tcl_on = future_hw_tcl_on = 0;
-			r300ResetHwState(rmesa);
-
-			r300UpdateStateParameters(ctx, _NEW_PROGRAM);
-			return;
+			_tnl_UpdateFixedFunctionProgram(ctx);
+
+			for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
+				TNL_CONTEXT(ctx)->vb.AttribPtr[i] =
+				    rmesa->temp_attrib[i];
+			}
 		}
+
+		vp = r300SelectAndTranslateVertexShader(ctx);
+
+		r300SwitchFallback(ctx, R300_FALLBACK_VERTEX_PROGRAM, vp->error);
 	}
-	r300UpdateStateParameters(ctx, _NEW_PROGRAM);
+
+	r300UpdateStateParameters(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+	rmesa->radeon.NewGLState = 0;
 }
 
-static const GLfloat *get_fragmentprogram_constant(GLcontext *ctx,
-	struct gl_program *program, struct prog_src_register srcreg)
+static const GLfloat *get_fragmentprogram_constant(GLcontext *ctx, GLuint index, GLfloat * buffer)
 {
 	static const GLfloat dummy[4] = { 0, 0, 0, 0 };
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct rc_constant * rcc = &rmesa->selected_fp->code.constants.Constants[index];
+
+	switch(rcc->Type) {
+	case RC_CONSTANT_EXTERNAL:
+		return ctx->FragmentProgram._Current->Base.Parameters->ParameterValues[rcc->u.External];
+	case RC_CONSTANT_IMMEDIATE:
+		return rcc->u.Immediate;
+	case RC_CONSTANT_STATE:
+		switch(rcc->u.State[0]) {
+		case RC_STATE_SHADOW_AMBIENT: {
+			const int unit = (int) rcc->u.State[1];
+			const struct gl_texture_object *texObj = ctx->Texture.Unit[unit]._Current;
+			if (texObj) {
+				buffer[0] =
+				buffer[1] =
+				buffer[2] =
+				buffer[3] = texObj->CompareFailValue;
+			}
+			return buffer;
+		}
 
-	switch(srcreg.File) {
-	case PROGRAM_LOCAL_PARAM:
-		return program->LocalParams[srcreg.Index];
-	case PROGRAM_ENV_PARAM:
-		return ctx->FragmentProgram.Parameters[srcreg.Index];
-	case PROGRAM_STATE_VAR:
-	case PROGRAM_NAMED_PARAM:
-	case PROGRAM_CONSTANT:
-		return program->Parameters->ParameterValues[srcreg.Index];
-	default:
-		_mesa_problem(ctx, "get_fragmentprogram_constant: Unknown\n");
-		return dummy;
+		case RC_STATE_R300_WINDOW_DIMENSION: {
+			__DRIdrawablePrivate * drawable = radeon_get_drawable(&rmesa->radeon);
+			buffer[0] = drawable->w * 0.5f;	/* width*0.5 */
+			buffer[1] = drawable->h * 0.5f;	/* height*0.5 */
+			buffer[2] = 0.5F;	/* for moving range [-1 1] -> [0 1] */
+			buffer[3] = 1.0F;	/* not used */
+			return buffer;
+		}
+
+		case RC_STATE_R300_TEXRECT_FACTOR: {
+			struct gl_texture_object *t =
+				ctx->Texture.Unit[rcc->u.State[1]].CurrentTex[TEXTURE_RECT_INDEX];
+
+			if (t && t->Image[0][t->BaseLevel]) {
+				struct gl_texture_image *image =
+					t->Image[0][t->BaseLevel];
+				buffer[0] = 1.0 / image->Width2;
+				buffer[1] = 1.0 / image->Height2;
+			} else {
+				buffer[0] = 1.0;
+				buffer[1] = 1.0;
+			}
+			buffer[2] = 1.0;
+			buffer[3] = 1.0;
+			return buffer;
+		}
+		}
 	}
+
+	return dummy;
 }
 
 
-static void r300SetupPixelShader(r300ContextPtr rmesa)
+static void r300SetupPixelShader(GLcontext *ctx)
 {
-	GLcontext *ctx = rmesa->radeon.glCtx;
-	struct r300_fragment_program *fp = (struct r300_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct r300_fragment_program *fp = rmesa->selected_fp;
 	struct r300_fragment_program_code *code;
-	int i, k;
-
-	if (!fp)		/* should only happenen once, just after context is created */
-		return;
-
-	r300TranslateFragmentShader(rmesa, fp);
-	if (!fp->translated) {
-		fprintf(stderr, "%s: No valid fragment shader, exiting\n",
-			__FUNCTION__);
-		return;
-	}
-	code = &fp->code;
+	int i;
 
-	r300SetupTextures(ctx);
+	code = &fp->code.code.r300;
 
 	R300_STATECHANGE(rmesa, fpi[0]);
 	R300_STATECHANGE(rmesa, fpi[1]);
 	R300_STATECHANGE(rmesa, fpi[2]);
 	R300_STATECHANGE(rmesa, fpi[3]);
-	rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, code->alu.length);
-	rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, code->alu.length);
-	rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, code->alu.length);
-	rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, code->alu.length);
+	rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_RGB_INST_0, code->alu.length);
+	rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_RGB_ADDR_0, code->alu.length);
+	rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_ALPHA_INST_0, code->alu.length);
+	rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_ALPHA_ADDR_0, code->alu.length);
 	for (i = 0; i < code->alu.length; i++) {
-		rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst0;
-		rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst1;
-		rmesa->hw.fpi[2].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst2;
-		rmesa->hw.fpi[3].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst3;
+		rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].rgb_inst;
+		rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].rgb_addr;
+		rmesa->hw.fpi[2].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].alpha_inst;
+		rmesa->hw.fpi[3].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].alpha_addr;
 	}
 
 	R300_STATECHANGE(rmesa, fp);
-	rmesa->hw.fp.cmd[R300_FP_CNTL0] = code->cur_node | (code->first_node_has_tex << 3);
-	rmesa->hw.fp.cmd[R300_FP_CNTL1] = code->max_temp_idx;
-	rmesa->hw.fp.cmd[R300_FP_CNTL2] =
-	  (0 << R300_PFS_CNTL_ALU_OFFSET_SHIFT) |
-	  ((code->alu.length-1) << R300_PFS_CNTL_ALU_END_SHIFT) |
-	  (0 << R300_PFS_CNTL_TEX_OFFSET_SHIFT) |
-	  ((code->tex.length ? code->tex.length-1 : 0) << R300_PFS_CNTL_TEX_END_SHIFT);
-	/* I just want to say, the way these nodes are stored.. weird.. */
-	for (i = 0, k = (4 - (code->cur_node + 1)); i < 4; i++, k++) {
-		if (i < (code->cur_node + 1)) {
-			rmesa->hw.fp.cmd[R300_FP_NODE0 + k] =
-			  (code->node[i].alu_offset << R300_ALU_START_SHIFT) |
-			  (code->node[i].alu_end << R300_ALU_SIZE_SHIFT) |
-			  (code->node[i].tex_offset << R300_TEX_START_SHIFT) |
-			  (code->node[i].tex_end << R300_TEX_SIZE_SHIFT) |
-			  code->node[i].flags;
-		} else {
-			rmesa->hw.fp.cmd[R300_FP_NODE0 + (3 - i)] = 0;
-		}
-	}
+	rmesa->hw.fp.cmd[R300_FP_CNTL0] = code->config;
+	rmesa->hw.fp.cmd[R300_FP_CNTL1] = code->pixsize;
+	rmesa->hw.fp.cmd[R300_FP_CNTL2] = code->code_offset;
+	for (i = 0; i < 4; i++)
+		rmesa->hw.fp.cmd[R300_FP_NODE0 + i] = code->code_addr[i];
 
 	R300_STATECHANGE(rmesa, fpp);
-	rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, code->const_nr * 4);
-	for (i = 0; i < code->const_nr; i++) {
-		const GLfloat *constant = get_fragmentprogram_constant(ctx,
-			&fp->mesa_program.Base, code->constant[i]);
+	rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_PFS_PARAM_0_X, fp->code.constants.Count * 4);
+	for (i = 0; i < fp->code.constants.Count; i++) {
+		GLfloat buffer[4];
+		const GLfloat *constant = get_fragmentprogram_constant(ctx, i, buffer);
 		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(constant[0]);
 		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(constant[1]);
 		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(constant[2]);
@@ -2605,51 +2130,29 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
 	if(_nc>_p->r500fp.count)_p->r500fp.count=_nc;\
 } while(0)
 
-static void r500SetupPixelShader(r300ContextPtr rmesa)
+static void r500SetupPixelShader(GLcontext *ctx)
 {
-	GLcontext *ctx = rmesa->radeon.glCtx;
-	struct r500_fragment_program *fp = (struct r500_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct r300_fragment_program *fp = rmesa->selected_fp;
 	int i;
 	struct r500_fragment_program_code *code;
 
-	if (!fp)		/* should only happenen once, just after context is created */
-		return;
-
 	((drm_r300_cmd_header_t *) rmesa->hw.r500fp.cmd)->r500fp.count = 0;
 	((drm_r300_cmd_header_t *) rmesa->hw.r500fp_const.cmd)->r500fp.count = 0;
 
-	r500TranslateFragmentShader(rmesa, fp);
-	if (!fp->translated) {
-		fprintf(stderr, "%s: No valid fragment shader, exiting\n",
-			__FUNCTION__);
-		return;
-	}
-	code = &fp->code;
-
-	if (fp->mesa_program.FogOption != GL_NONE) {
-		/* Enable HW fog. Try not to squish GL context.
-		 * (Anybody sane remembered to set glFog() opts first!) */
-		r300SetFogState(ctx, GL_TRUE);
-		ctx->Fog.Mode = fp->mesa_program.FogOption;
-		r300Fogfv(ctx, GL_FOG_MODE, NULL);
-	} else
-		/* Make sure HW is matching GL context. */
-		r300SetFogState(ctx, ctx->Fog.Enabled);
-
-	r300SetupTextures(ctx);
+	code = &fp->code.code.r500;
 
 	R300_STATECHANGE(rmesa, fp);
 	rmesa->hw.fp.cmd[R500_FP_PIXSIZE] = code->max_temp_idx;
 
 	rmesa->hw.fp.cmd[R500_FP_CODE_ADDR] =
-	    R500_US_CODE_START_ADDR(code->inst_offset) |
+	    R500_US_CODE_START_ADDR(0) |
 	    R500_US_CODE_END_ADDR(code->inst_end);
 	rmesa->hw.fp.cmd[R500_FP_CODE_RANGE] =
-	    R500_US_CODE_RANGE_ADDR(code->inst_offset) |
+	    R500_US_CODE_RANGE_ADDR(0) |
 	    R500_US_CODE_RANGE_SIZE(code->inst_end);
 	rmesa->hw.fp.cmd[R500_FP_CODE_OFFSET] =
-	    R500_US_CODE_OFFSET_ADDR(0); /* FIXME when we add flow control */
+	    R500_US_CODE_OFFSET_ADDR(0);
 
 	R300_STATECHANGE(rmesa, r500fp);
 	/* Emit our shader... */
@@ -2665,16 +2168,73 @@ static void r500SetupPixelShader(r300ContextPtr rmesa)
 	bump_r500fp_count(rmesa->hw.r500fp.cmd, (code->inst_end + 1) * 6);
 
 	R300_STATECHANGE(rmesa, r500fp_const);
-	for (i = 0; i < code->const_nr; i++) {
-		const GLfloat *constant = get_fragmentprogram_constant(ctx,
-			&fp->mesa_program.Base, code->constant[i]);
+	for (i = 0; i < fp->code.constants.Count; i++) {
+		GLfloat buffer[4];
+		const GLfloat *constant = get_fragmentprogram_constant(ctx, i, buffer);
 		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(constant[0]);
 		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(constant[1]);
 		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(constant[2]);
 		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(constant[3]);
 	}
-	bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, code->const_nr * 4);
+	bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, fp->code.constants.Count * 4);
+}
+
+void r300SetupVAP(GLcontext *ctx, GLuint InputsRead, GLuint OutputsWritten)
+{	/* Fill the vir[0]/vir[1] entries from rmesa->vbuf.attribs, then set the vic/vof control words. */
+	r300ContextPtr rmesa = R300_CONTEXT( ctx );
+	struct vertex_attribute *attrs = rmesa->vbuf.attribs;
+	int i, j, reg_count;
+	uint32_t *vir0 = &rmesa->hw.vir[0].cmd[1];
+	uint32_t *vir1 = &rmesa->hw.vir[1].cmd[1];
+	/* vir0/vir1 start at cmd[1] of each vir atom (cmd[0] is updated further down); zero them all first. */
+	for (i = 0; i < R300_VIR_CMDSIZE-1; ++i)
+		vir0[i] = vir1[i] = 0;
+	/* Two attributes share each vir0/vir1 entry; j indexes the entry currently being filled. */
+	for (i = 0, j = 0; i < rmesa->vbuf.num_attribs; ++i) {
+		int tmp;
+		/* tmp: data type | destination location, plus the optional signed/normalize flags. */
+		tmp = attrs[i].data_type | (attrs[i].dst_loc << R300_DST_VEC_LOC_SHIFT);
+		if (attrs[i]._signed)
+			tmp |= R300_SIGNED;
+		if (attrs[i].normalize)
+			tmp |= R300_NORMALIZE;
+		/* Even i starts entry j (the *_0 fields); odd i ORs into its *_1/SWIZZLE1 fields and advances j. */
+		if (i % 2 == 0) {
+			vir0[j] = tmp << R300_DATA_TYPE_0_SHIFT;
+			vir1[j] = attrs[i].swizzle | (attrs[i].write_mask << R300_WRITE_ENA_SHIFT);
+		} else {
+			vir0[j] |= tmp << R300_DATA_TYPE_1_SHIFT;
+			vir1[j] |= (attrs[i].swizzle | (attrs[i].write_mask << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE1_SHIFT;
+			++j;
+		}
+	}
 
+	reg_count = (rmesa->vbuf.num_attribs + 1) >> 1;	/* entries used, rounded up */
+	if (rmesa->vbuf.num_attribs % 2 != 0) {	/* odd count: the last attribute sits in the *_0 field */
+		vir0[reg_count-1] |= R300_LAST_VEC << R300_DATA_TYPE_0_SHIFT;
+	} else {
+		vir0[reg_count-1] |= R300_LAST_VEC << R300_DATA_TYPE_1_SHIFT;
+	}
+	/* NOTE(review): if num_attribs were 0, reg_count-1 above would be -1 (i.e. cmd[0]); presumably callers always supply at least one attribute -- confirm. */
+	R300_STATECHANGE(rmesa, vir[0]);
+	R300_STATECHANGE(rmesa, vir[1]);
+	R300_STATECHANGE(rmesa, vof);
+	R300_STATECHANGE(rmesa, vic);
+	/* Store the entry count in cmd[0]: bits 16..29 when kernel_mm is set, the packet0.count header field otherwise. */
+	if (rmesa->radeon.radeonScreen->kernel_mm) {
+		rmesa->hw.vir[0].cmd[0] &= 0xC000FFFF;
+		rmesa->hw.vir[1].cmd[0] &= 0xC000FFFF;
+		rmesa->hw.vir[0].cmd[0] |= (reg_count & 0x3FFF) << 16;
+		rmesa->hw.vir[1].cmd[0] |= (reg_count & 0x3FFF) << 16;
+	} else {
+		((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count = reg_count;
+		((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count = reg_count;
+	}
+	/* Derive the vic control words from InputsRead and the vof control words from OutputsWritten. */
+	rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
+	rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
+	rmesa->hw.vof.cmd[R300_VOF_CNTL_0] = r300VAPOutputCntl0(ctx, OutputsWritten);
+	rmesa->hw.vof.cmd[R300_VOF_CNTL_1] = r300VAPOutputCntl1(ctx, OutputsWritten);
 }
 
 void r300UpdateShaderStates(r300ContextPtr rmesa)
@@ -2682,30 +2242,21 @@ void r300UpdateShaderStates(r300ContextPtr rmesa)
 	GLcontext *ctx;
 	ctx = rmesa->radeon.glCtx;
 
-	r300UpdateTextureState(ctx);
+	/* should only happen once, just after context is created */
+	if (!ctx->FragmentProgram._Current)
+		return;
+
 	r300SetEarlyZState(ctx);
 
-	GLuint fgdepthsrc = R300_FG_DEPTH_SRC_SCAN;
-	if (current_fragment_program_writes_depth(ctx))
-		fgdepthsrc = R300_FG_DEPTH_SRC_SHADER;
-	if (fgdepthsrc != rmesa->hw.fg_depth_src.cmd[1]) {
-		R300_STATECHANGE(rmesa, fg_depth_src);
-		rmesa->hw.fg_depth_src.cmd[1] = fgdepthsrc;
-	}
+	r300SetupTextures(ctx);
 
-	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-		r500SetupPixelShader(rmesa);
-	else
-		r300SetupPixelShader(rmesa);
+	rmesa->vtbl.SetupPixelShader(ctx);
 
-	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-		r500SetupRSUnit(ctx);
-	else
-		r300SetupRSUnit(ctx);
+	rmesa->vtbl.SetupRSUnit(ctx);
 
-	if ((rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+	if (rmesa->options.hw_tcl_enabled) {
 		r300SetupVertexProgram(rmesa);
-
+	}
 }
 
 /**
@@ -2719,15 +2270,25 @@ static void r300InvalidateState(GLcontext * ctx, GLuint new_state)
 	_swsetup_InvalidateState(ctx, new_state);
 	_vbo_InvalidateState(ctx, new_state);
 	_tnl_InvalidateState(ctx, new_state);
-	_ae_invalidate_state(ctx, new_state);
 
-	if (new_state & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
-		r300UpdateDrawBuffer(ctx);
+	if (new_state & _NEW_BUFFERS) {
+		_mesa_update_framebuffer(ctx);
+		/* this updates the DrawBuffer's Width/Height if it's an FBO */
+		_mesa_update_draw_buffer_bounds(ctx);
+
+		R300_STATECHANGE(r300, cb);
+		R300_STATECHANGE(r300, zb);
 	}
 
-	r300UpdateStateParameters(ctx, new_state);
+	if (new_state & (_NEW_LIGHT)) {
+		R300_STATECHANGE(r300, shade2);
+		if (ctx->Light.ProvokingVertex == GL_LAST_VERTEX_CONVENTION)
+			r300->hw.shade2.cmd[1] |= R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST;
+		else
+			r300->hw.shade2.cmd[1] &= ~R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST;
+	}
 
-	r300->NewGLState |= new_state;
+	r300->radeon.NewGLState |= new_state;
 }
 
 /**
@@ -2737,58 +2298,12 @@ static void r300InvalidateState(GLcontext * ctx, GLuint new_state)
  */
 void r300InitState(r300ContextPtr r300)
 {
-	GLcontext *ctx = r300->radeon.glCtx;
-	GLuint depth_fmt;
-
-	radeonInitState(&r300->radeon);
-
-	switch (ctx->Visual.depthBits) {
-	case 16:
-		r300->state.depth.scale = 1.0 / (GLfloat) 0xffff;
-		depth_fmt = R300_DEPTHFORMAT_16BIT_INT_Z;
-		break;
-	case 24:
-		r300->state.depth.scale = 1.0 / (GLfloat) 0xffffff;
-		depth_fmt = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
-		break;
-	default:
-		fprintf(stderr, "Error: Unsupported depth %d... exiting\n",
-			ctx->Visual.depthBits);
-		_mesa_exit(-1);
-	}
-
-	/* Only have hw stencil when depth buffer is 24 bits deep */
-	r300->state.stencil.hw_stencil = (ctx->Visual.stencilBits > 0 &&
-					  ctx->Visual.depthBits == 24);
-
-	memset(&(r300->state.texture), 0, sizeof(r300->state.texture));
-
 	r300ResetHwState(r300);
 }
 
 static void r300RenderMode(GLcontext * ctx, GLenum mode)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	(void)rmesa;
-	(void)mode;
-}
-
-void r300UpdateClipPlanes( GLcontext *ctx )
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	GLuint p;
-
-	for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
-		if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
-			GLint *ip = (GLint *)ctx->Transform._ClipUserPlane[p];
-
-			R300_STATECHANGE( rmesa, vpucp[p] );
-			rmesa->hw.vpucp[p].cmd[R300_VPUCP_X] = ip[0];
-			rmesa->hw.vpucp[p].cmd[R300_VPUCP_Y] = ip[1];
-			rmesa->hw.vpucp[p].cmd[R300_VPUCP_Z] = ip[2];
-			rmesa->hw.vpucp[p].cmd[R300_VPUCP_W] = ip[3];
-		}
-	}
+	r300SwitchFallback(ctx, R300_FALLBACK_RENDER_MODE, ctx->RenderMode != GL_RENDER);
 }
 
 /**
@@ -2796,7 +2311,6 @@ void r300UpdateClipPlanes( GLcontext *ctx )
  */
 void r300InitStateFuncs(struct dd_function_table *functions)
 {
-	radeonInitStateFuncs(functions);
 
 	functions->UpdateState = r300InvalidateState;
 	functions->AlphaFunc = r300AlphaFunc;
@@ -2808,7 +2322,6 @@ void r300InitStateFuncs(struct dd_function_table *functions)
 	functions->DepthFunc = r300DepthFunc;
 	functions->DepthMask = r300DepthMask;
 	functions->CullFace = r300CullFace;
-	functions->Fogfv = r300Fogfv;
 	functions->FrontFace = r300FrontFace;
 	functions->ShadeModel = r300ShadeModel;
 	functions->LogicOpcode = r300LogicOpcode;
@@ -2833,4 +2346,21 @@ void r300InitStateFuncs(struct dd_function_table *functions)
 	functions->RenderMode = r300RenderMode;
 
 	functions->ClipPlane = r300ClipPlane;
+	functions->Scissor = radeonScissor;
+
+	functions->DrawBuffer		= radeonDrawBuffer;
+	functions->ReadBuffer		= radeonReadBuffer;
+}
+
+void r300InitShaderFunctions(r300ContextPtr r300)
+{	/* Fill the three shader-setup hooks in r300->vtbl according to the screen's chip family. */
+	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {	/* chip_family >= RV515: use the r500 implementations */
+		r300->vtbl.SetupRSUnit = r500SetupRSUnit;
+		r300->vtbl.SetupPixelShader = r500SetupPixelShader;
+		r300->vtbl.SetupFragmentShaderTextures = r500SetupFragmentShaderTextures;
+	} else {	/* chip_family below RV515: use the r300 implementations */
+		r300->vtbl.SetupRSUnit = r300SetupRSUnit;
+		r300->vtbl.SetupPixelShader = r300SetupPixelShader;
+		r300->vtbl.SetupFragmentShaderTextures = r300SetupFragmentShaderTextures;
+	}
+}
diff --git a/src/mesa/drivers/dri/r300/r300_state.h b/src/mesa/drivers/dri/r300/r300_state.h
index 0589ab7cad..d46bf9f179 100644
--- a/src/mesa/drivers/dri/r300/r300_state.h
+++ b/src/mesa/drivers/dri/r300/r300_state.h
@@ -39,42 +39,24 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define R300_NEWPRIM( rmesa )			\
   do {						\
-    if ( rmesa->dma.flush )			\
-      rmesa->dma.flush( rmesa );		\
+  if ( rmesa->radeon.dma.flush )			\
+    rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	\
   } while (0)
 
 #define R300_STATECHANGE(r300, atom) \
 	do {						\
 	  R300_NEWPRIM(r300);				\
 		r300->hw.atom.dirty = GL_TRUE;		\
-		r300->hw.is_dirty = GL_TRUE;		\
+		r300->radeon.hw.is_dirty = GL_TRUE;		\
 	} while(0)
 
-#define R300_PRINT_STATE(r300, atom) \
-		r300PrintStateAtom(r300, &r300->hw.atom)
-
-/* Fire the buffered vertices no matter what.
-   TODO: This has not been implemented yet
- */
-#define R300_FIREVERTICES( r300 )			\
-do {							\
-    \
-   if ( (r300)->cmdbuf.count_used || (r300)->dma.flush ) {	\
-      r300Flush( (r300)->radeon.glCtx );		\
-   }							\
-    \
-} while (0)
-
-// r300_state.c
-extern int future_hw_tcl_on;
-void _tnl_UpdateFixedFunctionProgram (GLcontext * ctx);
 void r300UpdateViewportOffset (GLcontext * ctx);
 void r300UpdateDrawBuffer (GLcontext * ctx);
-void r300UpdateStateParameters (GLcontext * ctx, GLuint new_state);
 void r300UpdateShaders (r300ContextPtr rmesa);
 void r300UpdateShaderStates (r300ContextPtr rmesa);
 void r300InitState (r300ContextPtr r300);
-void r300UpdateClipPlanes (GLcontext * ctx);
 void r300InitStateFuncs (struct dd_function_table *functions);
+void r300VapCntl(r300ContextPtr rmesa, GLuint input_count, GLuint output_count, GLuint temp_count);
+void r300SetupVAP(GLcontext *ctx, GLuint InputsRead, GLuint OutputsWritten);
 
 #endif				/* __R300_STATE_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_swtcl.c b/src/mesa/drivers/dri/r300/r300_swtcl.c
index b6e7ce1a1a..ee2c71e1a7 100644
--- a/src/mesa/drivers/dri/r300/r300_swtcl.c
+++ b/src/mesa/drivers/dri/r300/r300_swtcl.c
@@ -28,303 +28,264 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /*
  * Authors:
  *   Dave Airlie <airlied@linux.ie>
+ *   Maciej Cencora <m.cencora@gmail.com>
  */
 
-/* derived from r200 swtcl path */
-
-
-
-#include "main/glheader.h"
-#include "main/mtypes.h"
-#include "main/colormac.h"
-#include "main/enums.h"
-#include "main/image.h"
-#include "main/imports.h"
-#include "main/light.h"
-#include "main/macros.h"
-
-#include "swrast/s_context.h"
-#include "swrast/s_fog.h"
-#include "swrast_setup/swrast_setup.h"
-#include "math/m_translate.h"
 #include "tnl/tnl.h"
-#include "tnl/t_context.h"
 #include "tnl/t_pipeline.h"
 
-#include "r300_context.h"
-#include "r300_swtcl.h"
 #include "r300_state.h"
-#include "r300_ioctl.h"
+#include "r300_swtcl.h"
 #include "r300_emit.h"
-#include "r300_mem.h"
+#include "r300_tex.h"
+#include "r300_render.h"
+#include "main/simple_list.h"
 
-static void flush_last_swtcl_prim( r300ContextPtr rmesa  );
-
-
-void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, GLuint offset);
-void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vertex_nr);
 #define EMIT_ATTR( ATTR, STYLE )					\
 do {									\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = (ATTR);	\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = (STYLE);	\
-   rmesa->swtcl.vertex_attr_count++;					\
+	rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = (ATTR);	\
+	rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = (STYLE);	\
+	rmesa->radeon.swtcl.vertex_attr_count++;					\
 } while (0)
 
 #define EMIT_PAD( N )							\
 do {									\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = 0;		\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = EMIT_PAD;	\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].offset = (N);		\
-   rmesa->swtcl.vertex_attr_count++;					\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = 0;		\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = EMIT_PAD;	\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].offset = (N);		\
+   rmesa->radeon.swtcl.vertex_attr_count++;					\
+} while (0)
+
+#define ADD_ATTR(_attr, _format, _dst_loc, _swizzle, _write_mask, _normalize) \
+do { \
+	attrs[num_attrs].element = (_attr); \
+	attrs[num_attrs].data_type = (_format); \
+	attrs[num_attrs].dst_loc = (_dst_loc); \
+	attrs[num_attrs].swizzle = (_swizzle); \
+	attrs[num_attrs].write_mask = (_write_mask); \
+	attrs[num_attrs]._signed = 0; \
+	attrs[num_attrs].normalize = (_normalize); \
+	++num_attrs; \
 } while (0)
 
-static void r300SetVertexFormat( GLcontext *ctx )
+void r300ChooseSwtclVertexFormat(GLcontext *ctx, GLuint *_InputsRead,  GLuint *_OutputsWritten)
 {
 	r300ContextPtr rmesa = R300_CONTEXT( ctx );
 	TNLcontext *tnl = TNL_CONTEXT(ctx);
 	struct vertex_buffer *VB = &tnl->vb;
-	DECLARE_RENDERINPUTS(index_bitset);
-	GLuint InputsRead = 0, OutputsWritten = 0;
-	int vap_fmt_0 = 0;
-	int vap_vte_cntl = 0;
-	int offset = 0;
-	int vte = 0;
-	GLint inputs[VERT_ATTRIB_MAX];
-	GLint tab[VERT_ATTRIB_MAX];
-	int swizzle[VERT_ATTRIB_MAX][4];
-	GLuint i, nr;
-	GLuint sz, vap_fmt_1 = 0;
-
-	DECLARE_RENDERINPUTS(render_inputs_bitset);
-	RENDERINPUTS_COPY(render_inputs_bitset, tnl->render_inputs_bitset);
-	RENDERINPUTS_COPY( index_bitset, tnl->render_inputs_bitset );
-	RENDERINPUTS_COPY(rmesa->state.render_inputs_bitset, render_inputs_bitset);
-
-	vte = rmesa->hw.vte.cmd[1];
-	vte &= ~(R300_VTX_XY_FMT | R300_VTX_Z_FMT | R300_VTX_W0_FMT);
-	/* Important:
-	 */
-	if ( VB->NdcPtr != NULL ) {
-		VB->AttribPtr[VERT_ATTRIB_POS] = VB->NdcPtr;
-		vte |= R300_VTX_XY_FMT | R300_VTX_Z_FMT;
-	}
-	else {
-		VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
-		vte |= R300_VTX_W0_FMT;
+	int first_free_tex = 0;
+	GLuint InputsRead = 0;
+	GLuint OutputsWritten = 0;
+	int num_attrs = 0;
+	GLuint fp_reads = rmesa->selected_fp->InputsRead;
+	struct vertex_attribute *attrs = rmesa->vbuf.attribs;
+
+	radeon_print(RADEON_SWRENDER, RADEON_VERBOSE, "%s\n", __func__);
+	rmesa->swtcl.coloroffset = rmesa->swtcl.specoffset = 0;
+	rmesa->radeon.swtcl.vertex_attr_count = 0;
+
+	if (RADEON_DEBUG & RADEON_VERTS)
+		fprintf(stderr, "%s\n", __func__);
+
+	/* We always want the non-NDC (clip-space) coordinate format */
+	VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
+
+	/* Always write position vector */
+	InputsRead |= 1 << VERT_ATTRIB_POS;
+	OutputsWritten |= 1 << VERT_RESULT_HPOS;
+	EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F );
+	ADD_ATTR(VERT_ATTRIB_POS, R300_DATA_TYPE_FLOAT_4, SWTCL_OVM_POS, SWIZZLE_XYZW, MASK_XYZW, 0);
+	rmesa->swtcl.coloroffset = 4;
+
+	if (fp_reads & FRAG_BIT_COL0) {
+		InputsRead |= 1 << VERT_ATTRIB_COLOR0;
+		OutputsWritten |= 1 << VERT_RESULT_COL0;
+#if MESA_LITTLE_ENDIAN
+		EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA );
+		ADD_ATTR(VERT_ATTRIB_COLOR0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR0, SWIZZLE_XYZW, MASK_XYZW, 1);
+#else
+		EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR );
+		ADD_ATTR(VERT_ATTRIB_COLOR0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR0, SWIZZLE_XYZW, MASK_XYZW, 1);
+#endif
 	}
 
-	assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
-	rmesa->swtcl.vertex_attr_count = 0;
+	if (fp_reads & FRAG_BIT_COL1) {
+		GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
+		InputsRead |= 1 << VERT_ATTRIB_COLOR1;
+		OutputsWritten |= 1 << VERT_RESULT_COL1;
+#if MESA_LITTLE_ENDIAN
+		EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_4UB_4F_RGBA );
+		ADD_ATTR(VERT_ATTRIB_COLOR1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR1, swiz, MASK_XYZW, 1);
+#else
+		EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_4UB_4F_ABGR );
+		ADD_ATTR(VERT_ATTRIB_COLOR1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR1, swiz, MASK_XYZW, 1);
+#endif
+		rmesa->swtcl.specoffset = rmesa->swtcl.coloroffset + 1;
+	}
 
-	/* EMIT_ATTR's must be in order as they tell t_vertex.c how to
-	 * build up a hardware vertex.
-	 */
-	if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_POS)) {
-		sz = VB->AttribPtr[VERT_ATTRIB_POS]->size;
-		InputsRead |= 1 << VERT_ATTRIB_POS;
-		OutputsWritten |= 1 << VERT_RESULT_HPOS;
-		EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_1F + sz - 1 );
-		offset = sz;
-	} else {
-		offset = 4;
-		EMIT_PAD(4 * sizeof(float));
+	if (ctx->Light.Enabled && ctx->Light.Model.TwoSide) {
+		VB->AttribPtr[VERT_ATTRIB_GENERIC0] = VB->ColorPtr[1];
+		OutputsWritten |= 1 << VERT_RESULT_BFC0;
+#if MESA_LITTLE_ENDIAN
+		EMIT_ATTR( _TNL_ATTRIB_GENERIC0, EMIT_4UB_4F_RGBA );
+		ADD_ATTR(VERT_ATTRIB_GENERIC0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR2, SWIZZLE_XYZW, MASK_XYZW, 1);
+#else
+		EMIT_ATTR( _TNL_ATTRIB_GENERIC0, EMIT_4UB_4F_ABGR );
+		ADD_ATTR(VERT_ATTRIB_GENERIC0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR2, SWIZZLE_XYZW, MASK_XYZW, 1);
+#endif
+		if (fp_reads & FRAG_BIT_COL1) {
+			VB->AttribPtr[VERT_ATTRIB_GENERIC1] = VB->SecondaryColorPtr[1];
+			GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
+			OutputsWritten |= 1 << VERT_RESULT_BFC1;
+#if MESA_LITTLE_ENDIAN
+			EMIT_ATTR( _TNL_ATTRIB_GENERIC1, EMIT_4UB_4F_RGBA );
+			ADD_ATTR(VERT_ATTRIB_GENERIC1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR3, swiz, MASK_XYZW, 1);
+#else
+			EMIT_ATTR( _TNL_ATTRIB_GENERIC1, EMIT_4UB_4F_ABGR );
+			ADD_ATTR(VERT_ATTRIB_GENERIC1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR3, swiz, MASK_XYZW, 1);
+#endif
+		}
 	}
 
-	if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_POINTSIZE )) {
+	if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_POINTSIZE )) {
+		GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO);
+		InputsRead |= 1 << VERT_ATTRIB_POINT_SIZE;
+		OutputsWritten |= 1 << VERT_RESULT_PSIZ;
 		EMIT_ATTR( _TNL_ATTRIB_POINTSIZE, EMIT_1F );
-		vap_fmt_0 |=  R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT;
-		offset += 1;
+		ADD_ATTR(VERT_ATTRIB_POINT_SIZE, R300_DATA_TYPE_FLOAT_1, SWTCL_OVM_POINT_SIZE, swiz, MASK_X, 0);
 	}
 
-	if (RENDERINPUTS_TEST(index_bitset, _TNL_ATTRIB_COLOR0)) {
-		sz = VB->AttribPtr[VERT_ATTRIB_COLOR0]->size;
-	        rmesa->swtcl.coloroffset = offset;
-		InputsRead |= 1 << VERT_ATTRIB_COLOR0;
-		OutputsWritten |= 1 << VERT_RESULT_COL0;
-		EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_1F + sz - 1 );
-		offset += sz;
+	if (rmesa->selected_fp->wpos_attr != FRAG_ATTRIB_MAX) {
+		int tex_id = rmesa->selected_fp->wpos_attr - FRAG_ATTRIB_TEX0;
+
+		VB->AttribPtr[VERT_ATTRIB_TEX0 + tex_id] = VB->AttribPtr[VERT_ATTRIB_POS];
+		VB->TexCoordPtr[tex_id] = VB->AttribPtr[VERT_ATTRIB_POS];
+		RENDERINPUTS_SET(tnl->render_inputs_bitset, _TNL_ATTRIB_TEX0 + tex_id);
 	}
 
-	rmesa->swtcl.specoffset = 0;
-	if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
-		sz = VB->AttribPtr[VERT_ATTRIB_COLOR1]->size;
-		rmesa->swtcl.specoffset = offset;
-		EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_1F + sz - 1 );
-		InputsRead |= 1 << VERT_ATTRIB_COLOR1;
-		OutputsWritten |= 1 << VERT_RESULT_COL1;
+	if (rmesa->selected_fp->fog_attr != FRAG_ATTRIB_MAX) {
+		int tex_id = rmesa->selected_fp->fog_attr - FRAG_ATTRIB_TEX0;
+
+		VB->AttribPtr[VERT_ATTRIB_TEX0 + tex_id] = VB->AttribPtr[VERT_ATTRIB_FOG];
+		VB->TexCoordPtr[tex_id] = VB->AttribPtr[VERT_ATTRIB_FOG];
+		RENDERINPUTS_SET(tnl->render_inputs_bitset, _TNL_ATTRIB_TEX0 + tex_id);
 	}
 
-	if (RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
+	/**
+	 *  Sending only one texcoord component may lead to lock up,
+	 *  so for all textures always output 4 texcoord components to RS.
+	 */
+	{
 		int i;
-
+		GLuint swiz, format, hw_format;
 		for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-			if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(i) )) {
-				sz = VB->TexCoordPtr[i]->size;
+			if (fp_reads & FRAG_BIT_TEX(i)) {
+				switch (VB->TexCoordPtr[i]->size) {
+					case 1:
+						format = EMIT_1F;
+						hw_format = R300_DATA_TYPE_FLOAT_1;
+						swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ONE);
+						break;
+					case 2:
+						format = EMIT_2F;
+						hw_format = R300_DATA_TYPE_FLOAT_2;
+						swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_ZERO, SWIZZLE_ONE);
+						break;
+					case 3:
+						format = EMIT_3F;
+						hw_format = R300_DATA_TYPE_FLOAT_3;
+						swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
+						break;
+					case 4:
+						format = EMIT_4F;
+						hw_format = R300_DATA_TYPE_FLOAT_4;
+						swiz = SWIZZLE_XYZW;
+						break;
+					default:
+						continue;
+				}
 				InputsRead |= 1 << (VERT_ATTRIB_TEX0 + i);
 				OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
-				EMIT_ATTR( _TNL_ATTRIB_TEX0+i, EMIT_1F + sz - 1 );
-				vap_fmt_1 |= sz << (3 * i);
+				EMIT_ATTR(_TNL_ATTRIB_TEX(i), format);
+				ADD_ATTR(VERT_ATTRIB_TEX0 + i, hw_format, SWTCL_OVM_TEX(first_free_tex), swiz, MASK_XYZW, 0);
+				++first_free_tex;
 			}
 		}
 	}
 
-	for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
-		if (InputsRead & (1 << i)) {
-			inputs[i] = nr++;
-		} else {
-			inputs[i] = -1;
-		}
-	}
-	
-	/* Fixed, apply to vir0 only */
-	if (InputsRead & (1 << VERT_ATTRIB_POS))
-		inputs[VERT_ATTRIB_POS] = 0;
-	if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
-		inputs[VERT_ATTRIB_COLOR0] = 2;
-	if (InputsRead & (1 << VERT_ATTRIB_COLOR1))
-		inputs[VERT_ATTRIB_COLOR1] = 3;
-	for (i = VERT_ATTRIB_TEX0; i <= VERT_ATTRIB_TEX7; i++)
-		if (InputsRead & (1 << i))
-			inputs[i] = 6 + (i - VERT_ATTRIB_TEX0);
-	
-	for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
-		if (InputsRead & (1 << i)) {
-			tab[nr++] = i;
-		}
-	}
-	
-	for (i = 0; i < nr; i++) {
-		int ci;
-		
-		swizzle[i][0] = SWIZZLE_ZERO;
-		swizzle[i][1] = SWIZZLE_ZERO;
-		swizzle[i][2] = SWIZZLE_ZERO;
-		swizzle[i][3] = SWIZZLE_ONE;
-
-		for (ci = 0; ci < VB->AttribPtr[tab[i]]->size; ci++) {
-			swizzle[i][ci] = ci;
-		}
+	if (first_free_tex >= ctx->Const.MaxTextureUnits) {
+		fprintf(stderr, "\tout of free texcoords to write fog coordinate\n");
+		_mesa_exit(-1);
 	}
 
 	R300_NEWPRIM(rmesa);
-	R300_STATECHANGE(rmesa, vir[0]);
-	((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count =
-		r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
-				   VB->AttribPtr, inputs, tab, nr);
-	R300_STATECHANGE(rmesa, vir[1]);
-	((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count =
-		r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
-				   nr);
-   
-	R300_STATECHANGE(rmesa, vic);
-	rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
-	rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
-   
-	R300_STATECHANGE(rmesa, vof);
-	rmesa->hw.vof.cmd[R300_VOF_CNTL_0] = r300VAPOutputCntl0(ctx, OutputsWritten);
-	rmesa->hw.vof.cmd[R300_VOF_CNTL_1] = vap_fmt_1;
-   
-	rmesa->swtcl.vertex_size =
-		_tnl_install_attrs( ctx,
-				    rmesa->swtcl.vertex_attrs, 
-				    rmesa->swtcl.vertex_attr_count,
-				    NULL, 0 );
-	
-	rmesa->swtcl.vertex_size /= 4;
+	rmesa->vbuf.num_attribs = num_attrs;
+	*_InputsRead = InputsRead;
+	*_OutputsWritten = OutputsWritten;
 
-	RENDERINPUTS_COPY( rmesa->tnl_index_bitset, index_bitset );
-
-
-	R300_STATECHANGE(rmesa, vte);
-	rmesa->hw.vte.cmd[1] = vte;
-	rmesa->hw.vte.cmd[2] = rmesa->swtcl.vertex_size;
-}
-
-
-/* Flush vertices in the current dma region.
- */
-static void flush_last_swtcl_prim( r300ContextPtr rmesa  )
-{
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s\n", __FUNCTION__);
-	
-	rmesa->dma.flush = NULL;
-
-	if (rmesa->dma.current.buf) {
-		struct r300_dma_region *current = &rmesa->dma.current;
-		GLuint current_offset = GET_START(current);
-
-		assert (current->start + 
-			rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-			current->ptr);
-
-		if (rmesa->dma.current.start != rmesa->dma.current.ptr) {
-
-			r300EnsureCmdBufSpace( rmesa, rmesa->hw.max_state_size + (12*sizeof(int)), __FUNCTION__);
-			
-			r300EmitState(rmesa);
-			
-			r300EmitVertexAOS( rmesa,
-					   rmesa->swtcl.vertex_size,
-					   current_offset);
-			
-			r300EmitVbufPrim( rmesa,
-					  rmesa->swtcl.hw_primitive,
-					  rmesa->swtcl.numverts);
-			
-			r300EmitCacheFlush(rmesa);
-		}
-		
-		rmesa->swtcl.numverts = 0;
-		current->start = current->ptr;
-	}
+	RENDERINPUTS_COPY(rmesa->render_inputs_bitset, tnl->render_inputs_bitset);
 }
 
-/* Alloc space in the current dma region.
- */
-static void *
-r300AllocDmaLowVerts( r300ContextPtr rmesa, int nverts, int vsize )
+static void r300PrepareVertices(GLcontext *ctx)
 {
-	GLuint bytes = vsize * nverts;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	GLuint InputsRead, OutputsWritten;
+	radeon_print(RADEON_SWRENDER, RADEON_TRACE, "%s\n", __func__);
 
-	if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
-		r300RefillCurrentDmaRegion( rmesa, bytes);
+	r300ChooseSwtclVertexFormat(ctx, &InputsRead, &OutputsWritten);
+	r300SetupVAP(ctx, InputsRead, OutputsWritten);
 
-	if (!rmesa->dma.flush) {
-		rmesa->radeon.glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-		rmesa->dma.flush = flush_last_swtcl_prim;
-	}
+	rmesa->radeon.swtcl.vertex_size =
+		_tnl_install_attrs( ctx,
+				    rmesa->radeon.swtcl.vertex_attrs,
+				    rmesa->radeon.swtcl.vertex_attr_count,
+				    NULL, 0 );
 
-	ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
-	ASSERT( rmesa->dma.flush == flush_last_swtcl_prim );
-	ASSERT( rmesa->dma.current.start + 
-		rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-		rmesa->dma.current.ptr );
+	rmesa->radeon.swtcl.vertex_size /= 4;
+}
 
-	{
-		GLubyte *head = (GLubyte *) (rmesa->dma.current.address + rmesa->dma.current.ptr);
-		rmesa->dma.current.ptr += bytes;
-		rmesa->swtcl.numverts += nverts;
-		return head;
+static void r300_predict_emit_size( r300ContextPtr rmesa )
+{	/* Reserve command-buffer space for one swtcl emit and record the predicted size in swtcl.emit_prediction. */
+	if (!rmesa->radeon.swtcl.emit_prediction) {	/* only when no prediction is currently stored */
+		const int vertex_size = 7;	/* fixed per-piece sizes; presumably dwords, since they are added to cs->cdw below -- confirm */
+		const int prim_size = 3;
+		const int cache_flush_size = 4;	/* counted twice in the sums below */
+		const int pre_emit_state = 4;
+		const int scissor_size = 3;
+		const int state_size = radeonCountStateEmitSize(&rmesa->radeon);
+
+		if (rcommonEnsureCmdBufSpace(&rmesa->radeon,	/* NOTE(review): a nonzero return presumably means the buffer was flushed, hence the recount -- confirm */
+					state_size + pre_emit_state + scissor_size
+					+ vertex_size + prim_size + cache_flush_size * 2,
+					__FUNCTION__))
+			rmesa->radeon.swtcl.emit_prediction = radeonCountStateEmitSize(&rmesa->radeon);
+		else
+			rmesa->radeon.swtcl.emit_prediction = state_size;
+
+		rmesa->radeon.swtcl.emit_prediction += rmesa->radeon.cmdbuf.cs->cdw	/* add the current cs->cdw plus the fixed pieces */
+			+ vertex_size + scissor_size + prim_size + cache_flush_size * 2 + pre_emit_state;
+		radeon_print(RADEON_SWRENDER, RADEON_VERBOSE,	/* prints cdw + fixed pieces only; the state size is not part of the printed value */
+				"%s, size %d\n",
+				__func__, rmesa->radeon.cmdbuf.cs->cdw
+				+ vertex_size + scissor_size + prim_size + cache_flush_size * 2 + pre_emit_state);
 	}
 }
 
+
 static GLuint reduced_prim[] = {
-  GL_POINTS,
-  GL_LINES,
-  GL_LINES,
-  GL_LINES,
-  GL_TRIANGLES,
-  GL_TRIANGLES,
-  GL_TRIANGLES,
-  GL_TRIANGLES,
-  GL_TRIANGLES,
-  GL_TRIANGLES,
+	GL_POINTS,
+	GL_LINES,
+	GL_LINES,
+	GL_LINES,
+	GL_TRIANGLES,
+	GL_TRIANGLES,
+	GL_TRIANGLES,
+	GL_TRIANGLES,
+	GL_TRIANGLES,
+	GL_TRIANGLES,
 };
 
 static void r300RasterPrimitive( GLcontext *ctx, GLuint prim );
-static void r300RenderPrimitive( GLcontext *ctx, GLenum prim );
-//static void r300ResetLineStipple( GLcontext *ctx );
 
 /***********************************************************************
  *                    Emit primitives as inline vertices               *
@@ -343,18 +304,26 @@ static void r300RenderPrimitive( GLcontext *ctx, GLenum prim );
 #define HAVE_POLYGONS    1
 #define HAVE_ELTS        1
 
+static void* r300_alloc_verts(r300ContextPtr rmesa, GLuint n, GLuint size)
+{	/* Return DMA vertex storage for n vertices; size is multiplied by 4 before being passed on (presumably dwords -> bytes). */
+	void *rv;
+	do {
+		r300_predict_emit_size( rmesa );	/* reserve command-buffer space / set the emit prediction before allocating */
+		rv = rcommonAllocDmaLowVerts( &rmesa->radeon, n, size * 4 );
+	} while (!rv);	/* retry for as long as the allocator returns NULL */
+	return rv;
+}
+
 #undef LOCAL_VARS
 #undef ALLOC_VERTS
 #define CTX_ARG r300ContextPtr rmesa
-#define GET_VERTEX_DWORDS() rmesa->swtcl.vertex_size
-#define ALLOC_VERTS( n, size ) r300AllocDmaLowVerts( rmesa, n, size * 4 )
+#define GET_VERTEX_DWORDS() rmesa->radeon.swtcl.vertex_size
+#define ALLOC_VERTS( n, size ) r300_alloc_verts(rmesa, n, size);
 #define LOCAL_VARS						\
    r300ContextPtr rmesa = R300_CONTEXT(ctx);		\
-   const char *r300verts = (char *)rmesa->swtcl.verts;
+   const char *r300verts = (char *)rmesa->radeon.swtcl.verts;
 #define VERT(x) (r300Vertex *)(r300verts + ((x) * vertsize * sizeof(int)))
-#define VERTEX r300Vertex 
-#define DO_DEBUG_VERTS (1 && (RADEON_DEBUG & DEBUG_VERTS))
-#define PRINT_VERTEX(x)
+#define VERTEX r300Vertex
 #undef TAG
 #define TAG(x) r300_##x
 #include "tnl_dd/t_dd_triemit.h"
@@ -374,9 +343,8 @@ static void r300RenderPrimitive( GLcontext *ctx, GLenum prim );
  *              Build render functions from dd templates               *
  ***********************************************************************/
 
-#define R300_TWOSIDE_BIT	0x01
-#define R300_UNFILLED_BIT	0x02
-#define R300_MAX_TRIFUNC	0x04
+#define R300_UNFILLED_BIT	0x01
+#define R300_MAX_TRIFUNC	0x02
 
 static struct {
    tnl_points_func	        points;
@@ -387,9 +355,9 @@ static struct {
 
 #define DO_FALLBACK  0
 #define DO_UNFILLED (IND & R300_UNFILLED_BIT)
-#define DO_TWOSIDE  (IND & R300_TWOSIDE_BIT)
+#define DO_TWOSIDE   0
 #define DO_FLAT      0
-#define DO_OFFSET     0
+#define DO_OFFSET    0
 #define DO_TRI       1
 #define DO_QUAD      1
 #define DO_LINE      1
@@ -409,33 +377,39 @@ static struct {
 #define VERT_Y(_v) _v->v.y
 #define VERT_Z(_v) _v->v.z
 #define AREA_IS_CCW( a ) (a < 0)
-#define GET_VERTEX(e) (rmesa->swtcl.verts + (e*rmesa->swtcl.vertex_size*sizeof(int)))
-
-/* Only used to pull back colors into vertices (ie, we know color is
- * floating point).
- */
-#define R300_COLOR( dst, src )				\
-do {							\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[0], (src)[2]);	\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[1], (src)[1]);	\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[2], (src)[0]);	\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[3], (src)[3]);	\
+#define GET_VERTEX(e) (rmesa->radeon.swtcl.verts + (e*rmesa->radeon.swtcl.vertex_size*sizeof(int)))
+
+#define VERT_SET_RGBA( v, c ) \
+do { \
+   r300_color_t *color = (r300_color_t *)&((v)->ui[coloroffset]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->red, (c)[0]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->green, (c)[1]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->blue, (c)[2]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->alpha, (c)[3]); \
 } while (0)
 
-#define VERT_SET_RGBA( v, c )    if (coloroffset) R300_COLOR( v->ub4[coloroffset], c )
-#define VERT_COPY_RGBA( v0, v1 ) if (coloroffset) v0->ui[coloroffset] = v1->ui[coloroffset]
-#define VERT_SAVE_RGBA( idx )    if (coloroffset) color[idx] = v[idx]->ui[coloroffset]
-#define VERT_RESTORE_RGBA( idx ) if (coloroffset) v[idx]->ui[coloroffset] = color[idx]
+#define VERT_COPY_RGBA( v0, v1 ) v0->ui[coloroffset] = v1->ui[coloroffset]
+
+#define VERT_SET_SPEC( v0, c ) \
+do { \
+   if (specoffset) { \
+   UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.red, (c)[0]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.green, (c)[1]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.blue, (c)[2]); \
+   } \
+} while (0)
 
-#define R300_SPEC( dst, src )				\
-do {							\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[0], (src)[2]);	\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[1], (src)[1]);	\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[2], (src)[0]);	\
+#define VERT_COPY_SPEC( v0, v1 ) \
+do { \
+   if (specoffset) { \
+       v0->v.specular.red = v1->v.specular.red; \
+       v0->v.specular.green = v1->v.specular.green; \
+       v0->v.specular.blue = v1->v.specular.blue; \
+   } \
 } while (0)
 
-#define VERT_SET_SPEC( v, c )    if (specoffset) R300_SPEC( v->ub4[specoffset], c )
-#define VERT_COPY_SPEC( v0, v1 ) if (specoffset) COPY_3V(v0->ub4[specoffset], v1->ub4[specoffset])
+#define VERT_SAVE_RGBA( idx )    color[idx] = v[idx]->ui[coloroffset]
+#define VERT_RESTORE_RGBA( idx ) v[idx]->ui[coloroffset] = color[idx]
 #define VERT_SAVE_SPEC( idx )    if (specoffset) spec[idx] = v[idx]->ui[specoffset]
 #define VERT_RESTORE_SPEC( idx ) if (specoffset) v[idx]->ui[specoffset] = spec[idx]
 
@@ -445,7 +419,7 @@ do {							\
 
 #define LOCAL_VARS(n)							\
    r300ContextPtr rmesa = R300_CONTEXT(ctx);			\
-   GLuint color[n], spec[n];						\
+   GLuint color[n] = { 0, }, spec[n] = { 0, };				\
    GLuint coloroffset = rmesa->swtcl.coloroffset;	\
    GLuint specoffset = rmesa->swtcl.specoffset;			\
    (void) color; (void) spec; (void) coloroffset; (void) specoffset;
@@ -455,7 +429,7 @@ do {							\
  ***********************************************************************/
 
 #define RASTERIZE(x) r300RasterPrimitive( ctx, reduced_prim[x] )
-#define RENDER_PRIMITIVE rmesa->swtcl.render_primitive
+#define RENDER_PRIMITIVE rmesa->radeon.swtcl.render_primitive
 #undef TAG
 #define TAG(x) x
 #include "tnl_dd/t_dd_unfilled.h"
@@ -471,26 +445,15 @@ do {							\
 #define TAG(x) x
 #include "tnl_dd/t_dd_tritmp.h"
 
-#define IND (R300_TWOSIDE_BIT)
-#define TAG(x) x##_twoside
-#include "tnl_dd/t_dd_tritmp.h"
-
 #define IND (R300_UNFILLED_BIT)
 #define TAG(x) x##_unfilled
 #include "tnl_dd/t_dd_tritmp.h"
 
-#define IND (R300_TWOSIDE_BIT|R300_UNFILLED_BIT)
-#define TAG(x) x##_twoside_unfilled
-#include "tnl_dd/t_dd_tritmp.h"
-
-
 
 static void init_rast_tab( void )
 {
    init();
-   init_twoside();
    init_unfilled();
-   init_twoside_unfilled();
 }
 
 /**********************************************************************/
@@ -512,8 +475,8 @@ static void init_rast_tab( void )
 #undef LOCAL_VARS
 #define LOCAL_VARS						\
    r300ContextPtr rmesa = R300_CONTEXT(ctx);		\
-   const GLuint vertsize = rmesa->swtcl.vertex_size;		\
-   const char *r300verts = (char *)rmesa->swtcl.verts;		\
+   const GLuint vertsize = rmesa->radeon.swtcl.vertex_size;		\
+   const char *r300verts = (char *)rmesa->radeon.swtcl.verts;		\
    const GLuint * const elt = TNL_CONTEXT(ctx)->vb.Elts;	\
    const GLboolean stipple = ctx->Line.StippleFlag;		\
    (void) elt; (void) stipple;
@@ -541,11 +504,11 @@ static void r300ChooseRenderState( GLcontext *ctx )
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	GLuint index = 0;
 	GLuint flags = ctx->_TriangleCaps;
+	radeon_print(RADEON_SWRENDER, RADEON_VERBOSE, "%s\n", __func__);
 
-	if (flags & DD_TRI_LIGHT_TWOSIDE) index |= R300_TWOSIDE_BIT;
 	if (flags & DD_TRI_UNFILLED)      index |= R300_UNFILLED_BIT;
 
-	if (index != rmesa->swtcl.RenderIndex) {
+	if (index != rmesa->radeon.swtcl.RenderIndex) {
 		tnl->Driver.Render.Points = rast_tab[index].points;
 		tnl->Driver.Render.Line = rast_tab[index].line;
 		tnl->Driver.Render.ClippedLine = rast_tab[index].line;
@@ -562,62 +525,64 @@ static void r300ChooseRenderState( GLcontext *ctx )
 			tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
 		}
 
-		rmesa->swtcl.RenderIndex = index;
+		rmesa->radeon.swtcl.RenderIndex = index;
 	}
 }
 
-
-static void r300RenderStart(GLcontext *ctx)
+void r300RenderStart(GLcontext *ctx)
 {
-        r300ContextPtr rmesa = R300_CONTEXT( ctx );
-	//	fprintf(stderr, "%s\n", __FUNCTION__);
+	radeon_print(RADEON_SWRENDER, RADEON_VERBOSE, "%s\n", __func__);
+	r300ContextPtr rmesa = R300_CONTEXT( ctx );
 
-	r300ChooseRenderState(ctx);	
-	r300SetVertexFormat(ctx);
+	r300ChooseRenderState(ctx);
 
 	r300UpdateShaders(rmesa);
+
+	r300PrepareVertices(ctx);
+
+	r300ValidateBuffers(ctx);
+
 	r300UpdateShaderStates(rmesa);
 
-	r300EmitCacheFlush(rmesa);
-	
-	if (rmesa->dma.flush != 0 && 
-	    rmesa->dma.flush != flush_last_swtcl_prim)
-		rmesa->dma.flush( rmesa );
 
+	/* investigate if we can put back flush optimisation if needed */
+	if (rmesa->radeon.dma.flush != NULL) {
+		rmesa->radeon.dma.flush(ctx);
+	}
 }
 
-static void r300RenderFinish(GLcontext *ctx)
+void r300RenderFinish(GLcontext *ctx)
 {
 }
 
 static void r300RasterPrimitive( GLcontext *ctx, GLuint hwprim )
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	
-	if (rmesa->swtcl.hw_primitive != hwprim) {
-	        R300_NEWPRIM( rmesa );
-		rmesa->swtcl.hw_primitive = hwprim;
+	radeon_print(RADEON_SWRENDER, RADEON_TRACE, "%s\n", __func__);
+
+	if (rmesa->radeon.swtcl.hw_primitive != hwprim) {
+		R300_NEWPRIM( rmesa );
+		rmesa->radeon.swtcl.hw_primitive = hwprim;
 	}
 }
 
-static void r300RenderPrimitive(GLcontext *ctx, GLenum prim)
+void r300RenderPrimitive(GLcontext *ctx, GLenum prim)
 {
 
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	rmesa->swtcl.render_primitive = prim;
+	rmesa->radeon.swtcl.render_primitive = prim;
+	radeon_print(RADEON_SWRENDER, RADEON_TRACE, "%s\n", __func__);
 
 	if ((prim == GL_TRIANGLES) && (ctx->_TriangleCaps & DD_TRI_UNFILLED))
-	  return;
+		return;
 
 	r300RasterPrimitive( ctx, reduced_prim[prim] );
-	//	fprintf(stderr, "%s\n", __FUNCTION__);
-	
 }
 
-static void r300ResetLineStipple(GLcontext *ctx)
+void r300ResetLineStipple(GLcontext *ctx)
 {
-
-
+	if (RADEON_DEBUG & RADEON_VERTS)
+		fprintf(stderr, "%s\n", __func__);
 }
 
 void r300InitSwtcl(GLcontext *ctx)
@@ -625,12 +590,14 @@ void r300InitSwtcl(GLcontext *ctx)
 	TNLcontext *tnl = TNL_CONTEXT(ctx);
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	static int firsttime = 1;
-	
+	radeon_print(RADEON_SWRENDER, RADEON_NORMAL, "%s\n", __func__);
+
 	if (firsttime) {
 		init_rast_tab();
 		firsttime = 0;
 	}
-	
+	rmesa->radeon.swtcl.emit_prediction = 0;
+
 	tnl->Driver.Render.Start = r300RenderStart;
 	tnl->Driver.Render.Finish = r300RenderFinish;
 	tnl->Driver.Render.PrimitiveNotify = r300RenderPrimitive;
@@ -638,60 +605,80 @@ void r300InitSwtcl(GLcontext *ctx)
 	tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
 	tnl->Driver.Render.CopyPV = _tnl_copy_pv;
 	tnl->Driver.Render.Interp = _tnl_interp;
-	
+
 	/* FIXME: what are these numbers? */
-	_tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12, 
+	_tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12,
 			    48 * sizeof(GLfloat) );
-	
-	rmesa->swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
-	rmesa->swtcl.RenderIndex = ~0;
-	rmesa->swtcl.render_primitive = GL_TRIANGLES;
-	rmesa->swtcl.hw_primitive = 0;	
+
+	rmesa->radeon.swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
+	rmesa->radeon.swtcl.RenderIndex = ~0;
+	rmesa->radeon.swtcl.render_primitive = GL_TRIANGLES;
+	rmesa->radeon.swtcl.hw_primitive = 0;
 
 	_tnl_invalidate_vertex_state( ctx, ~0 );
 	_tnl_invalidate_vertices( ctx, ~0 );
-	RENDERINPUTS_ZERO( rmesa->tnl_index_bitset );
 
 	_tnl_need_projected_coords( ctx, GL_FALSE );
-	r300ChooseRenderState(ctx);
-
-	_mesa_validate_all_lighting_tables( ctx ); 
-
-	tnl->Driver.NotifyMaterialChange = 
-	  _mesa_validate_all_lighting_tables;
 }
 
 void r300DestroySwtcl(GLcontext *ctx)
 {
 }
 
-void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, GLuint offset)
+static void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, struct radeon_bo *bo, GLuint offset)
 {
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-
-	drm_radeon_cmd_header_t *cmd = NULL;
-	if (RADEON_DEBUG & DEBUG_VERTS)
-	  fprintf(stderr, "%s:  vertex_size %d, offset 0x%x \n",
-		  __FUNCTION__, vertex_size, offset);
-
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, 2), 2);
-	e32(1);
-	e32(vertex_size | (vertex_size << 8));
-	e32(offset);
+	BATCH_LOCALS(&rmesa->radeon);
+
+	radeon_print(RADEON_SWRENDER, RADEON_TRACE,
+		"%s:  vertex_size %d, offset 0x%x \n",
+			__FUNCTION__, vertex_size, offset);
+
+	BEGIN_BATCH(7);
+	OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, 2);
+	OUT_BATCH(1);
+	OUT_BATCH(vertex_size | (vertex_size << 8));
+	OUT_BATCH_RELOC(offset, bo, offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
+	END_BATCH();
 }
 
-void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vertex_nr)
+static void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vertex_nr)
 {
-
-	int cmd_reserved = 0;
-	int cmd_written = 0;
+	BATCH_LOCALS(&rmesa->radeon);
 	int type, num_verts;
-	drm_radeon_cmd_header_t *cmd = NULL;
+	if (RADEON_DEBUG & RADEON_VERTS)
+		fprintf(stderr, "%s\n", __func__);
 
 	type = r300PrimitiveType(rmesa, primitive);
 	num_verts = r300NumVerts(rmesa, vertex_nr, primitive);
-	
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0), 0);
-	e32(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (num_verts << 16) | type);
+
+	BEGIN_BATCH(3);
+	OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
+	OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (num_verts << 16) | type);
+	END_BATCH();
+}
+
+void r300_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
+{
+	radeon_print(RADEON_SWRENDER, RADEON_TRACE, "%s\n", __func__);
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+	r300EmitCacheFlush(rmesa);
+
+	radeonEmitState(&rmesa->radeon);
+    r300_emit_scissor(ctx);
+	r300EmitVertexAOS(rmesa,
+			rmesa->radeon.swtcl.vertex_size,
+			first_elem(&rmesa->radeon.dma.reserved)->bo,
+			current_offset);
+
+	r300EmitVbufPrim(rmesa,
+		   rmesa->radeon.swtcl.hw_primitive,
+		   rmesa->radeon.swtcl.numverts);
+	r300EmitCacheFlush(rmesa);
+	if ( rmesa->radeon.swtcl.emit_prediction < rmesa->radeon.cmdbuf.cs->cdw )
+		WARN_ONCE("Rendering was %d commands larger than predicted size."
+			" We might overflow  command buffer.\n",
+			rmesa->radeon.cmdbuf.cs->cdw - rmesa->radeon.swtcl.emit_prediction );
+	rmesa->radeon.swtcl.emit_prediction = 0;
+	COMMIT_BATCH();
 }
diff --git a/src/mesa/drivers/dri/r300/r300_swtcl.h b/src/mesa/drivers/dri/r300/r300_swtcl.h
index 55df53c1ad..c271d26546 100644
--- a/src/mesa/drivers/dri/r300/r300_swtcl.h
+++ b/src/mesa/drivers/dri/r300/r300_swtcl.h
@@ -39,7 +39,27 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "swrast/swrast.h"
 #include "r300_context.h"
 
+/*
+ * Here are definitions of OVM locations of vertex attributes for non TCL hw
+ */
+#define SWTCL_OVM_POS 0
+#define SWTCL_OVM_COLOR0 2
+#define SWTCL_OVM_COLOR1 3
+#define SWTCL_OVM_COLOR2 4
+#define SWTCL_OVM_COLOR3 5
+#define SWTCL_OVM_TEX(n) ((n) + 6)
+#define SWTCL_OVM_POINT_SIZE 15
+
+extern void r300ChooseSwtclVertexFormat(GLcontext *ctx, GLuint *InputsRead,  GLuint *OutputsWritten);
+
 extern void r300InitSwtcl( GLcontext *ctx );
 extern void r300DestroySwtcl( GLcontext *ctx );
 
+extern void r300RenderStart(GLcontext *ctx);
+extern void r300RenderFinish(GLcontext *ctx);
+extern void r300RenderPrimitive(GLcontext *ctx, GLenum prim);
+extern void r300ResetLineStipple(GLcontext *ctx);
+
+extern void r300_swtcl_flush(GLcontext *ctx, uint32_t current_offset);
+
 #endif
diff --git a/src/mesa/drivers/dri/r300/r300_tex.c b/src/mesa/drivers/dri/r300/r300_tex.c
index 8ab382c83c..433e5a87d4 100644
--- a/src/mesa/drivers/dri/r300/r300_tex.c
+++ b/src/mesa/drivers/dri/r300/r300_tex.c
@@ -38,6 +38,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/context.h"
 #include "main/enums.h"
 #include "main/image.h"
+#include "main/mipmap.h"
 #include "main/simple_list.h"
 #include "main/texformat.h"
 #include "main/texstore.h"
@@ -49,6 +50,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_context.h"
 #include "r300_state.h"
 #include "r300_ioctl.h"
+#include "radeon_mipmap_tree.h"
 #include "r300_tex.h"
 
 #include "xmlpool.h"
@@ -77,20 +79,20 @@ static unsigned int translate_wrap_mode(GLenum wrapmode)
  *
  * \param t Texture object whose wrap modes are to be set
  */
-static void r300UpdateTexWrap(r300TexObjPtr t)
+static void r300UpdateTexWrap(radeonTexObjPtr t)
 {
-	struct gl_texture_object *tObj = t->base.tObj;
+	struct gl_texture_object *tObj = &t->base;
 
-	t->filter &=
+	t->pp_txfilter &=
 	    ~(R300_TX_WRAP_S_MASK | R300_TX_WRAP_T_MASK | R300_TX_WRAP_R_MASK);
 
-	t->filter |= translate_wrap_mode(tObj->WrapS) << R300_TX_WRAP_S_SHIFT;
+	t->pp_txfilter |= translate_wrap_mode(tObj->WrapS) << R300_TX_WRAP_S_SHIFT;
 
 	if (tObj->Target != GL_TEXTURE_1D) {
-		t->filter |= translate_wrap_mode(tObj->WrapT) << R300_TX_WRAP_T_SHIFT;
+		t->pp_txfilter |= translate_wrap_mode(tObj->WrapT) << R300_TX_WRAP_T_SHIFT;
 
 		if (tObj->Target == GL_TEXTURE_3D)
-			t->filter |= translate_wrap_mode(tObj->WrapR) << R300_TX_WRAP_R_SHIFT;
+			t->pp_txfilter |= translate_wrap_mode(tObj->WrapR) << R300_TX_WRAP_R_SHIFT;
 	}
 }
 
@@ -117,10 +119,13 @@ static GLuint aniso_filter(GLfloat anisotropy)
  * \param magf Texture magnification mode
  * \param anisotropy Maximum anisotropy level
  */
-static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat anisotropy)
+static void r300SetTexFilter(radeonTexObjPtr t, GLenum minf, GLenum magf, GLfloat anisotropy)
 {
-	t->filter &= ~(R300_TX_MIN_FILTER_MASK | R300_TX_MIN_FILTER_MIP_MASK | R300_TX_MAG_FILTER_MASK | R300_TX_MAX_ANISO_MASK);
-	t->filter_1 &= ~R300_EDGE_ANISO_EDGE_ONLY;
+	/* Force revalidation to account for switches from/to mipmapping. */
+	t->validated = GL_FALSE;
+
+	t->pp_txfilter &= ~(R300_TX_MIN_FILTER_MASK | R300_TX_MIN_FILTER_MIP_MASK | R300_TX_MAG_FILTER_MASK | R300_TX_MAX_ANISO_MASK);
+	t->pp_txfilter_1 &= ~R300_EDGE_ANISO_EDGE_ONLY;
 
 	/* Note that EXT_texture_filter_anisotropic is extremely vague about
 	 * how anisotropic filtering interacts with the "normal" filter modes.
@@ -128,33 +133,33 @@ static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat
 	 * filter settings completely. This includes driconf's settings.
 	 */
 	if (anisotropy >= 2.0 && (minf != GL_NEAREST) && (magf != GL_NEAREST)) {
-		t->filter |= R300_TX_MAG_FILTER_ANISO
+		t->pp_txfilter |= R300_TX_MAG_FILTER_ANISO
 			| R300_TX_MIN_FILTER_ANISO
 			| R300_TX_MIN_FILTER_MIP_LINEAR
 			| aniso_filter(anisotropy);
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
+		if (RADEON_DEBUG & RADEON_TEXTURE)
 			fprintf(stderr, "Using maximum anisotropy of %f\n", anisotropy);
 		return;
 	}
 
 	switch (minf) {
 	case GL_NEAREST:
-		t->filter |= R300_TX_MIN_FILTER_NEAREST;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_NEAREST;
 		break;
 	case GL_LINEAR:
-		t->filter |= R300_TX_MIN_FILTER_LINEAR;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_LINEAR;
 		break;
 	case GL_NEAREST_MIPMAP_NEAREST:
-		t->filter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_NEAREST;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_NEAREST;
 		break;
 	case GL_NEAREST_MIPMAP_LINEAR:
-		t->filter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_LINEAR;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_LINEAR;
 		break;
 	case GL_LINEAR_MIPMAP_NEAREST:
-		t->filter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_NEAREST;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_NEAREST;
 		break;
 	case GL_LINEAR_MIPMAP_LINEAR:
-		t->filter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_LINEAR;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_LINEAR;
 		break;
 	}
 
@@ -163,743 +168,25 @@ static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat
 	 */
 	switch (magf) {
 	case GL_NEAREST:
-		t->filter |= R300_TX_MAG_FILTER_NEAREST;
+		t->pp_txfilter |= R300_TX_MAG_FILTER_NEAREST;
 		break;
 	case GL_LINEAR:
-		t->filter |= R300_TX_MAG_FILTER_LINEAR;
+		t->pp_txfilter |= R300_TX_MAG_FILTER_LINEAR;
 		break;
 	}
 }
 
-static void r300SetTexBorderColor(r300TexObjPtr t, GLubyte c[4])
+static void r300SetTexBorderColor(radeonTexObjPtr t, const GLfloat color[4])
 {
+	GLubyte c[4];
+	CLAMPED_FLOAT_TO_UBYTE(c[0], color[0]);
+	CLAMPED_FLOAT_TO_UBYTE(c[1], color[1]);
+	CLAMPED_FLOAT_TO_UBYTE(c[2], color[2]);
+	CLAMPED_FLOAT_TO_UBYTE(c[3], color[3]);
 	t->pp_border_color = PACK_COLOR_8888(c[3], c[0], c[1], c[2]);
 }
 
 /**
- * Allocate space for and load the mesa images into the texture memory block.
- * This will happen before drawing with a new texture, or drawing with a
- * texture after it was swapped out or teximaged again.
- */
-
-static r300TexObjPtr r300AllocTexObj(struct gl_texture_object *texObj)
-{
-	r300TexObjPtr t;
-
-	t = CALLOC_STRUCT(r300_tex_obj);
-	texObj->DriverData = t;
-	if (t != NULL) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE) {
-			fprintf(stderr, "%s( %p, %p )\n", __FUNCTION__,
-				(void *)texObj, (void *)t);
-		}
-
-		/* Initialize non-image-dependent parts of the state:
-		 */
-		t->base.tObj = texObj;
-		t->border_fallback = GL_FALSE;
-
-		make_empty_list(&t->base);
-
-		r300UpdateTexWrap(t);
-		r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter, texObj->MaxAnisotropy);
-		r300SetTexBorderColor(t, texObj->_BorderChan);
-	}
-
-	return t;
-}
-
-/* try to find a format which will only need a memcopy */
-static const struct gl_texture_format *r300Choose8888TexFormat(GLenum srcFormat,
-							       GLenum srcType)
-{
-	const GLuint ui = 1;
-	const GLubyte littleEndian = *((const GLubyte *)&ui);
-
-	if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
-	    (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && !littleEndian) ||
-	    (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
-	    (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && littleEndian)) {
-		return &_mesa_texformat_rgba8888;
-	} else if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
-		   (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && littleEndian) ||
-		   (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
-		   (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && !littleEndian)) {
-		return &_mesa_texformat_rgba8888_rev;
-	} else if (srcFormat == GL_BGRA && ((srcType == GL_UNSIGNED_BYTE && !littleEndian) ||
-					    srcType == GL_UNSIGNED_INT_8_8_8_8)) {
-		return &_mesa_texformat_argb8888_rev;
-	} else if (srcFormat == GL_BGRA && ((srcType == GL_UNSIGNED_BYTE && littleEndian) ||
-					    srcType == GL_UNSIGNED_INT_8_8_8_8_REV)) {
-		return &_mesa_texformat_argb8888;
-	} else
-		return _dri_texformat_argb8888;
-}
-
-static const struct gl_texture_format *r300ChooseTextureFormat(GLcontext * ctx,
-							       GLint
-							       internalFormat,
-							       GLenum format,
-							       GLenum type)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	const GLboolean do32bpt =
-	    (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_32);
-	const GLboolean force16bpt =
-	    (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FORCE_16);
-	(void)format;
-
-#if 0
-	fprintf(stderr, "InternalFormat=%s(%d) type=%s format=%s\n",
-		_mesa_lookup_enum_by_nr(internalFormat), internalFormat,
-		_mesa_lookup_enum_by_nr(type), _mesa_lookup_enum_by_nr(format));
-	fprintf(stderr, "do32bpt=%d force16bpt=%d\n", do32bpt, force16bpt);
-#endif
-
-	switch (internalFormat) {
-	case 4:
-	case GL_RGBA:
-	case GL_COMPRESSED_RGBA:
-		switch (type) {
-		case GL_UNSIGNED_INT_10_10_10_2:
-		case GL_UNSIGNED_INT_2_10_10_10_REV:
-			return do32bpt ? _dri_texformat_argb8888 :
-			    _dri_texformat_argb1555;
-		case GL_UNSIGNED_SHORT_4_4_4_4:
-		case GL_UNSIGNED_SHORT_4_4_4_4_REV:
-			return _dri_texformat_argb4444;
-		case GL_UNSIGNED_SHORT_5_5_5_1:
-		case GL_UNSIGNED_SHORT_1_5_5_5_REV:
-			return _dri_texformat_argb1555;
-		default:
-			return do32bpt ? r300Choose8888TexFormat(format, type) :
-			    _dri_texformat_argb4444;
-		}
-
-	case 3:
-	case GL_RGB:
-	case GL_COMPRESSED_RGB:
-		switch (type) {
-		case GL_UNSIGNED_SHORT_4_4_4_4:
-		case GL_UNSIGNED_SHORT_4_4_4_4_REV:
-			return _dri_texformat_argb4444;
-		case GL_UNSIGNED_SHORT_5_5_5_1:
-		case GL_UNSIGNED_SHORT_1_5_5_5_REV:
-			return _dri_texformat_argb1555;
-		case GL_UNSIGNED_SHORT_5_6_5:
-		case GL_UNSIGNED_SHORT_5_6_5_REV:
-			return _dri_texformat_rgb565;
-		default:
-			return do32bpt ? _dri_texformat_argb8888 :
-			    _dri_texformat_rgb565;
-		}
-
-	case GL_RGBA8:
-	case GL_RGB10_A2:
-	case GL_RGBA12:
-	case GL_RGBA16:
-		return !force16bpt ?
-		    r300Choose8888TexFormat(format,
-					    type) : _dri_texformat_argb4444;
-
-	case GL_RGBA4:
-	case GL_RGBA2:
-		return _dri_texformat_argb4444;
-
-	case GL_RGB5_A1:
-		return _dri_texformat_argb1555;
-
-	case GL_RGB8:
-	case GL_RGB10:
-	case GL_RGB12:
-	case GL_RGB16:
-		return !force16bpt ? _dri_texformat_argb8888 :
-		    _dri_texformat_rgb565;
-
-	case GL_RGB5:
-	case GL_RGB4:
-	case GL_R3_G3_B2:
-		return _dri_texformat_rgb565;
-
-	case GL_ALPHA:
-	case GL_ALPHA4:
-	case GL_ALPHA8:
-	case GL_ALPHA12:
-	case GL_ALPHA16:
-	case GL_COMPRESSED_ALPHA:
-		return _dri_texformat_a8;
-
-	case 1:
-	case GL_LUMINANCE:
-	case GL_LUMINANCE4:
-	case GL_LUMINANCE8:
-	case GL_LUMINANCE12:
-	case GL_LUMINANCE16:
-	case GL_COMPRESSED_LUMINANCE:
-		return _dri_texformat_l8;
-
-	case 2:
-	case GL_LUMINANCE_ALPHA:
-	case GL_LUMINANCE4_ALPHA4:
-	case GL_LUMINANCE6_ALPHA2:
-	case GL_LUMINANCE8_ALPHA8:
-	case GL_LUMINANCE12_ALPHA4:
-	case GL_LUMINANCE12_ALPHA12:
-	case GL_LUMINANCE16_ALPHA16:
-	case GL_COMPRESSED_LUMINANCE_ALPHA:
-		return _dri_texformat_al88;
-
-	case GL_INTENSITY:
-	case GL_INTENSITY4:
-	case GL_INTENSITY8:
-	case GL_INTENSITY12:
-	case GL_INTENSITY16:
-	case GL_COMPRESSED_INTENSITY:
-		return _dri_texformat_i8;
-
-	case GL_YCBCR_MESA:
-		if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
-		    type == GL_UNSIGNED_BYTE)
-			return &_mesa_texformat_ycbcr;
-		else
-			return &_mesa_texformat_ycbcr_rev;
-
-	case GL_RGB_S3TC:
-	case GL_RGB4_S3TC:
-	case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
-		return &_mesa_texformat_rgb_dxt1;
-
-	case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
-		return &_mesa_texformat_rgba_dxt1;
-
-	case GL_RGBA_S3TC:
-	case GL_RGBA4_S3TC:
-	case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
-		return &_mesa_texformat_rgba_dxt3;
-
-	case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
-		return &_mesa_texformat_rgba_dxt5;
-
-	case GL_ALPHA16F_ARB:
-		return &_mesa_texformat_alpha_float16;
-	case GL_ALPHA32F_ARB:
-		return &_mesa_texformat_alpha_float32;
-	case GL_LUMINANCE16F_ARB:
-		return &_mesa_texformat_luminance_float16;
-	case GL_LUMINANCE32F_ARB:
-		return &_mesa_texformat_luminance_float32;
-	case GL_LUMINANCE_ALPHA16F_ARB:
-		return &_mesa_texformat_luminance_alpha_float16;
-	case GL_LUMINANCE_ALPHA32F_ARB:
-		return &_mesa_texformat_luminance_alpha_float32;
-	case GL_INTENSITY16F_ARB:
-		return &_mesa_texformat_intensity_float16;
-	case GL_INTENSITY32F_ARB:
-		return &_mesa_texformat_intensity_float32;
-	case GL_RGB16F_ARB:
-		return &_mesa_texformat_rgba_float16;
-	case GL_RGB32F_ARB:
-		return &_mesa_texformat_rgba_float32;
-	case GL_RGBA16F_ARB:
-		return &_mesa_texformat_rgba_float16;
-	case GL_RGBA32F_ARB:
-		return &_mesa_texformat_rgba_float32;
-
-	case GL_DEPTH_COMPONENT:
-	case GL_DEPTH_COMPONENT16:
-	case GL_DEPTH_COMPONENT24:
-	case GL_DEPTH_COMPONENT32:
-#if 0
-		switch (type) {
-		case GL_UNSIGNED_BYTE:
-		case GL_UNSIGNED_SHORT:
-			return &_mesa_texformat_z16;
-		case GL_UNSIGNED_INT:
-			return &_mesa_texformat_z32;
-		case GL_UNSIGNED_INT_24_8_EXT:
-		default:
-			return &_mesa_texformat_z24_s8;
-		}
-#else
-		return &_mesa_texformat_z16;
-#endif
-
-	default:
-		_mesa_problem(ctx,
-			      "unexpected internalFormat 0x%x in r300ChooseTextureFormat",
-			      (int)internalFormat);
-		return NULL;
-	}
-
-	return NULL;		/* never get here */
-}
-
-static GLboolean
-r300ValidateClientStorage(GLcontext * ctx, GLenum target,
-			  GLint internalFormat,
-			  GLint srcWidth, GLint srcHeight,
-			  GLenum format, GLenum type, const void *pixels,
-			  const struct gl_pixelstore_attrib *packing,
-			  struct gl_texture_object *texObj,
-			  struct gl_texture_image *texImage)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr, "intformat %s format %s type %s\n",
-			_mesa_lookup_enum_by_nr(internalFormat),
-			_mesa_lookup_enum_by_nr(format),
-			_mesa_lookup_enum_by_nr(type));
-
-	if (!ctx->Unpack.ClientStorage)
-		return 0;
-
-	if (ctx->_ImageTransferState ||
-	    texImage->IsCompressed || texObj->GenerateMipmap)
-		return 0;
-
-	/* This list is incomplete, may be different on ppc???
-	 */
-	switch (internalFormat) {
-	case GL_RGBA:
-		if (format == GL_BGRA && type == GL_UNSIGNED_INT_8_8_8_8_REV) {
-			texImage->TexFormat = _dri_texformat_argb8888;
-		} else
-			return 0;
-		break;
-
-	case GL_RGB:
-		if (format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5) {
-			texImage->TexFormat = _dri_texformat_rgb565;
-		} else
-			return 0;
-		break;
-
-	case GL_YCBCR_MESA:
-		if (format == GL_YCBCR_MESA &&
-		    type == GL_UNSIGNED_SHORT_8_8_REV_APPLE) {
-			texImage->TexFormat = &_mesa_texformat_ycbcr_rev;
-		} else if (format == GL_YCBCR_MESA &&
-			   (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
-			    type == GL_UNSIGNED_BYTE)) {
-			texImage->TexFormat = &_mesa_texformat_ycbcr;
-		} else
-			return 0;
-		break;
-
-	default:
-		return 0;
-	}
-
-	/* Could deal with these packing issues, but currently don't:
-	 */
-	if (packing->SkipPixels ||
-	    packing->SkipRows || packing->SwapBytes || packing->LsbFirst) {
-		return 0;
-	}
-
-	GLint srcRowStride = _mesa_image_row_stride(packing, srcWidth,
-						    format, type);
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr, "%s: srcRowStride %d/%x\n",
-			__FUNCTION__, srcRowStride, srcRowStride);
-
-	/* Could check this later in upload, pitch restrictions could be
-	 * relaxed, but would need to store the image pitch somewhere,
-	 * as packing details might change before image is uploaded:
-	 */
-	if (!r300IsGartMemory(rmesa, pixels, srcHeight * srcRowStride)
-	    || (srcRowStride & 63))
-		return 0;
-
-	/* Have validated that _mesa_transfer_teximage would be a straight
-	 * memcpy at this point.  NOTE: future calls to TexSubImage will
-	 * overwrite the client data.  This is explicitly mentioned in the
-	 * extension spec.
-	 */
-	texImage->Data = (void *)pixels;
-	texImage->IsClientData = GL_TRUE;
-	texImage->RowStride = srcRowStride / texImage->TexFormat->TexelBytes;
-
-	return 1;
-}
-
-static void r300TexImage1D(GLcontext * ctx, GLenum target, GLint level,
-			   GLint internalFormat,
-			   GLint width, GLint border,
-			   GLenum format, GLenum type, const GLvoid * pixels,
-			   const struct gl_pixelstore_attrib *packing,
-			   struct gl_texture_object *texObj,
-			   struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage1D");
-			return;
-		}
-	}
-
-	/* Note, this will call ChooseTextureFormat */
-	_mesa_store_teximage1d(ctx, target, level, internalFormat,
-			       width, border, format, type, pixels,
-			       &ctx->Unpack, texObj, texImage);
-
-	t->dirty_images[0] |= (1 << level);
-}
-
-static void r300TexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
-			      GLint xoffset,
-			      GLsizei width,
-			      GLenum format, GLenum type,
-			      const GLvoid * pixels,
-			      const struct gl_pixelstore_attrib *packing,
-			      struct gl_texture_object *texObj,
-			      struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-
-	assert(t);		/* this _should_ be true */
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage1D");
-			return;
-		}
-	}
-
-	_mesa_store_texsubimage1d(ctx, target, level, xoffset, width,
-				  format, type, pixels, packing, texObj,
-				  texImage);
-
-	t->dirty_images[0] |= (1 << level);
-}
-
-static void r300TexImage2D(GLcontext * ctx, GLenum target, GLint level,
-			   GLint internalFormat,
-			   GLint width, GLint height, GLint border,
-			   GLenum format, GLenum type, const GLvoid * pixels,
-			   const struct gl_pixelstore_attrib *packing,
-			   struct gl_texture_object *texObj,
-			   struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-	GLuint face;
-
-	/* which cube face or ordinary 2D image */
-	switch (target) {
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		face =
-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-		ASSERT(face < 6);
-		break;
-	default:
-		face = 0;
-	}
-
-	if (t != NULL) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage2D");
-			return;
-		}
-	}
-
-	texImage->IsClientData = GL_FALSE;
-
-	if (r300ValidateClientStorage(ctx, target,
-				      internalFormat,
-				      width, height,
-				      format, type, pixels,
-				      packing, texObj, texImage)) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using client storage\n",
-				__FUNCTION__);
-	} else {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using normal storage\n",
-				__FUNCTION__);
-
-		/* Normal path: copy (to cached memory) and eventually upload
-		 * via another copy to GART memory and then a blit...  Could
-		 * eliminate one copy by going straight to (permanent) GART.
-		 *
-		 * Note, this will call r300ChooseTextureFormat.
-		 */
-		_mesa_store_teximage2d(ctx, target, level, internalFormat,
-				       width, height, border, format, type,
-				       pixels, &ctx->Unpack, texObj, texImage);
-
-		t->dirty_images[face] |= (1 << level);
-	}
-}
-
-static void r300TexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
-			      GLint xoffset, GLint yoffset,
-			      GLsizei width, GLsizei height,
-			      GLenum format, GLenum type,
-			      const GLvoid * pixels,
-			      const struct gl_pixelstore_attrib *packing,
-			      struct gl_texture_object *texObj,
-			      struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-	GLuint face;
-
-	/* which cube face or ordinary 2D image */
-	switch (target) {
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		face =
-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-		ASSERT(face < 6);
-		break;
-	default:
-		face = 0;
-	}
-
-	assert(t);		/* this _should_ be true */
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage2D");
-			return;
-		}
-	}
-
-	_mesa_store_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
-				  height, format, type, pixels, packing, texObj,
-				  texImage);
-
-	t->dirty_images[face] |= (1 << level);
-}
-
-static void r300CompressedTexImage2D(GLcontext * ctx, GLenum target,
-				     GLint level, GLint internalFormat,
-				     GLint width, GLint height, GLint border,
-				     GLsizei imageSize, const GLvoid * data,
-				     struct gl_texture_object *texObj,
-				     struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-	GLuint face;
-
-	/* which cube face or ordinary 2D image */
-	switch (target) {
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		face =
-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-		ASSERT(face < 6);
-		break;
-	default:
-		face = 0;
-	}
-
-	if (t != NULL) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY,
-				    "glCompressedTexImage2D");
-			return;
-		}
-	}
-
-	texImage->IsClientData = GL_FALSE;
-
-	/* can't call this, different parameters. Would never evaluate to true anyway currently */
-#if 0
-	if (r300ValidateClientStorage(ctx, target,
-				      internalFormat,
-				      width, height,
-				      format, type, pixels,
-				      packing, texObj, texImage)) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using client storage\n",
-				__FUNCTION__);
-	} else
-#endif
-	{
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using normal storage\n",
-				__FUNCTION__);
-
-		/* Normal path: copy (to cached memory) and eventually upload
-		 * via another copy to GART memory and then a blit...  Could
-		 * eliminate one copy by going straight to (permanent) GART.
-		 *
-		 * Note, this will call r300ChooseTextureFormat.
-		 */
-		_mesa_store_compressed_teximage2d(ctx, target, level,
-						  internalFormat, width, height,
-						  border, imageSize, data,
-						  texObj, texImage);
-
-		t->dirty_images[face] |= (1 << level);
-	}
-}
-
-static void r300CompressedTexSubImage2D(GLcontext * ctx, GLenum target,
-					GLint level, GLint xoffset,
-					GLint yoffset, GLsizei width,
-					GLsizei height, GLenum format,
-					GLsizei imageSize, const GLvoid * data,
-					struct gl_texture_object *texObj,
-					struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-	GLuint face;
-
-	/* which cube face or ordinary 2D image */
-	switch (target) {
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		face =
-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-		ASSERT(face < 6);
-		break;
-	default:
-		face = 0;
-	}
-
-	assert(t);		/* this _should_ be true */
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY,
-				    "glCompressedTexSubImage3D");
-			return;
-		}
-	}
-
-	_mesa_store_compressed_texsubimage2d(ctx, target, level, xoffset,
-					     yoffset, width, height, format,
-					     imageSize, data, texObj, texImage);
-
-	t->dirty_images[face] |= (1 << level);
-}
-
-static void r300TexImage3D(GLcontext * ctx, GLenum target, GLint level,
-			   GLint internalFormat,
-			   GLint width, GLint height, GLint depth,
-			   GLint border,
-			   GLenum format, GLenum type, const GLvoid * pixels,
-			   const struct gl_pixelstore_attrib *packing,
-			   struct gl_texture_object *texObj,
-			   struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage3D");
-			return;
-		}
-	}
-
-	texImage->IsClientData = GL_FALSE;
-
-#if 0
-	if (r300ValidateClientStorage(ctx, target,
-				      internalFormat,
-				      width, height,
-				      format, type, pixels,
-				      packing, texObj, texImage)) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using client storage\n",
-				__FUNCTION__);
-	} else
-#endif
-	{
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using normal storage\n",
-				__FUNCTION__);
-
-		/* Normal path: copy (to cached memory) and eventually upload
-		 * via another copy to GART memory and then a blit...  Could
-		 * eliminate one copy by going straight to (permanent) GART.
-		 *
-		 * Note, this will call r300ChooseTextureFormat.
-		 */
-		_mesa_store_teximage3d(ctx, target, level, internalFormat,
-				       width, height, depth, border,
-				       format, type, pixels,
-				       &ctx->Unpack, texObj, texImage);
-
-		t->dirty_images[0] |= (1 << level);
-	}
-}
-
-static void
-r300TexSubImage3D(GLcontext * ctx, GLenum target, GLint level,
-		  GLint xoffset, GLint yoffset, GLint zoffset,
-		  GLsizei width, GLsizei height, GLsizei depth,
-		  GLenum format, GLenum type,
-		  const GLvoid * pixels,
-		  const struct gl_pixelstore_attrib *packing,
-		  struct gl_texture_object *texObj,
-		  struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-
-/*     fprintf(stderr, "%s\n", __FUNCTION__); */
-
-	assert(t);		/* this _should_ be true */
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage3D");
-			return;
-		}
-		texObj->DriverData = t;
-	}
-
-	_mesa_store_texsubimage3d(ctx, target, level, xoffset, yoffset, zoffset,
-				  width, height, depth,
-				  format, type, pixels, packing, texObj,
-				  texImage);
-
-	t->dirty_images[0] |= (1 << level);
-}
-
-/**
  * Changes variables and flags for a state update, which will happen at the
  * next UpdateTextureState
  */
@@ -908,9 +195,9 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 			     struct gl_texture_object *texObj,
 			     GLenum pname, const GLfloat * params)
 {
-	r300TexObjPtr t = (r300TexObjPtr) texObj->DriverData;
+	radeonTexObj* t = radeon_tex_obj(texObj);
 
-	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
+	if (RADEON_DEBUG & (RADEON_STATE | RADEON_TEXTURE)) {
 		fprintf(stderr, "%s( %s )\n", __FUNCTION__,
 			_mesa_lookup_enum_by_nr(pname));
 	}
@@ -929,7 +216,7 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 		break;
 
 	case GL_TEXTURE_BORDER_COLOR:
-		r300SetTexBorderColor(t, texObj->_BorderChan);
+		r300SetTexBorderColor(t, texObj->BorderColor);
 		break;
 
 	case GL_TEXTURE_BASE_LEVEL:
@@ -941,7 +228,11 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 		 * we just have to rely on loading the right subset of mipmap levels
 		 * to simulate a clamped LOD.
 		 */
-		driSwapOutTextureObject((driTextureObject *) t);
+		if (t->mt) {
+			radeon_miptree_unreference(t->mt);
+			t->mt = 0;
+			t->validated = GL_FALSE;
+		}
 		break;
 
 	case GL_DEPTH_TEXTURE_MODE:
@@ -964,42 +255,35 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 	}
 }
 
-static void r300BindTexture(GLcontext * ctx, GLenum target,
-			    struct gl_texture_object *texObj)
-{
-	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
-		fprintf(stderr, "%s( %p ) unit=%d\n", __FUNCTION__,
-			(void *)texObj, ctx->Texture.CurrentUnit);
-	}
-
-	if ((target == GL_TEXTURE_1D)
-	    || (target == GL_TEXTURE_2D)
-	    || (target == GL_TEXTURE_3D)
-	    || (target == GL_TEXTURE_CUBE_MAP)
-	    || (target == GL_TEXTURE_RECTANGLE_NV)) {
-		assert(texObj->DriverData != NULL);
-	}
-}
-
 static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+	radeonTexObj* t = radeon_tex_obj(texObj);
 
-	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
+	if (RADEON_DEBUG & (RADEON_STATE | RADEON_TEXTURE)) {
 		fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
 			(void *)texObj,
 			_mesa_lookup_enum_by_nr(texObj->Target));
 	}
 
-	if (t != NULL) {
-		if (rmesa) {
-			R300_FIREVERTICES(rmesa);
-		}
+	if (rmesa) {
+		int i;
+		radeon_firevertices(&rmesa->radeon);
+
+		for(i = 0; i < R300_MAX_TEXTURE_UNITS; ++i)
+			if (rmesa->hw.textures[i] == t)
+				rmesa->hw.textures[i] = 0;
+	}
 
-		driDestroyTextureObject(t);
+	if (t->bo) {
+		radeon_bo_unref(t->bo);
+		t->bo = NULL;
+	}
+
+	if (t->mt) {
+		radeon_miptree_unreference(t->mt);
+		t->mt = 0;
 	}
-	/* Free mipmap images and the texture object itself */
 	_mesa_delete_texture_object(ctx, texObj);
 }
 
@@ -1008,8 +292,6 @@ static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
  * Called via ctx->Driver.NewTextureObject.
  * Note: this function will be called during context creation to
  * allocate the default texture objects.
- * Note: we could use containment here to 'derive' the driver-specific
- * texture object from the core mesa gl_texture_object.  Not done at this time.
  * Fixup MaxAnisotropy according to user preference.
  */
 static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
@@ -1017,14 +299,23 @@ static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
 						      GLenum target)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_object *obj;
-	obj = _mesa_new_texture_object(ctx, name, target);
-	if (!obj)
-		return NULL;
-	obj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
+	radeonTexObj* t = CALLOC_STRUCT(radeon_tex_obj);
+
 
-	r300AllocTexObj(obj);
-	return obj;
+	if (RADEON_DEBUG & (RADEON_STATE | RADEON_TEXTURE)) {
+		fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
+			t, _mesa_lookup_enum_by_nr(target));
+	}
+
+	_mesa_initialize_texture_object(&t->base, name, target);
+	t->base.MaxAnisotropy = rmesa->radeon.initialMaxAnisotropy;
+
+	/* Initialize hardware state */
+	r300UpdateTexWrap(t);
+	r300SetTexFilter(t, t->base.MinFilter, t->base.MagFilter, t->base.MaxAnisotropy);
+	r300SetTexBorderColor(t, t->base.BorderColor);
+
+	return &t->base;
 }
 
 void r300InitTextureFuncs(struct dd_function_table *functions)
@@ -1032,22 +323,30 @@ void r300InitTextureFuncs(struct dd_function_table *functions)
 	/* Note: we only plug in the functions we implement in the driver
 	 * since _mesa_init_driver_functions() was already called.
 	 */
-	functions->ChooseTextureFormat = r300ChooseTextureFormat;
-	functions->TexImage1D = r300TexImage1D;
-	functions->TexImage2D = r300TexImage2D;
-	functions->TexImage3D = r300TexImage3D;
-	functions->TexSubImage1D = r300TexSubImage1D;
-	functions->TexSubImage2D = r300TexSubImage2D;
-	functions->TexSubImage3D = r300TexSubImage3D;
+	functions->NewTextureImage = radeonNewTextureImage;
+	functions->FreeTexImageData = radeonFreeTexImageData;
+	functions->MapTexture = radeonMapTexture;
+	functions->UnmapTexture = radeonUnmapTexture;
+
+	functions->ChooseTextureFormat = radeonChooseTextureFormat_mesa;
+	functions->TexImage1D = radeonTexImage1D;
+	functions->TexImage2D = radeonTexImage2D;
+	functions->TexImage3D = radeonTexImage3D;
+	functions->TexSubImage1D = radeonTexSubImage1D;
+	functions->TexSubImage2D = radeonTexSubImage2D;
+	functions->TexSubImage3D = radeonTexSubImage3D;
+	functions->GetTexImage = radeonGetTexImage;
+	functions->GetCompressedTexImage = radeonGetCompressedTexImage;
 	functions->NewTextureObject = r300NewTextureObject;
-	functions->BindTexture = r300BindTexture;
 	functions->DeleteTexture = r300DeleteTexture;
 	functions->IsTextureResident = driIsTextureResident;
 
 	functions->TexParameter = r300TexParameter;
 
-	functions->CompressedTexImage2D = r300CompressedTexImage2D;
-	functions->CompressedTexSubImage2D = r300CompressedTexSubImage2D;
+	functions->CompressedTexImage2D = radeonCompressedTexImage2D;
+	functions->CompressedTexSubImage2D = radeonCompressedTexSubImage2D;
+
+	functions->GenerateMipmap = radeonGenerateMipmap;
 
 	driInitTextureFormats();
 }
diff --git a/src/mesa/drivers/dri/r300/r300_tex.h b/src/mesa/drivers/dri/r300/r300_tex.h
index b86d45bfe0..8a653ea2d1 100644
--- a/src/mesa/drivers/dri/r300/r300_tex.h
+++ b/src/mesa/drivers/dri/r300/r300_tex.h
@@ -37,16 +37,17 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 extern void r300SetDepthTexMode(struct gl_texture_object *tObj);
 
+extern void r300SetTexBuffer(__DRIcontext *pDRICtx, GLint target,
+			     __DRIdrawable *dPriv);
+
+extern void r300SetTexBuffer2(__DRIcontext *pDRICtx, GLint target,
+			      GLint format, __DRIdrawable *dPriv);
+
 extern void r300SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 			     unsigned long long offset, GLint depth,
 			     GLuint pitch);
 
-extern void r300UpdateTextureState(GLcontext * ctx);
-
-extern int r300UploadTexImages(r300ContextPtr rmesa, r300TexObjPtr t,
-			       GLuint face);
-
-extern void r300DestroyTexObj(r300ContextPtr rmesa, r300TexObjPtr t);
+extern GLboolean r300ValidateBuffers(GLcontext * ctx);
 
 extern void r300InitTextureFuncs(struct dd_function_table *functions);
 
diff --git a/src/mesa/drivers/dri/r300/r300_texmem.c b/src/mesa/drivers/dri/r300/r300_texmem.c
deleted file mode 100644
index b03eefaa7c..0000000000
--- a/src/mesa/drivers/dri/r300/r300_texmem.c
+++ /dev/null
@@ -1,567 +0,0 @@
-/**************************************************************************
-
-Copyright (C) Tungsten Graphics 2002.  All Rights Reserved.
-The Weather Channel, Inc. funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86
-license. This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation on the rights to use, copy, modify, merge, publish,
-distribute, sub license, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NON-INFRINGEMENT. IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR THEIR
-SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
-IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-**************************************************************************/
-
-/**
- * \file
- *
- * \author Gareth Hughes <gareth@valinux.com>
- *
- * \author Kevin E. Martin <martin@valinux.com>
- */
-
-#include <errno.h>
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/context.h"
-#include "main/colormac.h"
-#include "main/macros.h"
-#include "main/simple_list.h"
-#include "radeon_reg.h"		/* gets definition for usleep */
-#include "r300_context.h"
-#include "r300_state.h"
-#include "r300_cmdbuf.h"
-#include "radeon_ioctl.h"
-#include "r300_tex.h"
-#include "r300_ioctl.h"
-#include <unistd.h>		/* for usleep() */
-
-#ifdef USER_BUFFERS
-#include "r300_mem.h"
-#endif
-
-/**
- * Destroy any device-dependent state associated with the texture.  This may
- * include NULLing out hardware state that points to the texture.
- */
-void r300DestroyTexObj(r300ContextPtr rmesa, r300TexObjPtr t)
-{
-	int i;
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE) {
-		fprintf(stderr, "%s( %p, %p )\n", __FUNCTION__,
-			(void *)t, (void *)t->base.tObj);
-	}
-
-	for (i = 0; i < rmesa->radeon.glCtx->Const.MaxTextureUnits; i++) {
-		if (rmesa->state.texture.unit[i].texobj == t) {
-			rmesa->state.texture.unit[i].texobj = NULL;
-		}
-	}
-}
-
-/* ------------------------------------------------------------
- * Texture image conversions
- */
-
-static void r300UploadGARTClientSubImage(r300ContextPtr rmesa,
-					 r300TexObjPtr t,
-					 struct gl_texture_image *texImage,
-					 GLint hwlevel,
-					 GLint x, GLint y,
-					 GLint width, GLint height)
-{
-	const struct gl_texture_format *texFormat = texImage->TexFormat;
-	GLuint srcPitch, dstPitch;
-	int blit_format;
-	int srcOffset;
-
-	/*
-	 * XXX it appears that we always upload the full image, not a subimage.
-	 * I.e. x==0, y==0, width=texWidth, height=texWidth.  If this is ever
-	 * changed, the src pitch will have to change.
-	 */
-	switch (texFormat->TexelBytes) {
-	case 1:
-		blit_format = R300_CP_COLOR_FORMAT_CI8;
-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-		break;
-	case 2:
-		blit_format = R300_CP_COLOR_FORMAT_RGB565;
-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-		break;
-	case 4:
-		blit_format = R300_CP_COLOR_FORMAT_ARGB8888;
-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-		break;
-	case 8:
-	case 16:
-		blit_format = R300_CP_COLOR_FORMAT_CI8;
-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-		break;
-	default:
-		return;
-	}
-
-	t->image[0][hwlevel].data = texImage->Data;
-	srcOffset = r300GartOffsetFromVirtual(rmesa, texImage->Data);
-
-	assert(srcOffset != ~0);
-
-	/* Don't currently need to cope with small pitches?
-	 */
-	width = texImage->Width;
-	height = texImage->Height;
-
-	if (texFormat->TexelBytes > 4) {
-		width *= texFormat->TexelBytes;
-	}
-
-	r300EmitWait(rmesa, R300_WAIT_3D);
-
-	r300EmitBlit(rmesa, blit_format,
-		     srcPitch,
-		     srcOffset,
-		     dstPitch,
-		     t->bufAddr,
-		     x,
-		     y,
-		     t->image[0][hwlevel].x + x,
-		     t->image[0][hwlevel].y + y, width, height);
-
-	r300EmitWait(rmesa, R300_WAIT_2D);
-}
-
-static void r300UploadRectSubImage(r300ContextPtr rmesa,
-				   r300TexObjPtr t,
-				   struct gl_texture_image *texImage,
-				   GLint x, GLint y, GLint width, GLint height)
-{
-	const struct gl_texture_format *texFormat = texImage->TexFormat;
-	int blit_format, dstPitch, done;
-
-	switch (texFormat->TexelBytes) {
-	case 1:
-		blit_format = R300_CP_COLOR_FORMAT_CI8;
-		break;
-	case 2:
-		blit_format = R300_CP_COLOR_FORMAT_RGB565;
-		break;
-	case 4:
-		blit_format = R300_CP_COLOR_FORMAT_ARGB8888;
-		break;
-	case 8:
-	case 16:
-		blit_format = R300_CP_COLOR_FORMAT_CI8;
-		break;
-	default:
-		return;
-	}
-
-	t->image[0][0].data = texImage->Data;
-
-	/* Currently don't need to cope with small pitches.
-	 */
-	width = texImage->Width;
-	height = texImage->Height;
-	dstPitch = t->pitch;
-
-	if (texFormat->TexelBytes > 4) {
-		width *= texFormat->TexelBytes;
-	}
-
-	if (rmesa->prefer_gart_client_texturing && texImage->IsClientData) {
-		/* In this case, could also use GART texturing.  This is
-		 * currently disabled, but has been tested & works.
-		 */
-		t->offset = r300GartOffsetFromVirtual(rmesa, texImage->Data);
-		t->pitch = texImage->RowStride * texFormat->TexelBytes - 32;
-
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr,
-				"Using GART texturing for rectangular client texture\n");
-
-		/* Release FB memory allocated for this image:
-		 */
-		/* FIXME This may not be correct as driSwapOutTextureObject sets
-		 * FIXME dirty_images.  It may be fine, though.
-		 */
-		if (t->base.memBlock) {
-			driSwapOutTextureObject((driTextureObject *) t);
-		}
-	} else if (texImage->IsClientData) {
-		/* Data already in GART memory, with usable pitch.
-		 */
-		GLuint srcPitch;
-		srcPitch = texImage->RowStride * texFormat->TexelBytes;
-		r300EmitBlit(rmesa,
-			     blit_format,
-			     srcPitch,
-			     r300GartOffsetFromVirtual(rmesa, texImage->Data),
-			     dstPitch, t->bufAddr, 0, 0, 0, 0, width, height);
-	} else {
-		/* Data not in GART memory, or bad pitch.
-		 */
-		for (done = 0; done < height;) {
-			struct r300_dma_region region;
-			int lines =
-			    MIN2(height - done, RADEON_BUFFER_SIZE / dstPitch);
-			int src_pitch;
-			char *tex;
-
-			src_pitch = texImage->RowStride * texFormat->TexelBytes;
-
-			tex = (char *)texImage->Data + done * src_pitch;
-
-			memset(&region, 0, sizeof(region));
-			r300AllocDmaRegion(rmesa, &region, lines * dstPitch,
-					   1024);
-
-			/* Copy texdata to dma:
-			 */
-			if (RADEON_DEBUG & DEBUG_TEXTURE)
-				fprintf(stderr,
-					"%s: src_pitch %d dst_pitch %d\n",
-					__FUNCTION__, src_pitch, dstPitch);
-
-			if (src_pitch == dstPitch) {
-				memcpy(region.address + region.start, tex,
-				       lines * src_pitch);
-			} else {
-				char *buf = region.address + region.start;
-				int i;
-				for (i = 0; i < lines; i++) {
-					memcpy(buf, tex, src_pitch);
-					buf += dstPitch;
-					tex += src_pitch;
-				}
-			}
-
-			r300EmitWait(rmesa, R300_WAIT_3D);
-
-			/* Blit to framebuffer
-			 */
-			r300EmitBlit(rmesa,
-				     blit_format,
-				     dstPitch, GET_START(&region),
-				     dstPitch | (t->tile_bits >> 16),
-				     t->bufAddr, 0, 0, 0, done, width, lines);
-
-			r300EmitWait(rmesa, R300_WAIT_2D);
-#ifdef USER_BUFFERS
-			r300_mem_use(rmesa, region.buf->id);
-#endif
-
-			r300ReleaseDmaRegion(rmesa, &region, __FUNCTION__);
-			done += lines;
-		}
-	}
-}
-
-/**
- * Upload the texture image associated with texture \a t at the specified
- * level at the address relative to \a start.
- */
-static void r300UploadSubImage(r300ContextPtr rmesa, r300TexObjPtr t,
-			       GLint hwlevel,
-			       GLint x, GLint y, GLint width, GLint height,
-			       GLuint face)
-{
-	struct gl_texture_image *texImage = NULL;
-	GLuint offset;
-	GLint imageWidth, imageHeight;
-	GLint ret;
-	drm_radeon_texture_t tex;
-	drm_radeon_tex_image_t tmp;
-	const int level = hwlevel + t->base.firstLevel;
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE) {
-		fprintf(stderr,
-			"%s( %p, %p ) level/width/height/face = %d/%d/%d/%u\n",
-			__FUNCTION__, (void *)t, (void *)t->base.tObj, level,
-			width, height, face);
-	}
-
-	ASSERT(face < 6);
-
-	/* Ensure we have a valid texture to upload */
-	if ((hwlevel < 0) || (hwlevel >= RADEON_MAX_TEXTURE_LEVELS)) {
-		_mesa_problem(NULL, "bad texture level in %s", __FUNCTION__);
-		return;
-	}
-
-	texImage = t->base.tObj->Image[face][level];
-
-	if (!texImage) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: texImage %d is NULL!\n",
-				__FUNCTION__, level);
-		return;
-	}
-	if (!texImage->Data) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: image data is NULL!\n",
-				__FUNCTION__);
-		return;
-	}
-
-	if (t->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-		assert(level == 0);
-		assert(hwlevel == 0);
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: image data is rectangular\n",
-				__FUNCTION__);
-		r300UploadRectSubImage(rmesa, t, texImage, x, y, width, height);
-		return;
-	} else if (texImage->IsClientData) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr,
-				"%s: image data is in GART client storage\n",
-				__FUNCTION__);
-		r300UploadGARTClientSubImage(rmesa, t, texImage, hwlevel, x, y,
-					     width, height);
-		return;
-	} else if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr, "%s: image data is in normal memory\n",
-			__FUNCTION__);
-
-	imageWidth = texImage->Width;
-	imageHeight = texImage->Height;
-
-	offset = t->bufAddr;
-
-	if (RADEON_DEBUG & (DEBUG_TEXTURE | DEBUG_IOCTL)) {
-		GLint imageX = 0;
-		GLint imageY = 0;
-		GLint blitX = t->image[face][hwlevel].x;
-		GLint blitY = t->image[face][hwlevel].y;
-		GLint blitWidth = t->image[face][hwlevel].width;
-		GLint blitHeight = t->image[face][hwlevel].height;
-		fprintf(stderr, "   upload image: %d,%d at %d,%d\n",
-			imageWidth, imageHeight, imageX, imageY);
-		fprintf(stderr, "   upload  blit: %d,%d at %d,%d\n",
-			blitWidth, blitHeight, blitX, blitY);
-		fprintf(stderr, "       blit ofs: 0x%07x level: %d/%d\n",
-			(GLuint) offset, hwlevel, level);
-	}
-
-	t->image[face][hwlevel].data = texImage->Data;
-
-	/* Init the DRM_RADEON_TEXTURE command / drm_radeon_texture_t struct.
-	 * NOTE: we're always use a 1KB-wide blit and I8 texture format.
-	 * We used to use 1, 2 and 4-byte texels and used to use the texture
-	 * width to dictate the blit width - but that won't work for compressed
-	 * textures. (Brian)
-	 * NOTE: can't do that with texture tiling. (sroland)
-	 */
-	tex.offset = offset;
-	tex.image = &tmp;
-	/* copy (x,y,width,height,data) */
-	memcpy(&tmp, &t->image[face][hwlevel], sizeof(tmp));
-
-	if (texImage->TexFormat->TexelBytes > 4) {
-		const int log2TexelBytes =
-		    (3 + (texImage->TexFormat->TexelBytes >> 4));
-		tex.format = RADEON_TXFORMAT_I8;	/* any 1-byte texel format */
-		tex.pitch =
-		    MAX2((texImage->Width * texImage->TexFormat->TexelBytes) /
-			 64, 1);
-		tex.height = imageHeight;
-		tex.width = imageWidth << log2TexelBytes;
-		tex.offset += (tmp.x << log2TexelBytes) & ~1023;
-		tmp.x = tmp.x % (1024 >> log2TexelBytes);
-		tmp.width = tmp.width << log2TexelBytes;
-	} else if (texImage->TexFormat->TexelBytes) {
-		/* use multi-byte upload scheme */
-		tex.height = imageHeight;
-		tex.width = imageWidth;
-		switch (texImage->TexFormat->TexelBytes) {
-		case 1:
-			tex.format = RADEON_TXFORMAT_I8;
-			break;
-		case 2:
-			tex.format = RADEON_TXFORMAT_AI88;
-			break;
-		case 4:
-			tex.format = RADEON_TXFORMAT_ARGB8888;
-			break;
-		}
-		tex.pitch =
-		    MAX2((texImage->Width * texImage->TexFormat->TexelBytes) /
-			 64, 1);
-		tex.offset += tmp.x & ~1023;
-		tmp.x = tmp.x % 1024;
-
-		if (t->tile_bits & R300_TXO_MICRO_TILE) {
-			/* need something like "tiled coordinates" ? */
-			tmp.y = tmp.x / (tex.pitch * 128) * 2;
-			tmp.x =
-			    tmp.x % (tex.pitch * 128) / 2 /
-			    texImage->TexFormat->TexelBytes;
-			tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
-		} else {
-			tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
-		}
-#if 1
-		if ((t->tile_bits & R300_TXO_MACRO_TILE) &&
-		    (texImage->Width * texImage->TexFormat->TexelBytes >= 256)
-		    && ((!(t->tile_bits & R300_TXO_MICRO_TILE)
-			 && (texImage->Height >= 8))
-			|| (texImage->Height >= 16))) {
-			/* weird: R200 disables macro tiling if mip width is smaller than 256 bytes,
-			   OR if height is smaller than 8 automatically, but if micro tiling is active
-			   the limit is height 16 instead ? */
-			tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
-		}
-#endif
-	} else {
-		/* In case of for instance 8x8 texture (2x2 dxt blocks),
-		   padding after the first two blocks is needed (only
-		   with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). */
-		/* set tex.height to 1/4 since 1 "macropixel" (dxt-block)
-		   has 4 real pixels. Needed so the kernel module reads
-		   the right amount of data. */
-		tex.format = RADEON_TXFORMAT_I8;	/* any 1-byte texel format */
-		tex.pitch = (R300_BLIT_WIDTH_BYTES / 64);
-		tex.height = (imageHeight + 3) / 4;
-		tex.width = (imageWidth + 3) / 4;
-		if ((t->format & R300_TX_FORMAT_DXT1) == R300_TX_FORMAT_DXT1) {
-			tex.width *= 8;
-		} else {
-			tex.width *= 16;
-		}
-	}
-
-	LOCK_HARDWARE(&rmesa->radeon);
-	do {
-		ret =
-		    drmCommandWriteRead(rmesa->radeon.dri.fd,
-					DRM_RADEON_TEXTURE, &tex,
-					sizeof(drm_radeon_texture_t));
-		if (ret) {
-			if (RADEON_DEBUG & DEBUG_IOCTL)
-				fprintf(stderr,
-					"DRM_RADEON_TEXTURE:  again!\n");
-			usleep(1);
-		}
-	} while (ret == -EAGAIN);
-
-	UNLOCK_HARDWARE(&rmesa->radeon);
-
-	if (ret) {
-		fprintf(stderr, "DRM_RADEON_TEXTURE: return = %d\n", ret);
-		fprintf(stderr, "   offset=0x%08x\n", offset);
-		fprintf(stderr, "   image width=%d height=%d\n",
-			imageWidth, imageHeight);
-		fprintf(stderr, "    blit width=%d height=%d data=%p\n",
-			t->image[face][hwlevel].width,
-			t->image[face][hwlevel].height,
-			t->image[face][hwlevel].data);
-		_mesa_exit(-1);
-	}
-}
-
-/**
- * Upload the texture images associated with texture \a t.  This might
- * require the allocation of texture memory.
- *
- * \param rmesa Context pointer
- * \param t Texture to be uploaded
- * \param face Cube map face to be uploaded.  Zero for non-cube maps.
- */
-
-int r300UploadTexImages(r300ContextPtr rmesa, r300TexObjPtr t, GLuint face)
-{
-	const int numLevels = t->base.lastLevel - t->base.firstLevel + 1;
-
-	if (t->image_override)
-		return 0;
-
-	if (RADEON_DEBUG & (DEBUG_TEXTURE | DEBUG_IOCTL)) {
-		fprintf(stderr, "%s( %p, %p ) sz=%d lvls=%d-%d\n", __FUNCTION__,
-			(void *)rmesa->radeon.glCtx, (void *)t->base.tObj,
-			t->base.totalSize, t->base.firstLevel,
-			t->base.lastLevel);
-	}
-
-	if (t->base.totalSize == 0)
-		return 0;
-
-	if (RADEON_DEBUG & DEBUG_SYNC) {
-		fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
-		radeonFinish(rmesa->radeon.glCtx);
-	}
-
-	LOCK_HARDWARE(&rmesa->radeon);
-
-	if (t->base.memBlock == NULL) {
-		int heap;
-
-		heap = driAllocateTexture(rmesa->texture_heaps, rmesa->nr_heaps,
-					  (driTextureObject *) t);
-		if (heap == -1) {
-			UNLOCK_HARDWARE(&rmesa->radeon);
-			return -1;
-		}
-
-		/* Set the base offset of the texture image */
-		t->bufAddr = rmesa->radeon.radeonScreen->texOffset[heap]
-		    + t->base.memBlock->ofs;
-		t->offset = t->bufAddr;
-
-		if (!(t->base.tObj->Image[0][0]->IsClientData)) {
-			/* hope it's safe to add that here... */
-			t->offset |= t->tile_bits;
-		}
-	}
-
-	/* Let the world know we've used this memory recently.
-	 */
-	driUpdateTextureLRU((driTextureObject *) t);
-	UNLOCK_HARDWARE(&rmesa->radeon);
-
-	/* Upload any images that are new */
-	if (t->base.dirty_images[face]) {
-		int i;
-		for (i = 0; i < numLevels; i++) {
-			if ((t->base.
-			     dirty_images[face] & (1 <<
-						   (i + t->base.firstLevel))) !=
-			    0) {
-				r300UploadSubImage(rmesa, t, i, 0, 0,
-						   t->image[face][i].width,
-						   t->image[face][i].height,
-						   face);
-			}
-		}
-		t->base.dirty_images[face] = 0;
-	}
-
-	if (RADEON_DEBUG & DEBUG_SYNC) {
-		fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
-		radeonFinish(rmesa->radeon.glCtx);
-	}
-
-	return 0;
-}
diff --git a/src/mesa/drivers/dri/r300/r300_texstate.c b/src/mesa/drivers/dri/r300/r300_texstate.c
index e2329f04ec..f030451b28 100644
--- a/src/mesa/drivers/dri/r300/r300_texstate.c
+++ b/src/mesa/drivers/dri/r300/r300_texstate.c
@@ -43,11 +43,12 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/teximage.h"
 #include "main/texobj.h"
 #include "main/enums.h"
+#include "main/simple_list.h"
 
 #include "r300_context.h"
 #include "r300_state.h"
 #include "r300_ioctl.h"
-#include "radeon_ioctl.h"
+#include "radeon_mipmap_tree.h"
 #include "r300_tex.h"
 #include "r300_reg.h"
 
@@ -117,7 +118,12 @@ static const struct tx_table {
 	_ASSIGN(INTENSITY_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, X, FL_I16)),
 	_ASSIGN(Z16, R300_EASY_TX_FORMAT(X, X, X, X, X16)),
 	_ASSIGN(Z24_S8, R300_EASY_TX_FORMAT(X, X, X, X, X24_Y8)),
+	_ASSIGN(S8_Z24, R300_EASY_TX_FORMAT(Y, Y, Y, Y, X24_Y8)),
 	_ASSIGN(Z32, R300_EASY_TX_FORMAT(X, X, X, X, X32)),
+	/* EXT_texture_sRGB */
+	_ASSIGN(SRGBA8, R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8) | R300_TX_FORMAT_GAMMA),
+	_ASSIGN(SLA8, R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8) | R300_TX_FORMAT_GAMMA),
+	_ASSIGN(SL8, R300_EASY_TX_FORMAT(X, X, X, ONE, X8) | R300_TX_FORMAT_GAMMA),
 	/* *INDENT-ON* */
 };
 
@@ -143,13 +149,12 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 		},
 	};
 	const GLuint *format;
-	r300TexObjPtr t;
+	radeonTexObjPtr t;
 
 	if (!tObj)
 		return;
 
-	t = (r300TexObjPtr) tObj->DriverData;
-
+	t = radeon_tex_obj(tObj);
 
 	switch (tObj->Image[0][tObj->BaseLevel]->TexFormat->MesaFormat) {
 	case MESA_FORMAT_Z16:
@@ -171,13 +176,13 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 
 	switch (tObj->DepthMode) {
 	case GL_LUMINANCE:
-		t->format = format[0];
+		t->pp_txformat = format[0];
 		break;
 	case GL_INTENSITY:
-		t->format = format[1];
+		t->pp_txformat = format[1];
 		break;
 	case GL_ALPHA:
-		t->format = format[2];
+		t->pp_txformat = format[2];
 		break;
 	default:
 		/* Error...which should have already been caught by higher
@@ -190,479 +195,307 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 
 
 /**
- * Compute sizes and fill in offset and blit information for the given
- * image (determined by \p face and \p level).
- *
- * \param curOffset points to the offset at which the image is to be stored
- * and is updated by this function according to the size of the image.
- */
-static void compute_tex_image_offset(
-	struct gl_texture_object *tObj,
-	GLuint face,
-	GLint level,
-	GLint* curOffset)
-{
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-	const struct gl_texture_image* texImage;
-	GLuint blitWidth = R300_BLIT_WIDTH_BYTES;
-	GLuint texelBytes;
-	GLuint size;
-
-	texImage = tObj->Image[0][level + t->base.firstLevel];
-	if (!texImage)
-		return;
-
-	texelBytes = texImage->TexFormat->TexelBytes;
-
-	/* find image size in bytes */
-	if (texImage->IsCompressed) {
-		if ((t->format & R300_TX_FORMAT_DXT1) ==
-			R300_TX_FORMAT_DXT1) {
-			// fprintf(stderr,"DXT 1 %d %08X\n", texImage->Width, t->format);
-			if ((texImage->Width + 3) < 8)	/* width one block */
-				size = texImage->CompressedSize * 4;
-			else if ((texImage->Width + 3) < 16)
-				size = texImage->CompressedSize * 2;
-			else
-				size = texImage->CompressedSize;
-		} else {
-			/* DXT3/5, 16 bytes per block */
-			WARN_ONCE
-				("DXT 3/5 suffers from multitexturing problems!\n");
-			// fprintf(stderr,"DXT 3/5 %d\n", texImage->Width);
-			if ((texImage->Width + 3) < 8)
-				size = texImage->CompressedSize * 2;
-			else
-				size = texImage->CompressedSize;
-		}
-	} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-		size =
-			((texImage->Width * texelBytes +
-			63) & ~63) * texImage->Height;
-		blitWidth = 64 / texelBytes;
-	} else if (t->tile_bits & R300_TXO_MICRO_TILE) {
-		/* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
-			though the actual offset may be different (if texture is less than
-			32 bytes width) to the untiled case */
-		int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
-		size =
-			(w * ((texImage->Height + 1) / 2)) *
-			texImage->Depth;
-		blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-	} else {
-		int w = (texImage->Width * texelBytes + 31) & ~31;
-		size = w * texImage->Height * texImage->Depth;
-		blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-	}
-	assert(size > 0);
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr, "w=%d h=%d d=%d tb=%d intFormat=%d\n",
-			texImage->Width, texImage->Height,
-			texImage->Depth,
-			texImage->TexFormat->TexelBytes,
-			texImage->InternalFormat);
-
-	/* All images are aligned to a 32-byte offset */
-	*curOffset = (*curOffset + 0x1f) & ~0x1f;
-
-	if (texelBytes) {
-		/* fix x and y coords up later together with offset */
-		t->image[face][level].x = *curOffset;
-		t->image[face][level].y = 0;
-		t->image[face][level].width =
-			MIN2(size / texelBytes, blitWidth);
-		t->image[face][level].height =
-			(size / texelBytes) / t->image[face][level].width;
-	} else {
-		t->image[face][level].x = *curOffset % R300_BLIT_WIDTH_BYTES;
-		t->image[face][level].y = *curOffset / R300_BLIT_WIDTH_BYTES;
-		t->image[face][level].width =
-			MIN2(size, R300_BLIT_WIDTH_BYTES);
-		t->image[face][level].height = size / t->image[face][level].width;
-	}
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr,
-			"level %d, face %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
-			level, face, texImage->Width, texImage->Height,
-			t->image[face][level].x, t->image[face][level].y,
-			t->image[face][level].width, t->image[face][level].height,
-			size, *curOffset);
-
-	*curOffset += size;
-}
-
-
-
-/**
- * This function computes the number of bytes of storage needed for
- * the given texture object (all mipmap levels, all cube faces).
- * The \c image[face][level].x/y/width/height parameters for upload/blitting
- * are computed here.  \c filter, \c format, etc. will be set here
- * too.
+ * Compute the cached hardware register values for the given texture object.
  *
  * \param rmesa Context pointer
- * \param tObj GL texture object whose images are to be posted to
- *                 hardware state.
+ * \param t the r300 texture object
  */
-static void r300SetTexImages(r300ContextPtr rmesa,
-			     struct gl_texture_object *tObj)
+static void setup_hardware_state(r300ContextPtr rmesa, radeonTexObj *t)
 {
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-	const struct gl_texture_image *baseImage =
-	    tObj->Image[0][tObj->BaseLevel];
-	GLint curOffset;
-	GLint i, texelBytes;
-	GLint numLevels;
-	GLint log2Width, log2Height, log2Depth;
-
-	/* Set the hardware texture format
-	 */
+	const struct gl_texture_image *firstImage;
+	int firstlevel = t->mt ? t->mt->firstLevel : 0;
+	    
+	firstImage = t->base.Image[0][firstlevel];
+
 	if (!t->image_override
-	    && VALID_FORMAT(baseImage->TexFormat->MesaFormat)) {
-		if (baseImage->TexFormat->BaseFormat == GL_DEPTH_COMPONENT) {
-			r300SetDepthTexMode(tObj);
+	    && VALID_FORMAT(firstImage->TexFormat->MesaFormat)) {
+		if (firstImage->TexFormat->BaseFormat == GL_DEPTH_COMPONENT) {
+			r300SetDepthTexMode(&t->base);
 		} else {
-			t->format = tx_table[baseImage->TexFormat->MesaFormat].format;
+			t->pp_txformat = tx_table[firstImage->TexFormat->MesaFormat].format;
 		}
 
-		t->filter |= tx_table[baseImage->TexFormat->MesaFormat].filter;
+		t->pp_txfilter |= tx_table[firstImage->TexFormat->MesaFormat].filter;
 	} else if (!t->image_override) {
 		_mesa_problem(NULL, "unexpected texture format in %s",
 			      __FUNCTION__);
 		return;
 	}
 
-	texelBytes = baseImage->TexFormat->TexelBytes;
-
-	/* Compute which mipmap levels we really want to send to the hardware.
-	 */
-	driCalculateTextureFirstLastLevel((driTextureObject *) t);
-	log2Width = tObj->Image[0][t->base.firstLevel]->WidthLog2;
-	log2Height = tObj->Image[0][t->base.firstLevel]->HeightLog2;
-	log2Depth = tObj->Image[0][t->base.firstLevel]->DepthLog2;
-
-	numLevels = t->base.lastLevel - t->base.firstLevel + 1;
+	if (t->image_override && t->bo)
+		return;
 
-	assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
+	t->pp_txsize = (((firstImage->Width - 1) << R300_TX_WIDTHMASK_SHIFT)
+			| ((firstImage->Height - 1) << R300_TX_HEIGHTMASK_SHIFT)
+			| ((firstImage->DepthLog2) << R300_TX_DEPTHMASK_SHIFT)
+			| ((t->mt->lastLevel - t->mt->firstLevel) << R300_TX_MAX_MIP_LEVEL_SHIFT));
 
-	/* Calculate mipmap offsets and dimensions for blitting (uploading)
-	 * The idea is that we lay out the mipmap levels within a block of
-	 * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
-	 */
 	t->tile_bits = 0;
 
-	/* figure out if this texture is suitable for tiling. */
-#if 0				/* Disabled for now */
-	if (texelBytes) {
-		if ((tObj->Target != GL_TEXTURE_RECTANGLE_NV) &&
-		    /* texrect might be able to use micro tiling too in theory? */
-		    (baseImage->Height > 1)) {
-
-			/* allow 32 (bytes) x 1 mip (which will use two times the space
-			   the non-tiled version would use) max if base texture is large enough */
-			if ((numLevels == 1) ||
-			    (((baseImage->Width * texelBytes /
-			       baseImage->Height) <= 32)
-			     && (baseImage->Width * texelBytes > 64))
-			    ||
-			    ((baseImage->Width * texelBytes /
-			      baseImage->Height) <= 16)) {
-				t->tile_bits |= R300_TXO_MICRO_TILE;
-			}
-		}
-
-		if (tObj->Target != GL_TEXTURE_RECTANGLE_NV) {
-			/* we can set macro tiling even for small textures, they will be untiled anyway */
-			t->tile_bits |= R300_TXO_MACRO_TILE;
-		}
-	}
-#endif
+	if (t->base.Target == GL_TEXTURE_CUBE_MAP)
+		t->pp_txformat |= R300_TX_FORMAT_CUBIC_MAP;
+	if (t->base.Target == GL_TEXTURE_3D)
+		t->pp_txformat |= R300_TX_FORMAT_3D;
 
-	curOffset = 0;
 
-	if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
-		ASSERT(log2Width == log2Height);
-		t->format |= R300_TX_FORMAT_CUBIC_MAP;
-
-		for(i = 0; i < numLevels; i++) {
-			GLuint face;
-			for(face = 0; face < 6; face++)
-				compute_tex_image_offset(tObj, face, i, &curOffset);
-		}
-	} else {
-		if (tObj->Target == GL_TEXTURE_3D)
-                	t->format |= R300_TX_FORMAT_3D;
-
-		for (i = 0; i < numLevels; i++)
-			compute_tex_image_offset(tObj, 0, i, &curOffset);
-	}
-
-	/* Align the total size of texture memory block.
-	 */
-	t->base.totalSize =
-	    (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
-
-	t->size =
-	    (((tObj->Image[0][t->base.firstLevel]->Width -
-	       1) << R300_TX_WIDTHMASK_SHIFT)
-	     | ((tObj->Image[0][t->base.firstLevel]->Height - 1) <<
-		R300_TX_HEIGHTMASK_SHIFT)
-	     | ((tObj->Image[0][t->base.firstLevel]->DepthLog2) <<
-		R300_TX_DEPTHMASK_SHIFT))
-	    | ((numLevels - 1) << R300_TX_MAX_MIP_LEVEL_SHIFT);
-
-	t->pitch = 0;
-
-	/* Only need to round to nearest 32 for textures, but the blitter
-	 * requires 64-byte aligned pitches, and we may/may not need the
-	 * blitter.   NPOT only!
-	 */
-	if (baseImage->IsCompressed) {
-		t->pitch |=
-		    (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
-	} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-		unsigned int align = (64 / texelBytes) - 1;
-		t->pitch |= ((tObj->Image[0][t->base.firstLevel]->Width *
-			     texelBytes) + 63) & ~(63);
-		t->size |= R300_TX_SIZE_TXPITCH_EN;
+	if (t->base.Target == GL_TEXTURE_RECTANGLE_NV) {
+		unsigned int align = (64 / t->mt->bpp) - 1;
+		t->pp_txsize |= R300_TX_SIZE_TXPITCH_EN;
 		if (!t->image_override)
-			t->pitch_reg =
-			    (((tObj->Image[0][t->base.firstLevel]->Width) +
-			      align) & ~align) - 1;
-	} else {
-		t->pitch |=
-		    ((tObj->Image[0][t->base.firstLevel]->Width *
-		      texelBytes) + 63) & ~(63);
+			t->pp_txpitch = ((firstImage->Width + align) & ~align) - 1;
 	}
 
 	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-	    if (tObj->Image[0][t->base.firstLevel]->Width > 2048)
-		t->pitch_reg |= R500_TXWIDTH_BIT11;
-	    if (tObj->Image[0][t->base.firstLevel]->Height > 2048)
-		t->pitch_reg |= R500_TXHEIGHT_BIT11;
+	    if (firstImage->Width > 2048)
+		t->pp_txpitch |= R500_TXWIDTH_BIT11;
+	    if (firstImage->Height > 2048)
+		t->pp_txpitch |= R500_TXHEIGHT_BIT11;
 	}
 }
 
-/* ================================================================
- * Texture unit state management
+/**
+ * Ensure the given texture is ready for rendering.
+ *
+ * Mostly this means populating the texture object's mipmap tree.
  */
-
-static GLboolean r300EnableTexture2D(GLcontext * ctx, int unit)
+static GLboolean r300_validate_texture(GLcontext * ctx, struct gl_texture_object *texObj)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-
-	ASSERT(tObj->Target == GL_TEXTURE_2D || tObj->Target == GL_TEXTURE_1D);
+	radeonTexObj *t = radeon_tex_obj(texObj);
 
-	if (t->base.dirty_images[0]) {
-		R300_FIREVERTICES(rmesa);
+	if (!radeon_validate_texture_miptree(ctx, texObj))
+		return GL_FALSE;
 
-		r300SetTexImages(rmesa, tObj);
-		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
-		if (!t->base.memBlock && !t->image_override)
-			return GL_FALSE;
-	}
+	/* Configure the hardware registers (more precisely, the cached version
+	 * of the hardware registers). */
+	setup_hardware_state(rmesa, t);
 
+	t->validated = GL_TRUE;
 	return GL_TRUE;
 }
 
-static GLboolean r300EnableTexture3D(GLcontext * ctx, int unit)
+/**
+ * Ensure all enabled and complete textures are uploaded along with any buffers being used.
+ */
+GLboolean r300ValidateBuffers(GLcontext * ctx)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+	struct radeon_renderbuffer *rrb;
+	int i;
+	int ret;
 
-	ASSERT(tObj->Target == GL_TEXTURE_3D);
+	radeon_cs_space_reset_bos(rmesa->radeon.cmdbuf.cs);
 
-	/* r300 does not support mipmaps for 3D textures. */
-	if ((tObj->MinFilter != GL_NEAREST) && (tObj->MinFilter != GL_LINEAR)) {
-		return GL_FALSE;
+	rrb = radeon_get_colorbuffer(&rmesa->radeon);
+	/* color buffer */
+	if (rrb && rrb->bo) {
+		radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+						  rrb->bo, 0,
+						  RADEON_GEM_DOMAIN_VRAM);
 	}
 
-	if (t->base.dirty_images[0]) {
-		R300_FIREVERTICES(rmesa);
-		r300SetTexImages(rmesa, tObj);
-		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
-		if (!t->base.memBlock)
-			return GL_FALSE;
+	/* depth buffer */
+	rrb = radeon_get_depthbuffer(&rmesa->radeon);
+	if (rrb && rrb->bo) {
+		radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+						  rrb->bo, 0,
+						  RADEON_GEM_DOMAIN_VRAM);
 	}
+	
+	for (i = 0; i < ctx->Const.MaxTextureImageUnits; ++i) {
+		radeonTexObj *t;
 
-	return GL_TRUE;
-}
-
-static GLboolean r300EnableTextureCube(GLcontext * ctx, int unit)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-	GLuint face;
-
-	ASSERT(tObj->Target == GL_TEXTURE_CUBE_MAP);
-
-	if (t->base.dirty_images[0] || t->base.dirty_images[1] ||
-	    t->base.dirty_images[2] || t->base.dirty_images[3] ||
-	    t->base.dirty_images[4] || t->base.dirty_images[5]) {
-		/* flush */
-		R300_FIREVERTICES(rmesa);
-		/* layout memory space, once for all faces */
-		r300SetTexImages(rmesa, tObj);
-	}
+		if (!ctx->Texture.Unit[i]._ReallyEnabled)
+			continue;
 
-	/* upload (per face) */
-	for (face = 0; face < 6; face++) {
-		if (t->base.dirty_images[face]) {
-			r300UploadTexImages(rmesa,
-					    (r300TexObjPtr) tObj->DriverData,
-					    face);
+		if (!r300_validate_texture(ctx, ctx->Texture.Unit[i]._Current)) {
+			_mesa_warning(ctx,
+				      "failed to validate texture for unit %d.\n",
+				      i);
 		}
+		t = radeon_tex_obj(ctx->Texture.Unit[i]._Current);
+		if (t->image_override && t->bo)
+			radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+							  t->bo,
+							  RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
+		else if (t->mt->bo)
+			radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+							  t->mt->bo,
+							  RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
 	}
 
-	if (!t->base.memBlock) {
-		/* texmem alloc failed, use s/w fallback */
+	ret = radeon_cs_space_check_with_bo(rmesa->radeon.cmdbuf.cs, first_elem(&rmesa->radeon.dma.reserved)->bo, RADEON_GEM_DOMAIN_GTT, 0);
+	if (ret)
 		return GL_FALSE;
-	}
-
 	return GL_TRUE;
 }
 
-static GLboolean r300EnableTextureRect(GLcontext * ctx, int unit)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-
-	ASSERT(tObj->Target == GL_TEXTURE_RECTANGLE_NV);
-
-	if (t->base.dirty_images[0]) {
-		R300_FIREVERTICES(rmesa);
-
-		r300SetTexImages(rmesa, tObj);
-		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
-		if (!t->base.memBlock && !t->image_override &&
-		    !rmesa->prefer_gart_client_texturing)
-			return GL_FALSE;
-	}
-
-	return GL_TRUE;
-}
-
-static GLboolean r300UpdateTexture(GLcontext * ctx, int unit)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-
-	/* Fallback if there's a texture border */
-	if (tObj->Image[0][tObj->BaseLevel]->Border > 0)
-		return GL_FALSE;
-
-	/* Update state if this is a different texture object to last
-	 * time.
-	 */
-	if (rmesa->state.texture.unit[unit].texobj != t) {
-		if (rmesa->state.texture.unit[unit].texobj != NULL) {
-			/* The old texture is no longer bound to this texture unit.
-			 * Mark it as such.
-			 */
-
-			rmesa->state.texture.unit[unit].texobj->base.bound &=
-			    ~(1 << unit);
-		}
-
-		rmesa->state.texture.unit[unit].texobj = t;
-		t->base.bound |= (1 << unit);
-		driUpdateTextureLRU((driTextureObject *) t);	/* XXX: should be locked! */
-	}
-
-	return !t->border_fallback;
-}
-
 void r300SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 		      unsigned long long offset, GLint depth, GLuint pitch)
 {
 	r300ContextPtr rmesa = pDRICtx->driverPrivate;
 	struct gl_texture_object *tObj =
 	    _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
-	r300TexObjPtr t;
+	radeonTexObjPtr t = radeon_tex_obj(tObj);
 	uint32_t pitch_val;
 
 	if (!tObj)
 		return;
 
-	t = (r300TexObjPtr) tObj->DriverData;
-
 	t->image_override = GL_TRUE;
 
 	if (!offset)
 		return;
 
-	t->offset = offset;
-	t->pitch_reg &= (1 << 13) -1;
+	t->bo = NULL;
+	t->override_offset = offset;
+	t->pp_txpitch &= (1 << 13) -1;
 	pitch_val = pitch;
 
 	switch (depth) {
 	case 32:
-		t->format = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
-		t->filter |= tx_table[2].filter;
+		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
+		t->pp_txfilter |= tx_table[2].filter;
 		pitch_val /= 4;
 		break;
 	case 24:
 	default:
-		t->format = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
-		t->filter |= tx_table[4].filter;
+		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
+		t->pp_txfilter |= tx_table[4].filter;
 		pitch_val /= 4;
 		break;
 	case 16:
-		t->format = R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
-		t->filter |= tx_table[5].filter;
+		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
+		t->pp_txfilter |= tx_table[5].filter;
 		pitch_val /= 2;
 		break;
 	}
 	pitch_val--;
 
-	t->pitch_reg |= pitch_val;
+	t->pp_txpitch |= pitch_val;
 }
 
-static GLboolean r300UpdateTextureUnit(GLcontext * ctx, int unit)
+void r300SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_format, __DRIdrawable *dPriv)
 {
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-
-	if (texUnit->_ReallyEnabled & (TEXTURE_RECT_BIT)) {
-		return (r300EnableTextureRect(ctx, unit) &&
-			r300UpdateTexture(ctx, unit));
-	} else if (texUnit->_ReallyEnabled & (TEXTURE_1D_BIT | TEXTURE_2D_BIT)) {
-		return (r300EnableTexture2D(ctx, unit) &&
-			r300UpdateTexture(ctx, unit));
-	} else if (texUnit->_ReallyEnabled & (TEXTURE_3D_BIT)) {
-		return (r300EnableTexture3D(ctx, unit) &&
-			r300UpdateTexture(ctx, unit));
-	} else if (texUnit->_ReallyEnabled & (TEXTURE_CUBE_BIT)) {
-		return (r300EnableTextureCube(ctx, unit) &&
-			r300UpdateTexture(ctx, unit));
-	} else if (texUnit->_ReallyEnabled) {
-		return GL_FALSE;
-	} else {
-		return GL_TRUE;
+	struct gl_texture_unit *texUnit;
+	struct gl_texture_object *texObj;
+	struct gl_texture_image *texImage;
+	struct radeon_renderbuffer *rb;
+	radeon_texture_image *rImage;
+	radeonContextPtr radeon;
+	r300ContextPtr rmesa;
+	struct radeon_framebuffer *rfb;
+	radeonTexObjPtr t;
+	uint32_t pitch_val;
+	uint32_t internalFormat, type, format;
+
+	type = GL_BGRA;
+	format = GL_UNSIGNED_BYTE;
+	internalFormat = (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT ? 3 : 4);
+
+	radeon = pDRICtx->driverPrivate;
+	rmesa = pDRICtx->driverPrivate;
+
+	rfb = dPriv->driverPrivate;
+        texUnit = &radeon->glCtx->Texture.Unit[radeon->glCtx->Texture.CurrentUnit];
+	texObj = _mesa_select_tex_object(radeon->glCtx, texUnit, target);
+        texImage = _mesa_get_tex_image(radeon->glCtx, texObj, target, 0);
+
+	rImage = get_radeon_texture_image(texImage);
+	t = radeon_tex_obj(texObj);
+        if (t == NULL) {
+    	    return;
+    	}
+
+	radeon_update_renderbuffers(pDRICtx, dPriv);
+	/* back & depth buffers are useless, free them right away */
+	rb = (void*)rfb->base.Attachment[BUFFER_DEPTH].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+        rb->bo = NULL;
+	}
+	rb = (void*)rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+		rb->bo = NULL;
 	}
+	rb = rfb->color_rb[0];
+	if (rb->bo == NULL) {
+		/* Failed to get a BO for the buffer */
+		return;
+	}
+	
+	_mesa_lock_texture(radeon->glCtx, texObj);
+	if (t->bo) {
+		radeon_bo_unref(t->bo);
+		t->bo = NULL;
+	}
+	if (rImage->bo) {
+		radeon_bo_unref(rImage->bo);
+		rImage->bo = NULL;
+	}
+	if (t->mt) {
+		radeon_miptree_unreference(t->mt);
+		t->mt = NULL;
+	}
+	if (rImage->mt) {
+		radeon_miptree_unreference(rImage->mt);
+		rImage->mt = NULL;
+	}
+	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
+				   rb->base.Width, rb->base.Height, 1, 0, rb->cpp);
+	texImage->RowStride = rb->pitch / rb->cpp;
+	texImage->TexFormat = radeonChooseTextureFormat(radeon->glCtx,
+							internalFormat,
+							type, format, 0);
+	rImage->bo = rb->bo;
+	radeon_bo_ref(rImage->bo);
+	t->bo = rb->bo;
+	radeon_bo_ref(t->bo);
+	t->tile_bits = 0;
+	t->image_override = GL_TRUE;
+	t->override_offset = 0;
+	t->pp_txpitch &= (1 << 13) -1;
+	pitch_val = rb->pitch;
+	switch (rb->cpp) {
+	case 4:
+		if (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT)
+			t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
+		else
+			t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
+		t->pp_txfilter |= tx_table[2].filter;
+		pitch_val /= 4;
+		break;
+	case 3:
+	default:
+		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
+		t->pp_txfilter |= tx_table[4].filter;
+		pitch_val /= 4;
+		break;
+	case 2:
+		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
+		t->pp_txfilter |= tx_table[5].filter;
+		pitch_val /= 2;
+		break;
+	}
+	pitch_val--;
+	t->pp_txsize = ((rb->base.Width - 1) << R300_TX_WIDTHMASK_SHIFT) |
+              ((rb->base.Height - 1) << R300_TX_HEIGHTMASK_SHIFT);
+	t->pp_txsize |= R300_TX_SIZE_TXPITCH_EN;
+	t->pp_txpitch |= pitch_val;
+
+	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+	    if (rb->base.Width > 2048)
+		t->pp_txpitch |= R500_TXWIDTH_BIT11;
+	    if (rb->base.Height > 2048)
+		t->pp_txpitch |= R500_TXHEIGHT_BIT11;
+	}
+	t->validated = GL_TRUE;
+	_mesa_unlock_texture(radeon->glCtx, texObj);
+	return;
 }
 
-void r300UpdateTextureState(GLcontext * ctx)
+void r300SetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
 {
-	int i;
-
-	for (i = 0; i < 8; i++) {
-		if (!r300UpdateTextureUnit(ctx, i)) {
-			_mesa_warning(ctx,
-				      "failed to update texture state for unit %d.\n",
-				      i);
-		}
-	}
+        r300SetTexBuffer2(pDRICtx, target, GLX_TEXTURE_FORMAT_RGBA_EXT, dPriv);
 }
diff --git a/src/mesa/drivers/dri/r300/r300_vertprog.c b/src/mesa/drivers/dri/r300/r300_vertprog.c
index c4e325e6a7..dd0f27f9cb 100644
--- a/src/mesa/drivers/dri/r300/r300_vertprog.c
+++ b/src/mesa/drivers/dri/r300/r300_vertprog.c
@@ -32,1450 +32,327 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/macros.h"
 #include "main/enums.h"
 #include "shader/program.h"
+#include "shader/programopt.h"
 #include "shader/prog_instruction.h"
+#include "shader/prog_optimize.h"
 #include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
 #include "shader/prog_statevars.h"
 #include "tnl/tnl.h"
 
+#include "compiler/radeon_compiler.h"
+#include "compiler/radeon_nqssadce.h"
 #include "r300_context.h"
+#include "r300_state.h"
 
-/* TODO: Get rid of t_src_class call */
-#define CMP_SRCS(a, b) ((a.RelAddr != b.RelAddr) || (a.Index != b.Index && \
-		       ((t_src_class(a.File) == PVS_SRC_REG_CONSTANT && \
-			 t_src_class(b.File) == PVS_SRC_REG_CONSTANT) || \
-			(t_src_class(a.File) == PVS_SRC_REG_INPUT && \
-			 t_src_class(b.File) == PVS_SRC_REG_INPUT)))) \
-
-/*
- * Take an already-setup and valid source then swizzle it appropriately to
- * obtain a constant ZERO or ONE source.
+/**
+ * Write parameter array for the given vertex program into dst.
+ * Return the total number of components written.
  */
-#define __CONST(x, y)	\
-	(PVS_SRC_OPERAND(t_src_index(vp, &src[x]),	\
-			   t_swizzle(y),	\
-			   t_swizzle(y),	\
-			   t_swizzle(y),	\
-			   t_swizzle(y),	\
-			   t_src_class(src[x].File), \
-			   VSF_FLAG_NONE) | (src[x].RelAddr << 4))
-
-#define FREE_TEMPS() \
-	do { \
-		int u_temp_used = (VSF_MAX_FRAGMENT_TEMPS - 1) - u_temp_i; \
-		if((vp->num_temporaries + u_temp_used) > VSF_MAX_FRAGMENT_TEMPS) { \
-			WARN_ONCE("Ran out of temps, num temps %d, us %d\n", vp->num_temporaries, u_temp_used); \
-			vp->native = GL_FALSE; \
-		} \
-		u_temp_i=VSF_MAX_FRAGMENT_TEMPS-1; \
-	} while (0)
-
-int r300VertexProgUpdateParams(GLcontext * ctx,
-			       struct r300_vertex_program_cont *vp, float *dst)
+static int r300VertexProgUpdateParams(GLcontext * ctx, struct r300_vertex_program *vp, float *dst)
 {
-	int pi;
-	struct gl_vertex_program *mesa_vp = &vp->mesa_program;
-	float *dst_o = dst;
-	struct gl_program_parameter_list *paramList;
+	int i;
 
-	if (mesa_vp->IsNVProgram) {
+	if (vp->Base->IsNVProgram) {
 		_mesa_load_tracked_matrices(ctx);
-
-		for (pi = 0; pi < MAX_NV_VERTEX_PROGRAM_PARAMS; pi++) {
-			*dst++ = ctx->VertexProgram.Parameters[pi][0];
-			*dst++ = ctx->VertexProgram.Parameters[pi][1];
-			*dst++ = ctx->VertexProgram.Parameters[pi][2];
-			*dst++ = ctx->VertexProgram.Parameters[pi][3];
+	} else {
+		if (vp->Base->Base.Parameters) {
+			_mesa_load_state_parameters(ctx, vp->Base->Base.Parameters);
 		}
-		return dst - dst_o;
 	}
 
-	assert(mesa_vp->Base.Parameters);
-	_mesa_load_state_parameters(ctx, mesa_vp->Base.Parameters);
-
-	if (mesa_vp->Base.Parameters->NumParameters * 4 >
-	    VSF_MAX_FRAGMENT_LENGTH) {
+	if (vp->code.constants.Count * 4 > VSF_MAX_FRAGMENT_LENGTH) {
+		/* Should have checked this earlier... */
 		fprintf(stderr, "%s:Params exhausted\n", __FUNCTION__);
 		_mesa_exit(-1);
 	}
 
-	paramList = mesa_vp->Base.Parameters;
-	for (pi = 0; pi < paramList->NumParameters; pi++) {
-		switch (paramList->Parameters[pi].Type) {
-		case PROGRAM_STATE_VAR:
-		case PROGRAM_NAMED_PARAM:
-			//fprintf(stderr, "%s", vp->Parameters->Parameters[pi].Name);
-		case PROGRAM_CONSTANT:
-			*dst++ = paramList->ParameterValues[pi][0];
-			*dst++ = paramList->ParameterValues[pi][1];
-			*dst++ = paramList->ParameterValues[pi][2];
-			*dst++ = paramList->ParameterValues[pi][3];
-			break;
-		default:
-			_mesa_problem(NULL, "Bad param type in %s",
-				      __FUNCTION__);
-		}
-
-	}
-
-	return dst - dst_o;
-}
-
-static unsigned long t_dst_mask(GLuint mask)
-{
-	/* WRITEMASK_* is equivalent to VSF_FLAG_* */
-	return mask & VSF_FLAG_ALL;
-}
-
-static unsigned long t_dst_class(enum register_file file)
-{
-
-	switch (file) {
-	case PROGRAM_TEMPORARY:
-		return PVS_DST_REG_TEMPORARY;
-	case PROGRAM_OUTPUT:
-		return PVS_DST_REG_OUT;
-	case PROGRAM_ADDRESS:
-		return PVS_DST_REG_A0;
-		/*
-		   case PROGRAM_INPUT:
-		   case PROGRAM_LOCAL_PARAM:
-		   case PROGRAM_ENV_PARAM:
-		   case PROGRAM_NAMED_PARAM:
-		   case PROGRAM_STATE_VAR:
-		   case PROGRAM_WRITE_ONLY:
-		   case PROGRAM_ADDRESS:
-		 */
-	default:
-		fprintf(stderr, "problem in %s", __FUNCTION__);
-		_mesa_exit(-1);
-		return -1;
-	}
-}
-
-static unsigned long t_dst_index(struct r300_vertex_program *vp,
-				 struct prog_dst_register *dst)
-{
-	if (dst->File == PROGRAM_OUTPUT)
-		return vp->outputs[dst->Index];
-
-	return dst->Index;
-}
-
-static unsigned long t_src_class(enum register_file file)
-{
-	switch (file) {
-	case PROGRAM_TEMPORARY:
-		return PVS_SRC_REG_TEMPORARY;
-	case PROGRAM_INPUT:
-		return PVS_SRC_REG_INPUT;
-	case PROGRAM_LOCAL_PARAM:
-	case PROGRAM_ENV_PARAM:
-	case PROGRAM_NAMED_PARAM:
-	case PROGRAM_CONSTANT:
-	case PROGRAM_STATE_VAR:
-		return PVS_SRC_REG_CONSTANT;
-		/*
-		   case PROGRAM_OUTPUT:
-		   case PROGRAM_WRITE_ONLY:
-		   case PROGRAM_ADDRESS:
-		 */
-	default:
-		fprintf(stderr, "problem in %s", __FUNCTION__);
-		_mesa_exit(-1);
-		return -1;
-	}
-}
+	for(i = 0; i < vp->code.constants.Count; ++i) {
+		const float * src = 0;
+		const struct rc_constant * constant = &vp->code.constants.Constants[i];
 
-static INLINE unsigned long t_swizzle(GLubyte swizzle)
-{
-/* this is in fact a NOP as the Mesa SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
-	return swizzle;
-}
+		switch(constant->Type) {
+		case RC_CONSTANT_EXTERNAL:
+			if (vp->Base->IsNVProgram) {
+				src = ctx->VertexProgram.Parameters[constant->u.External];
+			} else {
+				src = vp->Base->Base.Parameters->ParameterValues[constant->u.External];
+			}
+			break;
 
-#if 0
-static void vp_dump_inputs(struct r300_vertex_program *vp, char *caller)
-{
-	int i;
+		case RC_CONSTANT_IMMEDIATE:
+			src = constant->u.Immediate;
+			break;
+		}
 
-	if (vp == NULL) {
-		fprintf(stderr, "vp null in call to %s from %s\n", __FUNCTION__,
-			caller);
-		return;
+		dst[4*i] = src[0];
+		dst[4*i + 1] = src[1];
+		dst[4*i + 2] = src[2];
+		dst[4*i + 3] = src[3];
 	}
 
-	fprintf(stderr, "%s:<", caller);
-	for (i = 0; i < VERT_ATTRIB_MAX; i++)
-		fprintf(stderr, "%d ", vp->inputs[i]);
-	fprintf(stderr, ">\n");
-
+	return 4 * vp->code.constants.Count;
 }
-#endif
 
-static unsigned long t_src_index(struct r300_vertex_program *vp,
-				 struct prog_src_register *src)
+static GLbitfield compute_required_outputs(struct gl_vertex_program * vp, GLbitfield fpreads)
 {
+	GLbitfield outputs = 0;
 	int i;
-	int max_reg = -1;
 
-	if (src->File == PROGRAM_INPUT) {
-		if (vp->inputs[src->Index] != -1)
-			return vp->inputs[src->Index];
-
-		for (i = 0; i < VERT_ATTRIB_MAX; i++)
-			if (vp->inputs[i] > max_reg)
-				max_reg = vp->inputs[i];
-
-		vp->inputs[src->Index] = max_reg + 1;
-
-		//vp_dump_inputs(vp, __FUNCTION__);
-
-		return vp->inputs[src->Index];
-	} else {
-		if (src->Index < 0) {
-			fprintf(stderr,
-				"negative offsets for indirect addressing do not work.\n");
-			return 0;
-		}
-		return src->Index;
-	}
-}
-
-/* these two functions should probably be merged... */
-
-static unsigned long t_src(struct r300_vertex_program *vp,
-			   struct prog_src_register *src)
-{
-	/* src->NegateBase uses the NEGATE_ flags from program_instruction.h,
-	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
-	 */
-	return PVS_SRC_OPERAND(t_src_index(vp, src),
-			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
-			       t_swizzle(GET_SWZ(src->Swizzle, 1)),
-			       t_swizzle(GET_SWZ(src->Swizzle, 2)),
-			       t_swizzle(GET_SWZ(src->Swizzle, 3)),
-			       t_src_class(src->File),
-			       src->NegateBase) | (src->RelAddr << 4);
-}
+#define ADD_OUTPUT(fp_attr, vp_result) \
+	do { \
+		if (fpreads & (1 << (fp_attr))) \
+			outputs |= (1 << (vp_result)); \
+	} while (0)
 
-static unsigned long t_src_scalar(struct r300_vertex_program *vp,
-				  struct prog_src_register *src)
-{
-	/* src->NegateBase uses the NEGATE_ flags from program_instruction.h,
-	 * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
-	 */
-	return PVS_SRC_OPERAND(t_src_index(vp, src),
-			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
-			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
-			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
-			       t_swizzle(GET_SWZ(src->Swizzle, 0)),
-			       t_src_class(src->File),
-			       src->
-			       NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
-	    (src->RelAddr << 4);
-}
+	ADD_OUTPUT(FRAG_ATTRIB_COL0, VERT_RESULT_COL0);
+	ADD_OUTPUT(FRAG_ATTRIB_COL1, VERT_RESULT_COL1);
 
-static GLboolean valid_dst(struct r300_vertex_program *vp,
-			   struct prog_dst_register *dst)
-{
-	if (dst->File == PROGRAM_OUTPUT && vp->outputs[dst->Index] == -1) {
-		return GL_FALSE;
-	} else if (dst->File == PROGRAM_ADDRESS) {
-		assert(dst->Index == 0);
+	for (i = 0; i <= 7; ++i) {
+		ADD_OUTPUT(FRAG_ATTRIB_TEX0 + i, VERT_RESULT_TEX0 + i);
 	}
 
-	return GL_TRUE;
-}
-
-static GLuint *r300TranslateOpcodeABS(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	//MAX RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
-
-	inst[0] = PVS_OP_DST_OPERAND(VE_MAXIMUM,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]),
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 3)),
-				  t_src_class(src[0].File),
-				  (!src[0].
-				   NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
-	    (src[0].RelAddr << 4);
-	inst[3] = 0;
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeADD(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = t_src(vp, &src[1]);
-	inst[3] = __CONST(1, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeARL(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(VE_FLT2FIX_DX,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = __CONST(0, SWIZZLE_ZERO);
-	inst[3] = __CONST(0, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeDP3(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	//DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ZERO} PARAM 0{} {X Y Z ZERO}
-
-	inst[0] = PVS_OP_DST_OPERAND(VE_DOT_PRODUCT,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]),
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
-				  SWIZZLE_ZERO,
-				  t_src_class(src[0].File),
-				  src[0].
-				  NegateBase ? VSF_FLAG_XYZ : VSF_FLAG_NONE) |
-	    (src[0].RelAddr << 4);
-	inst[2] =
-	    PVS_SRC_OPERAND(t_src_index(vp, &src[1]),
-			    t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
-			    t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
-			    t_swizzle(GET_SWZ(src[1].Swizzle, 2)), SWIZZLE_ZERO,
-			    t_src_class(src[1].File),
-			    src[1].
-			    NegateBase ? VSF_FLAG_XYZ : VSF_FLAG_NONE) |
-	    (src[1].RelAddr << 4);
-	inst[3] = __CONST(1, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeDP4(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(VE_DOT_PRODUCT,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = t_src(vp, &src[1]);
-	inst[3] = __CONST(1, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeDPH(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	//DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ONE} PARAM 0{} {X Y Z W}
-	inst[0] = PVS_OP_DST_OPERAND(VE_DOT_PRODUCT,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]),
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
-				  PVS_SRC_SELECT_FORCE_1,
-				  t_src_class(src[0].File),
-				  src[0].
-				  NegateBase ? VSF_FLAG_XYZ : VSF_FLAG_NONE) |
-	    (src[0].RelAddr << 4);
-	inst[2] = t_src(vp, &src[1]);
-	inst[3] = __CONST(1, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeDST(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(VE_DISTANCE_VECTOR,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = t_src(vp, &src[1]);
-	inst[3] = __CONST(1, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeEX2(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(ME_EXP_BASE2_FULL_DX,
-				     GL_TRUE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src_scalar(vp, &src[0]);
-	inst[2] = __CONST(0, SWIZZLE_ZERO);
-	inst[3] = __CONST(0, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeEXP(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(ME_EXP_BASE2_DX,
-				     GL_TRUE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src_scalar(vp, &src[0]);
-	inst[2] = __CONST(0, SWIZZLE_ZERO);
-	inst[3] = __CONST(0, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeFLR(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3],
-				      int *u_temp_i)
-{
-	/* FRC TMP 0.X Y Z W PARAM 0{} {X Y Z W}
-	   ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} TMP 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W */
-
-	inst[0] = PVS_OP_DST_OPERAND(VE_FRACTION,
-				     GL_FALSE,
-				     GL_FALSE,
-				     *u_temp_i,
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     PVS_DST_REG_TEMPORARY);
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = __CONST(0, SWIZZLE_ZERO);
-	inst[3] = __CONST(0, SWIZZLE_ZERO);
-	inst += 4;
-
-	inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = PVS_SRC_OPERAND(*u_temp_i,
-				  PVS_SRC_SELECT_X,
-				  PVS_SRC_SELECT_Y,
-				  PVS_SRC_SELECT_Z,
-				  PVS_SRC_SELECT_W, PVS_SRC_REG_TEMPORARY,
-				  /* Not 100% sure about this */
-				  (!src[0].
-				   NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE
-				  /*VSF_FLAG_ALL */ );
-	inst[3] = __CONST(0, SWIZZLE_ZERO);
-	(*u_temp_i)--;
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeFRC(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(VE_FRACTION,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = __CONST(0, SWIZZLE_ZERO);
-	inst[3] = __CONST(0, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeLG2(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	// LG2 RESULT 1.X Y Z W PARAM 0{} {X X X X}
-
-	inst[0] = PVS_OP_DST_OPERAND(ME_LOG_BASE2_FULL_DX,
-				     GL_TRUE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]),
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
-				  t_src_class(src[0].File),
-				  src[0].
-				  NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
-	    (src[0].RelAddr << 4);
-	inst[2] = __CONST(0, SWIZZLE_ZERO);
-	inst[3] = __CONST(0, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeLIT(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	//LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
-
-	inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
-				     GL_TRUE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	/* NOTE: Users swizzling might not work. */
-	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// X
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// W
-				  PVS_SRC_SELECT_FORCE_0,	// Z
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// Y
-				  t_src_class(src[0].File),
-				  src[0].
-				  NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
-	    (src[0].RelAddr << 4);
-	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// Y
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// W
-				  PVS_SRC_SELECT_FORCE_0,	// Z
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// X
-				  t_src_class(src[0].File),
-				  src[0].
-				  NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
-	    (src[0].RelAddr << 4);
-	inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// Y
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// X
-				  PVS_SRC_SELECT_FORCE_0,	// Z
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// W
-				  t_src_class(src[0].File),
-				  src[0].
-				  NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
-	    (src[0].RelAddr << 4);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeLOG(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(ME_LOG_BASE2_DX,
-				     GL_TRUE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src_scalar(vp, &src[0]);
-	inst[2] = __CONST(0, SWIZZLE_ZERO);
-	inst[3] = __CONST(0, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeMAD(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
-				     GL_FALSE,
-				     GL_TRUE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = t_src(vp, &src[1]);
-	inst[3] = t_src(vp, &src[2]);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeMAX(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(VE_MAXIMUM,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = t_src(vp, &src[1]);
-	inst[3] = __CONST(1, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeMIN(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(VE_MINIMUM,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = t_src(vp, &src[1]);
-	inst[3] = __CONST(1, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeMOV(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	//ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{} {ZERO ZERO ZERO ZERO}
-
-	inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = __CONST(0, SWIZZLE_ZERO);
-	inst[3] = __CONST(0, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeMUL(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = t_src(vp, &src[1]);
-	inst[3] = __CONST(1, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodePOW(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
-				     GL_TRUE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src_scalar(vp, &src[0]);
-	inst[2] = __CONST(0, SWIZZLE_ZERO);
-	inst[3] = t_src_scalar(vp, &src[1]);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeRCP(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(ME_RECIP_DX,
-				     GL_TRUE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src_scalar(vp, &src[0]);
-	inst[2] = __CONST(0, SWIZZLE_ZERO);
-	inst[3] = __CONST(0, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeRSQ(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(ME_RECIP_SQRT_DX,
-				     GL_TRUE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src_scalar(vp, &src[0]);
-	inst[2] = __CONST(0, SWIZZLE_ZERO);
-	inst[3] = __CONST(0, SWIZZLE_ZERO);
-
-	return inst;
-}
-
-static GLuint *r300TranslateOpcodeSGE(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(VE_SET_GREATER_THAN_EQUAL,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = t_src(vp, &src[1]);
-	inst[3] = __CONST(1, SWIZZLE_ZERO);
-
-	return inst;
-}
+#undef ADD_OUTPUT
 
-static GLuint *r300TranslateOpcodeSLT(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	inst[0] = PVS_OP_DST_OPERAND(VE_SET_LESS_THAN,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = t_src(vp, &src[1]);
-	inst[3] = __CONST(1, SWIZZLE_ZERO);
-
-	return inst;
-}
+	if ((fpreads & (1 << FRAG_ATTRIB_COL0)) &&
+	    (vp->Base.OutputsWritten & (1 << VERT_RESULT_BFC0)))
+		outputs |= 1 << VERT_RESULT_BFC0;
+	if ((fpreads & (1 << FRAG_ATTRIB_COL1)) &&
+	    (vp->Base.OutputsWritten & (1 << VERT_RESULT_BFC1)))
+		outputs |= 1 << VERT_RESULT_BFC1;
 
-static GLuint *r300TranslateOpcodeSUB(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	//ADD RESULT 1.X Y Z W TMP 0{} {X Y Z W} PARAM 1{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
-
-#if 0
-	inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &src[1]),
-				  t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
-				  t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
-				  t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
-				  t_swizzle(GET_SWZ(src[1].Swizzle, 3)),
-				  t_src_class(src[1].File),
-				  (!src[1].
-				   NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
-	    (src[1].RelAddr << 4);
-	inst[3] = 0;
-#else
-	inst[0] =
-	    PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
-			       GL_FALSE,
-			       GL_FALSE,
-			       t_dst_index(vp, &vpi->DstReg),
-			       t_dst_mask(vpi->DstReg.WriteMask),
-			       t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = __CONST(0, SWIZZLE_ONE);
-	inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &src[1]),
-				  t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
-				  t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
-				  t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
-				  t_swizzle(GET_SWZ(src[1].Swizzle, 3)),
-				  t_src_class(src[1].File),
-				  (!src[1].
-				   NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
-	    (src[1].RelAddr << 4);
-#endif
-
-	return inst;
-}
+	outputs |= 1 << VERT_RESULT_HPOS;
+	if (vp->Base.OutputsWritten & (1 << VERT_RESULT_PSIZ))
+		outputs |= 1 << VERT_RESULT_PSIZ;
 
-static GLuint *r300TranslateOpcodeSWZ(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3])
-{
-	//ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{} {ZERO ZERO ZERO ZERO}
-
-	inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = t_src(vp, &src[0]);
-	inst[2] = __CONST(0, SWIZZLE_ZERO);
-	inst[3] = __CONST(0, SWIZZLE_ZERO);
-
-	return inst;
+	return outputs;
 }
 
-static GLuint *r300TranslateOpcodeXPD(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi,
-				      GLuint * inst,
-				      struct prog_src_register src[3],
-				      int *u_temp_i)
-{
-	/* mul r0, r1.yzxw, r2.zxyw
-	   mad r0, -r2.yzxw, r1.zxyw, r0
-	 */
-
-	inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
-				     GL_FALSE,
-				     GL_FALSE,
-				     *u_temp_i,
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     PVS_DST_REG_TEMPORARY);
-	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// Y
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 2)),	// Z
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// X
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// W
-				  t_src_class(src[0].File),
-				  src[0].
-				  NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
-	    (src[0].RelAddr << 4);
-	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &src[1]), t_swizzle(GET_SWZ(src[1].Swizzle, 2)),	// Z
-				  t_swizzle(GET_SWZ(src[1].Swizzle, 0)),	// X
-				  t_swizzle(GET_SWZ(src[1].Swizzle, 1)),	// Y
-				  t_swizzle(GET_SWZ(src[1].Swizzle, 3)),	// W
-				  t_src_class(src[1].File),
-				  src[1].
-				  NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
-	    (src[1].RelAddr << 4);
-	inst[3] = __CONST(1, SWIZZLE_ZERO);
-	inst += 4;
-
-	inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
-				     GL_FALSE,
-				     GL_FALSE,
-				     t_dst_index(vp, &vpi->DstReg),
-				     t_dst_mask(vpi->DstReg.WriteMask),
-				     t_dst_class(vpi->DstReg.File));
-	inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &src[1]), t_swizzle(GET_SWZ(src[1].Swizzle, 1)),	// Y
-				  t_swizzle(GET_SWZ(src[1].Swizzle, 2)),	// Z
-				  t_swizzle(GET_SWZ(src[1].Swizzle, 0)),	// X
-				  t_swizzle(GET_SWZ(src[1].Swizzle, 3)),	// W
-				  t_src_class(src[1].File),
-				  (!src[1].
-				   NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
-	    (src[1].RelAddr << 4);
-	inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 2)),	// Z
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 0)),	// X
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 1)),	// Y
-				  t_swizzle(GET_SWZ(src[0].Swizzle, 3)),	// W
-				  t_src_class(src[0].File),
-				  src[0].
-				  NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
-	    (src[0].RelAddr << 4);
-	inst[3] =
-	    PVS_SRC_OPERAND(*u_temp_i, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y,
-			    PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W,
-			    PVS_SRC_REG_TEMPORARY, VSF_FLAG_NONE);
-
-	(*u_temp_i)--;
-
-	return inst;
-}
 
-static void t_inputs_outputs(struct r300_vertex_program *vp)
+static void t_inputs_outputs(struct r300_vertex_program_compiler * c)
 {
 	int i;
-	int cur_reg = 0;
+	int cur_reg;
+	GLuint OutputsWritten, InputsRead;
 
-	for (i = 0; i < VERT_ATTRIB_MAX; i++)
-		vp->inputs[i] = -1;
+	OutputsWritten = c->Base.Program.OutputsWritten;
+	InputsRead = c->Base.Program.InputsRead;
 
+	cur_reg = -1;
+	for (i = 0; i < VERT_ATTRIB_MAX; i++) {
+		if (InputsRead & (1 << i))
+			c->code->inputs[i] = ++cur_reg;
+		else
+			c->code->inputs[i] = -1;
+	}
+
+	cur_reg = 0;
 	for (i = 0; i < VERT_RESULT_MAX; i++)
-		vp->outputs[i] = -1;
+		c->code->outputs[i] = -1;
 
-	assert(vp->key.OutputsWritten & (1 << VERT_RESULT_HPOS));
+	assert(OutputsWritten & (1 << VERT_RESULT_HPOS));
 
-	if (vp->key.OutputsWritten & (1 << VERT_RESULT_HPOS)) {
-		vp->outputs[VERT_RESULT_HPOS] = cur_reg++;
+	if (OutputsWritten & (1 << VERT_RESULT_HPOS)) {
+		c->code->outputs[VERT_RESULT_HPOS] = cur_reg++;
 	}
 
-	if (vp->key.OutputsWritten & (1 << VERT_RESULT_PSIZ)) {
-		vp->outputs[VERT_RESULT_PSIZ] = cur_reg++;
+	if (OutputsWritten & (1 << VERT_RESULT_PSIZ)) {
+		c->code->outputs[VERT_RESULT_PSIZ] = cur_reg++;
 	}
 
-	if (vp->key.OutputsWritten & (1 << VERT_RESULT_COL0)) {
-		vp->outputs[VERT_RESULT_COL0] = cur_reg++;
+	/* If we're writing back-facing colors we need to send
+	 * four colors to make front/back face color selection work.
+	 * If the vertex program doesn't write all 4 colors, let's
+	 * pretend it does by skipping the output register index so the
+	 * colors get written into the appropriate output vectors.
+	 */
+	if (OutputsWritten & (1 << VERT_RESULT_COL0)) {
+		c->code->outputs[VERT_RESULT_COL0] = cur_reg++;
+	} else if (OutputsWritten & (1 << VERT_RESULT_BFC0) ||
+		OutputsWritten & (1 << VERT_RESULT_BFC1)) {
+		cur_reg++;
 	}
 
-	if (vp->key.OutputsWritten & (1 << VERT_RESULT_COL1)) {
-		vp->outputs[VERT_RESULT_COL1] =
-		    vp->outputs[VERT_RESULT_COL0] + 1;
-		cur_reg = vp->outputs[VERT_RESULT_COL1] + 1;
+	if (OutputsWritten & (1 << VERT_RESULT_COL1)) {
+		c->code->outputs[VERT_RESULT_COL1] = cur_reg++;
+	} else if (OutputsWritten & (1 << VERT_RESULT_BFC0) ||
+		OutputsWritten & (1 << VERT_RESULT_BFC1)) {
+		cur_reg++;
 	}
 
-	if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC0)) {
-		vp->outputs[VERT_RESULT_BFC0] =
-		    vp->outputs[VERT_RESULT_COL0] + 2;
-		cur_reg = vp->outputs[VERT_RESULT_BFC0] + 2;
+	if (OutputsWritten & (1 << VERT_RESULT_BFC0)) {
+		c->code->outputs[VERT_RESULT_BFC0] = cur_reg++;
+	} else if (OutputsWritten & (1 << VERT_RESULT_BFC1)) {
+		cur_reg++;
 	}
 
-	if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC1)) {
-		vp->outputs[VERT_RESULT_BFC1] =
-		    vp->outputs[VERT_RESULT_COL0] + 3;
-		cur_reg = vp->outputs[VERT_RESULT_BFC1] + 1;
+	if (OutputsWritten & (1 << VERT_RESULT_BFC1)) {
+		c->code->outputs[VERT_RESULT_BFC1] = cur_reg++;
+	} else if (OutputsWritten & (1 << VERT_RESULT_BFC0)) {
+		cur_reg++;
 	}
-#if 0
-	if (vp->key.OutputsWritten & (1 << VERT_RESULT_FOGC)) {
-		vp->outputs[VERT_RESULT_FOGC] = cur_reg++;
-	}
-#endif
 
 	for (i = VERT_RESULT_TEX0; i <= VERT_RESULT_TEX7; i++) {
-		if (vp->key.OutputsWritten & (1 << i)) {
-			vp->outputs[i] = cur_reg++;
-		}
-	}
-}
-
-static void r300TranslateVertexShader(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi)
-{
-	int i;
-	GLuint *inst;
-	unsigned long num_operands;
-	/* Initial value should be last tmp reg that hw supports.
-	   Strangely enough r300 doesnt mind even though these would be out of range.
-	   Smart enough to realize that it doesnt need it? */
-	int u_temp_i = VSF_MAX_FRAGMENT_TEMPS - 1;
-	struct prog_src_register src[3];
-
-	vp->pos_end = 0;	/* Not supported yet */
-	vp->program.length = 0;
-	/*vp->num_temporaries=mesa_vp->Base.NumTemporaries; */
-	vp->translated = GL_TRUE;
-	vp->native = GL_TRUE;
-
-	t_inputs_outputs(vp);
-
-	for (inst = vp->program.body.i; vpi->Opcode != OPCODE_END;
-	     vpi++, inst += 4) {
-
-		FREE_TEMPS();
-
-		if (!valid_dst(vp, &vpi->DstReg)) {
-			/* redirect result to unused temp */
-			vpi->DstReg.File = PROGRAM_TEMPORARY;
-			vpi->DstReg.Index = u_temp_i;
-		}
-
-		num_operands = _mesa_num_inst_src_regs(vpi->Opcode);
-
-		/* copy the sources (src) from mesa into a local variable... is this needed? */
-		for (i = 0; i < num_operands; i++) {
-			src[i] = vpi->SrcReg[i];
-		}
-
-		if (num_operands == 3) {	/* TODO: scalars */
-			if (CMP_SRCS(src[1], src[2])
-			    || CMP_SRCS(src[0], src[2])) {
-				inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
-							     GL_FALSE,
-							     GL_FALSE,
-							     u_temp_i,
-							     VSF_FLAG_ALL,
-							     PVS_DST_REG_TEMPORARY);
-				inst[1] =
-				    PVS_SRC_OPERAND(t_src_index(vp, &src[2]),
-						    SWIZZLE_X,
-						    SWIZZLE_Y,
-						    SWIZZLE_Z,
-						    SWIZZLE_W,
-						    t_src_class(src[2].File),
-						    VSF_FLAG_NONE) | (src[2].
-								      RelAddr <<
-								      4);
-				inst[2] = __CONST(2, SWIZZLE_ZERO);
-				inst[3] = __CONST(2, SWIZZLE_ZERO);
-				inst += 4;
-
-				src[2].File = PROGRAM_TEMPORARY;
-				src[2].Index = u_temp_i;
-				src[2].RelAddr = 0;
-				u_temp_i--;
-			}
-		}
-
-		if (num_operands >= 2) {
-			if (CMP_SRCS(src[1], src[0])) {
-				inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
-							     GL_FALSE,
-							     GL_FALSE,
-							     u_temp_i,
-							     VSF_FLAG_ALL,
-							     PVS_DST_REG_TEMPORARY);
-				inst[1] =
-				    PVS_SRC_OPERAND(t_src_index(vp, &src[0]),
-						    SWIZZLE_X,
-						    SWIZZLE_Y,
-						    SWIZZLE_Z,
-						    SWIZZLE_W,
-						    t_src_class(src[0].File),
-						    VSF_FLAG_NONE) | (src[0].
-								      RelAddr <<
-								      4);
-				inst[2] = __CONST(0, SWIZZLE_ZERO);
-				inst[3] = __CONST(0, SWIZZLE_ZERO);
-				inst += 4;
-
-				src[0].File = PROGRAM_TEMPORARY;
-				src[0].Index = u_temp_i;
-				src[0].RelAddr = 0;
-				u_temp_i--;
-			}
-		}
-
-		switch (vpi->Opcode) {
-		case OPCODE_ABS:
-			inst = r300TranslateOpcodeABS(vp, vpi, inst, src);
-			break;
-		case OPCODE_ADD:
-			inst = r300TranslateOpcodeADD(vp, vpi, inst, src);
-			break;
-		case OPCODE_ARL:
-			inst = r300TranslateOpcodeARL(vp, vpi, inst, src);
-			break;
-		case OPCODE_DP3:
-			inst = r300TranslateOpcodeDP3(vp, vpi, inst, src);
-			break;
-		case OPCODE_DP4:
-			inst = r300TranslateOpcodeDP4(vp, vpi, inst, src);
-			break;
-		case OPCODE_DPH:
-			inst = r300TranslateOpcodeDPH(vp, vpi, inst, src);
-			break;
-		case OPCODE_DST:
-			inst = r300TranslateOpcodeDST(vp, vpi, inst, src);
-			break;
-		case OPCODE_EX2:
-			inst = r300TranslateOpcodeEX2(vp, vpi, inst, src);
-			break;
-		case OPCODE_EXP:
-			inst = r300TranslateOpcodeEXP(vp, vpi, inst, src);
-			break;
-		case OPCODE_FLR:
-			inst = r300TranslateOpcodeFLR(vp, vpi, inst, src,	/* FIXME */
-						      &u_temp_i);
-			break;
-		case OPCODE_FRC:
-			inst = r300TranslateOpcodeFRC(vp, vpi, inst, src);
-			break;
-		case OPCODE_LG2:
-			inst = r300TranslateOpcodeLG2(vp, vpi, inst, src);
-			break;
-		case OPCODE_LIT:
-			inst = r300TranslateOpcodeLIT(vp, vpi, inst, src);
-			break;
-		case OPCODE_LOG:
-			inst = r300TranslateOpcodeLOG(vp, vpi, inst, src);
-			break;
-		case OPCODE_MAD:
-			inst = r300TranslateOpcodeMAD(vp, vpi, inst, src);
-			break;
-		case OPCODE_MAX:
-			inst = r300TranslateOpcodeMAX(vp, vpi, inst, src);
-			break;
-		case OPCODE_MIN:
-			inst = r300TranslateOpcodeMIN(vp, vpi, inst, src);
-			break;
-		case OPCODE_MOV:
-			inst = r300TranslateOpcodeMOV(vp, vpi, inst, src);
-			break;
-		case OPCODE_MUL:
-			inst = r300TranslateOpcodeMUL(vp, vpi, inst, src);
-			break;
-		case OPCODE_POW:
-			inst = r300TranslateOpcodePOW(vp, vpi, inst, src);
-			break;
-		case OPCODE_RCP:
-			inst = r300TranslateOpcodeRCP(vp, vpi, inst, src);
-			break;
-		case OPCODE_RSQ:
-			inst = r300TranslateOpcodeRSQ(vp, vpi, inst, src);
-			break;
-		case OPCODE_SGE:
-			inst = r300TranslateOpcodeSGE(vp, vpi, inst, src);
-			break;
-		case OPCODE_SLT:
-			inst = r300TranslateOpcodeSLT(vp, vpi, inst, src);
-			break;
-		case OPCODE_SUB:
-			inst = r300TranslateOpcodeSUB(vp, vpi, inst, src);
-			break;
-		case OPCODE_SWZ:
-			inst = r300TranslateOpcodeSWZ(vp, vpi, inst, src);
-			break;
-		case OPCODE_XPD:
-			inst = r300TranslateOpcodeXPD(vp, vpi, inst, src,	/* FIXME */
-						      &u_temp_i);
-			break;
-		default:
-			assert(0);
-			break;
-		}
-	}
-
-	/* Some outputs may be artificially added, to match the inputs
-	   of the fragment program. Blank the outputs here. */
-	for (i = 0; i < VERT_RESULT_MAX; i++) {
-		if (vp->key.OutputsAdded & (1 << i)) {
-			inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
-						     GL_FALSE,
-						     GL_FALSE,
-						     vp->outputs[i],
-						     VSF_FLAG_ALL,
-						     PVS_DST_REG_OUT);
-			inst[1] = __CONST(0, SWIZZLE_ZERO);
-			inst[2] = __CONST(0, SWIZZLE_ZERO);
-			inst[3] = __CONST(0, SWIZZLE_ZERO);
-			inst += 4;
+		if (OutputsWritten & (1 << i)) {
+			c->code->outputs[i] = cur_reg++;
 		}
 	}
 
-	vp->program.length = (inst - vp->program.body.i);
-	if (vp->program.length >= VSF_MAX_FRAGMENT_LENGTH) {
-		vp->program.length = 0;
-		vp->native = GL_FALSE;
+	if (OutputsWritten & (1 << VERT_RESULT_FOGC)) {
+		c->code->outputs[VERT_RESULT_FOGC] = cur_reg++;
 	}
-#if 0
-	fprintf(stderr, "hw program:\n");
-	for (i = 0; i < vp->program.length; i++)
-		fprintf(stderr, "%08x\n", vp->program.body.d[i]);
-#endif
 }
 
-/* DP4 version seems to trigger some hw peculiarity */
-//#define PREFER_DP4
 
-static void position_invariant(struct gl_program *prog)
+static struct r300_vertex_program *build_program(GLcontext *ctx,
+						 struct r300_vertex_program_key *wanted_key,
+						 const struct gl_vertex_program *mesa_vp)
 {
-	struct prog_instruction *vpi;
-	struct gl_program_parameter_list *paramList;
-	int i;
-
-	gl_state_index tokens[STATE_LENGTH] = { STATE_MVP_MATRIX, 0, 0, 0, 0 };
-
-	/* tokens[4] = matrix modifier */
-#ifdef PREFER_DP4
-	tokens[4] = 0;		/* not transposed or inverted */
-#else
-	tokens[4] = STATE_MATRIX_TRANSPOSE;
-#endif
-	paramList = prog->Parameters;
-
-	vpi = _mesa_alloc_instructions(prog->NumInstructions + 4);
-	_mesa_init_instructions(vpi, prog->NumInstructions + 4);
-
-	for (i = 0; i < 4; i++) {
-		GLint idx;
-		tokens[2] = tokens[3] = i;	/* matrix row[i]..row[i] */
-		idx = _mesa_add_state_reference(paramList, tokens);
-#ifdef PREFER_DP4
-		vpi[i].Opcode = OPCODE_DP4;
-		vpi[i].StringPos = 0;
-		vpi[i].Data = 0;
-
-		vpi[i].DstReg.File = PROGRAM_OUTPUT;
-		vpi[i].DstReg.Index = VERT_RESULT_HPOS;
-		vpi[i].DstReg.WriteMask = 1 << i;
-		vpi[i].DstReg.CondMask = COND_TR;
-
-		vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR;
-		vpi[i].SrcReg[0].Index = idx;
-		vpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-
-		vpi[i].SrcReg[1].File = PROGRAM_INPUT;
-		vpi[i].SrcReg[1].Index = VERT_ATTRIB_POS;
-		vpi[i].SrcReg[1].Swizzle = SWIZZLE_XYZW;
-#else
-		if (i == 0)
-			vpi[i].Opcode = OPCODE_MUL;
-		else
-			vpi[i].Opcode = OPCODE_MAD;
-
-		vpi[i].StringPos = 0;
-		vpi[i].Data = 0;
-
-		if (i == 3)
-			vpi[i].DstReg.File = PROGRAM_OUTPUT;
-		else
-			vpi[i].DstReg.File = PROGRAM_TEMPORARY;
-		vpi[i].DstReg.Index = 0;
-		vpi[i].DstReg.WriteMask = 0xf;
-		vpi[i].DstReg.CondMask = COND_TR;
-
-		vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR;
-		vpi[i].SrcReg[0].Index = idx;
-		vpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-
-		vpi[i].SrcReg[1].File = PROGRAM_INPUT;
-		vpi[i].SrcReg[1].Index = VERT_ATTRIB_POS;
-		vpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(i, i, i, i);
-
-		if (i > 0) {
-			vpi[i].SrcReg[2].File = PROGRAM_TEMPORARY;
-			vpi[i].SrcReg[2].Index = 0;
-			vpi[i].SrcReg[2].Swizzle = SWIZZLE_XYZW;
-		}
-#endif
-	}
-
-	_mesa_copy_instructions(&vpi[i], prog->Instructions,
-				prog->NumInstructions);
-
-	free(prog->Instructions);
-
-	prog->Instructions = vpi;
-
-	prog->NumInstructions += 4;
-	vpi = &prog->Instructions[prog->NumInstructions - 1];
-
-	assert(vpi->Opcode == OPCODE_END);
-}
+	struct r300_vertex_program *vp;
+	struct r300_vertex_program_compiler compiler;
 
-static void insert_wpos(struct r300_vertex_program *vp, struct gl_program *prog,
-			GLuint temp_index)
-{
-	struct prog_instruction *vpi;
-	struct prog_instruction *vpi_insert;
-	int i = 0;
+	vp = _mesa_calloc(sizeof(*vp));
+	vp->Base = (struct gl_vertex_program *) _mesa_clone_program(ctx, &mesa_vp->Base);
+	_mesa_memcpy(&vp->key, wanted_key, sizeof(vp->key));
 
-	vpi = _mesa_alloc_instructions(prog->NumInstructions + 2);
-	_mesa_init_instructions(vpi, prog->NumInstructions + 2);
-	/* all but END */
-	_mesa_copy_instructions(vpi, prog->Instructions,
-				prog->NumInstructions - 1);
-	/* END */
-	_mesa_copy_instructions(&vpi[prog->NumInstructions + 1],
-				&prog->Instructions[prog->NumInstructions - 1],
-				1);
-	vpi_insert = &vpi[prog->NumInstructions - 1];
+	rc_init(&compiler.Base);
+	compiler.Base.Debug = (RADEON_DEBUG & RADEON_VERTS) ? GL_TRUE : GL_FALSE;
 
-	vpi_insert[i].Opcode = OPCODE_MOV;
+	compiler.code = &vp->code;
+	compiler.RequiredOutputs = compute_required_outputs(vp->Base, vp->key.FpReads);
+	compiler.SetHwInputOutput = &t_inputs_outputs;
 
-	vpi_insert[i].DstReg.File = PROGRAM_OUTPUT;
-	vpi_insert[i].DstReg.Index = VERT_RESULT_HPOS;
-	vpi_insert[i].DstReg.WriteMask = WRITEMASK_XYZW;
-	vpi_insert[i].DstReg.CondMask = COND_TR;
+	if (compiler.Base.Debug) {
+		fprintf(stderr, "Initial vertex program:\n");
+		_mesa_print_program(&vp->Base->Base);
+		fflush(stderr);
+	}
 
-	vpi_insert[i].SrcReg[0].File = PROGRAM_TEMPORARY;
-	vpi_insert[i].SrcReg[0].Index = temp_index;
-	vpi_insert[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-	i++;
+	if (mesa_vp->IsPositionInvariant) {
+		_mesa_insert_mvp_code(ctx, vp->Base);
+	}
 
-	vpi_insert[i].Opcode = OPCODE_MOV;
+	rc_mesa_to_rc_program(&compiler.Base, &vp->Base->Base);
 
-	vpi_insert[i].DstReg.File = PROGRAM_OUTPUT;
-	vpi_insert[i].DstReg.Index = VERT_RESULT_TEX0 + vp->wpos_idx;
-	vpi_insert[i].DstReg.WriteMask = WRITEMASK_XYZW;
-	vpi_insert[i].DstReg.CondMask = COND_TR;
+	rc_move_output(&compiler.Base, VERT_RESULT_PSIZ, VERT_RESULT_PSIZ, WRITEMASK_X);
 
-	vpi_insert[i].SrcReg[0].File = PROGRAM_TEMPORARY;
-	vpi_insert[i].SrcReg[0].Index = temp_index;
-	vpi_insert[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-	i++;
+	if (vp->key.WPosAttr != FRAG_ATTRIB_MAX) {
+		rc_copy_output(&compiler.Base,
+			VERT_RESULT_HPOS,
+			vp->key.WPosAttr - FRAG_ATTRIB_TEX0 + VERT_RESULT_TEX0);
+	}
 
-	free(prog->Instructions);
+	if (vp->key.FogAttr != FRAG_ATTRIB_MAX) {
+		rc_move_output(&compiler.Base,
+			VERT_RESULT_FOGC,
+			vp->key.FogAttr - FRAG_ATTRIB_TEX0 + VERT_RESULT_TEX0, WRITEMASK_X);
+	}
 
-	prog->Instructions = vpi;
+	r3xx_compile_vertex_program(&compiler);
+	vp->error = compiler.Base.Error;
 
-	prog->NumInstructions += i;
-	vpi = &prog->Instructions[prog->NumInstructions - 1];
+	vp->Base->Base.InputsRead = vp->code.InputsRead;
+	vp->Base->Base.OutputsWritten = vp->code.OutputsWritten;
 
-	assert(vpi->Opcode == OPCODE_END);
-}
+	rc_destroy(&compiler.Base);
 
-static void pos_as_texcoord(struct r300_vertex_program *vp,
-			    struct gl_program *prog)
-{
-	struct prog_instruction *vpi;
-	GLuint tempregi = prog->NumTemporaries;
-	/* should do something else if no temps left... */
-	prog->NumTemporaries++;
-
-	for (vpi = prog->Instructions; vpi->Opcode != OPCODE_END; vpi++) {
-		if (vpi->DstReg.File == PROGRAM_OUTPUT
-		    && vpi->DstReg.Index == VERT_RESULT_HPOS) {
-			vpi->DstReg.File = PROGRAM_TEMPORARY;
-			vpi->DstReg.Index = tempregi;
-		}
-	}
-	insert_wpos(vp, prog, tempregi);
+	return vp;
 }
 
-static struct r300_vertex_program *build_program(struct r300_vertex_program_key
-						 *wanted_key, struct gl_vertex_program
-						 *mesa_vp, GLint wpos_idx)
+struct r300_vertex_program * r300SelectAndTranslateVertexShader(GLcontext *ctx)
 {
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_vertex_program_key wanted_key = { 0 };
+	struct r300_vertex_program_cont *vpc;
 	struct r300_vertex_program *vp;
 
-	vp = _mesa_calloc(sizeof(*vp));
-	_mesa_memcpy(&vp->key, wanted_key, sizeof(vp->key));
-	vp->wpos_idx = wpos_idx;
-
-	if (mesa_vp->IsPositionInvariant) {
-		position_invariant(&mesa_vp->Base);
-	}
+	vpc = (struct r300_vertex_program_cont *)ctx->VertexProgram._Current;
+	wanted_key.FpReads = r300->selected_fp->InputsRead;
+	wanted_key.FogAttr = r300->selected_fp->fog_attr;
+	wanted_key.WPosAttr = r300->selected_fp->wpos_attr;
 
-	if (wpos_idx > -1) {
-		pos_as_texcoord(vp, &mesa_vp->Base);
+	for (vp = vpc->progs; vp; vp = vp->next) {
+		if (_mesa_memcmp(&vp->key, &wanted_key, sizeof(wanted_key))
+		    == 0) {
+			return r300->selected_vp = vp;
+		}
 	}
 
-	assert(mesa_vp->Base.NumInstructions);
-	vp->num_temporaries = mesa_vp->Base.NumTemporaries;
-	r300TranslateVertexShader(vp, mesa_vp->Base.Instructions);
+	vp = build_program(ctx, &wanted_key, &vpc->mesa_program);
+	vp->next = vpc->progs;
+	vpc->progs = vp;
 
-	return vp;
+	return r300->selected_vp = vp;
 }
 
-static void add_outputs(struct r300_vertex_program_key *key, GLint vert)
+#define bump_vpu_count(ptr, new_count)   do { \
+		drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr)); \
+		int _nc=(new_count)/4; \
+		assert(_nc < 256); \
+		if(_nc>_p->vpu.count)_p->vpu.count=_nc; \
+	} while(0)
+
+static void r300EmitVertexProgram(r300ContextPtr r300, int dest, struct r300_vertex_program_code *code)
 {
-	if (key->OutputsWritten & (1 << vert))
-		return;
+	int i;
 
-	key->OutputsWritten |= 1 << vert;
-	key->OutputsAdded |= 1 << vert;
-}
+	assert((code->length > 0) && (code->length % 4 == 0));
 
-void r300SelectVertexShader(r300ContextPtr r300)
-{
-	GLcontext *ctx = ctx = r300->radeon.glCtx;
-	GLuint InputsRead;
-	struct r300_vertex_program_key wanted_key = { 0 };
-	GLint i;
-	struct r300_vertex_program_cont *vpc;
-	struct r300_vertex_program *vp;
-	GLint wpos_idx;
+	R300_STATECHANGE( r300, vap_flush );
 
-	vpc = (struct r300_vertex_program_cont *)ctx->VertexProgram._Current;
-	wanted_key.InputsRead = vpc->mesa_program.Base.InputsRead;
-	wanted_key.OutputsWritten = vpc->mesa_program.Base.OutputsWritten;
-	InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
-
-	wpos_idx = -1;
-	if (InputsRead & FRAG_BIT_WPOS) {
-		for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
-			if (!(InputsRead & (FRAG_BIT_TEX0 << i)))
-				break;
-
-		if (i == ctx->Const.MaxTextureUnits) {
-			fprintf(stderr, "\tno free texcoord found\n");
+	switch ((dest >> 8) & 0xf) {
+		case 0:
+			R300_STATECHANGE(r300, vpi);
+			for (i = 0; i < code->length; i++)
+				r300->hw.vpi.cmd[R300_VPI_INSTR_0 + i + 4 * (dest & 0xff)] = (code->body.d[i]);
+			bump_vpu_count(r300->hw.vpi.cmd, code->length + 4 * (dest & 0xff));
+			break;
+		case 2:
+			R300_STATECHANGE(r300, vpp);
+			for (i = 0; i < code->length; i++)
+				r300->hw.vpp.cmd[R300_VPP_PARAM_0 + i + 4 * (dest & 0xff)] = (code->body.d[i]);
+			bump_vpu_count(r300->hw.vpp.cmd, code->length + 4 * (dest & 0xff));
+			break;
+		case 4:
+			R300_STATECHANGE(r300, vps);
+			for (i = 0; i < code->length; i++)
+				r300->hw.vps.cmd[1 + i + 4 * (dest & 0xff)] = (code->body.d[i]);
+			bump_vpu_count(r300->hw.vps.cmd, code->length + 4 * (dest & 0xff));
+			break;
+		default:
+			fprintf(stderr, "%s:%s don't know how to handle dest %04x\n", __FILE__, __FUNCTION__, dest);
 			_mesa_exit(-1);
-		}
-
-		wanted_key.OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
-		wpos_idx = i;
 	}
+}
 
-	add_outputs(&wanted_key, VERT_RESULT_HPOS);
+void r300SetupVertexProgram(r300ContextPtr rmesa)
+{
+	GLcontext *ctx = rmesa->radeon.glCtx;
+	struct r300_vertex_program *prog = rmesa->selected_vp;
+	int inst_count = 0;
+	int param_count = 0;
 
-	if (InputsRead & FRAG_BIT_COL0) {
-		add_outputs(&wanted_key, VERT_RESULT_COL0);
-	}
+	/* Reset state, in case we don't use something */
+	((drm_r300_cmd_header_t *) rmesa->hw.vpp.cmd)->vpu.count = 0;
+	((drm_r300_cmd_header_t *) rmesa->hw.vpi.cmd)->vpu.count = 0;
+	((drm_r300_cmd_header_t *) rmesa->hw.vps.cmd)->vpu.count = 0;
 
-	if (InputsRead & FRAG_BIT_COL1) {
-		add_outputs(&wanted_key, VERT_RESULT_COL1);
-	}
+	R300_STATECHANGE(rmesa, vap_flush);
+	R300_STATECHANGE(rmesa, vpp);
+	param_count = r300VertexProgUpdateParams(ctx, prog, (float *)&rmesa->hw.vpp.cmd[R300_VPP_PARAM_0]);
+	bump_vpu_count(rmesa->hw.vpp.cmd, param_count);
+	param_count /= 4;
 
-	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
-			add_outputs(&wanted_key, VERT_RESULT_TEX0 + i);
-		}
-	}
+	r300EmitVertexProgram(rmesa, R300_PVS_CODE_START, &(prog->code));
+	inst_count = (prog->code.length / 4) - 1;
 
-	if (vpc->mesa_program.IsPositionInvariant) {
-		/* we wan't position don't we ? */
-		wanted_key.InputsRead |= (1 << VERT_ATTRIB_POS);
-	}
+	r300VapCntl(rmesa, _mesa_bitcount(prog->code.InputsRead),
+				 _mesa_bitcount(prog->code.OutputsWritten), prog->code.num_temporaries);
 
-	for (vp = vpc->progs; vp; vp = vp->next)
-		if (_mesa_memcmp(&vp->key, &wanted_key, sizeof(wanted_key))
-		    == 0) {
-			r300->selected_vp = vp;
-			return;
-		}
-	//_mesa_print_program(&vpc->mesa_program.Base);
+	R300_STATECHANGE(rmesa, pvs);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] = (0 << R300_PVS_FIRST_INST_SHIFT) | (inst_count << R300_PVS_XYZW_VALID_INST_SHIFT) |
+				(inst_count << R300_PVS_LAST_INST_SHIFT);
 
-	vp = build_program(&wanted_key, &vpc->mesa_program, wpos_idx);
-	vp->next = vpc->progs;
-	vpc->progs = vp;
-	r300->selected_vp = vp;
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] = (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) | (param_count << R300_PVS_MAX_CONST_ADDR_SHIFT);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] = (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
 }
diff --git a/src/mesa/drivers/dri/r300/r300_vertprog.h b/src/mesa/drivers/dri/r300/r300_vertprog.h
index 2f35f02bc8..ccec896be4 100644
--- a/src/mesa/drivers/dri/r300/r300_vertprog.h
+++ b/src/mesa/drivers/dri/r300/r300_vertprog.h
@@ -3,33 +3,9 @@
 
 #include "r300_reg.h"
 
-#define PVS_OP_DST_OPERAND(opcode, math_inst, macro_inst, reg_index, reg_writemask, reg_class)	\
-	 (((opcode & PVS_DST_OPCODE_MASK) << PVS_DST_OPCODE_SHIFT)	\
-	 | ((math_inst & PVS_DST_MATH_INST_MASK) << PVS_DST_MATH_INST_SHIFT)	\
-	 | ((macro_inst & PVS_DST_MACRO_INST_MASK) << PVS_DST_MACRO_INST_SHIFT)	\
-	 | ((reg_index & PVS_DST_OFFSET_MASK) << PVS_DST_OFFSET_SHIFT)	\
-	 | ((reg_writemask & 0xf) << PVS_DST_WE_X_SHIFT)	/* X Y Z W */	\
-	 | ((reg_class & PVS_DST_REG_TYPE_MASK) << PVS_DST_REG_TYPE_SHIFT))
 
-#define PVS_SRC_OPERAND(in_reg_index, comp_x, comp_y, comp_z, comp_w, reg_class, negate)	\
-	(((in_reg_index & PVS_SRC_OFFSET_MASK) << PVS_SRC_OFFSET_SHIFT)				\
-	 | ((comp_x & PVS_SRC_SWIZZLE_X_MASK) << PVS_SRC_SWIZZLE_X_SHIFT)			\
-	 | ((comp_y & PVS_SRC_SWIZZLE_Y_MASK) << PVS_SRC_SWIZZLE_Y_SHIFT)			\
-	 | ((comp_z & PVS_SRC_SWIZZLE_Z_MASK) << PVS_SRC_SWIZZLE_Z_SHIFT)			\
-	 | ((comp_w & PVS_SRC_SWIZZLE_W_MASK) << PVS_SRC_SWIZZLE_W_SHIFT)			\
-	 | ((negate & 0xf) << PVS_SRC_MODIFIER_X_SHIFT)	/* X Y Z W */				\
-	 | ((reg_class & PVS_SRC_REG_TYPE_MASK) << PVS_SRC_REG_TYPE_SHIFT))
+void r300SetupVertexProgram(r300ContextPtr rmesa);
 
-#if 1
-
-#define VSF_FLAG_X	1
-#define VSF_FLAG_Y	2
-#define VSF_FLAG_Z	4
-#define VSF_FLAG_W	8
-#define VSF_FLAG_XYZ	(VSF_FLAG_X | VSF_FLAG_Y | VSF_FLAG_Z)
-#define VSF_FLAG_ALL  0xf
-#define VSF_FLAG_NONE  0
-
-#endif
+struct r300_vertex_program * r300SelectAndTranslateVertexShader(GLcontext *ctx);
 
 #endif
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c
deleted file mode 100644
index 75dae86fa8..0000000000
--- a/src/mesa/drivers/dri/r300/r500_fragprog.c
+++ /dev/null
@@ -1,700 +0,0 @@
-/*
- * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
- *
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial
- * portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#include "r500_fragprog.h"
-
-#include "radeon_nqssadce.h"
-#include "radeon_program_alu.h"
-
-
-static struct prog_src_register shadow_ambient(struct gl_program *program, int tmu)
-{
-	gl_state_index fail_value_tokens[STATE_LENGTH] = {
-		STATE_INTERNAL, STATE_SHADOW_AMBIENT, 0, 0, 0
-	};
-	struct prog_src_register reg = { 0, };
-
-	fail_value_tokens[2] = tmu;
-	reg.File = PROGRAM_STATE_VAR;
-	reg.Index = _mesa_add_state_reference(program->Parameters, fail_value_tokens);
-	reg.Swizzle = SWIZZLE_WWWW;
-	return reg;
-}
-
-/**
- * Transform TEX, TXP, TXB, and KIL instructions in the following way:
- *  - premultiply texture coordinates for RECT
- *  - extract operand swizzles
- *  - introduce a temporary register when write masks are needed
- *
- */
-static GLboolean transform_TEX(
-	struct radeon_transform_context *t,
-	struct prog_instruction* orig_inst, void* data)
-{
-	struct r500_fragment_program_compiler *compiler =
-		(struct r500_fragment_program_compiler*)data;
-	struct prog_instruction inst = *orig_inst;
-	struct prog_instruction* tgt;
-	GLboolean destredirect = GL_FALSE;
-
-	if (inst.Opcode != OPCODE_TEX &&
-	    inst.Opcode != OPCODE_TXB &&
-	    inst.Opcode != OPCODE_TXP &&
-	    inst.Opcode != OPCODE_KIL)
-		return GL_FALSE;
-
-	/* ARB_shadow & EXT_shadow_funcs */
-	if (inst.Opcode != OPCODE_KIL &&
-	    t->Program->ShadowSamplers & (1 << inst.TexSrcUnit)) {
-		GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func;
-
-		if (comparefunc == GL_NEVER || comparefunc == GL_ALWAYS) {
-			tgt = radeonAppendInstructions(t->Program, 1);
-
-			tgt->Opcode = OPCODE_MOV;
-			tgt->DstReg = inst.DstReg;
-			if (comparefunc == GL_ALWAYS) {
-				tgt->SrcReg[0].File = PROGRAM_BUILTIN;
-				tgt->SrcReg[0].Swizzle = SWIZZLE_1111;
-			} else {
-				tgt->SrcReg[0] = shadow_ambient(t->Program, inst.TexSrcUnit);
-			}
-			return GL_TRUE;
-		}
-
-		inst.DstReg.File = PROGRAM_TEMPORARY;
-		inst.DstReg.Index = radeonFindFreeTemporary(t);
-		inst.DstReg.WriteMask = WRITEMASK_XYZW;
-	} else if (inst.Opcode != OPCODE_KIL && inst.DstReg.File != PROGRAM_TEMPORARY) {
-		int tempreg = radeonFindFreeTemporary(t);
-
-		inst.DstReg.File = PROGRAM_TEMPORARY;
-		inst.DstReg.Index = tempreg;
-		inst.DstReg.WriteMask = WRITEMASK_XYZW;
-		destredirect = GL_TRUE;
-	}
-
-	tgt = radeonAppendInstructions(t->Program, 1);
-	_mesa_copy_instructions(tgt, &inst, 1);
-
-	if (inst.Opcode != OPCODE_KIL &&
-	    t->Program->ShadowSamplers & (1 << inst.TexSrcUnit)) {
-		GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func;
-		GLuint depthmode = compiler->fp->state.unit[inst.TexSrcUnit].depth_texture_mode;
-		int rcptemp = radeonFindFreeTemporary(t);
-		int pass, fail;
-
-		tgt = radeonAppendInstructions(t->Program, 3);
-
-		tgt[0].Opcode = OPCODE_RCP;
-		tgt[0].DstReg.File = PROGRAM_TEMPORARY;
-		tgt[0].DstReg.Index = rcptemp;
-		tgt[0].DstReg.WriteMask = WRITEMASK_W;
-		tgt[0].SrcReg[0] = inst.SrcReg[0];
-		tgt[0].SrcReg[0].Swizzle = SWIZZLE_WWWW;
-
-		tgt[1].Opcode = OPCODE_MAD;
-		tgt[1].DstReg = inst.DstReg;
-		tgt[1].DstReg.WriteMask = orig_inst->DstReg.WriteMask;
-		tgt[1].SrcReg[0] = inst.SrcReg[0];
-		tgt[1].SrcReg[0].Swizzle = SWIZZLE_ZZZZ;
-		tgt[1].SrcReg[1].File = PROGRAM_TEMPORARY;
-		tgt[1].SrcReg[1].Index = rcptemp;
-		tgt[1].SrcReg[1].Swizzle = SWIZZLE_WWWW;
-		tgt[1].SrcReg[2].File = PROGRAM_TEMPORARY;
-		tgt[1].SrcReg[2].Index = inst.DstReg.Index;
-		if (depthmode == 0) /* GL_LUMINANCE */
-			tgt[1].SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z);
-		else if (depthmode == 2) /* GL_ALPHA */
-			tgt[1].SrcReg[2].Swizzle = SWIZZLE_WWWW;
-
-		/* Recall that SrcReg[0] is tex, SrcReg[2] is r and:
-		 *   r  < tex  <=>      -tex+r < 0
-		 *   r >= tex  <=> not (-tex+r < 0 */
-		if (comparefunc == GL_LESS || comparefunc == GL_GEQUAL)
-			tgt[1].SrcReg[2].NegateBase = tgt[0].SrcReg[2].NegateBase ^ NEGATE_XYZW;
-		else
-			tgt[1].SrcReg[0].NegateBase = tgt[0].SrcReg[0].NegateBase ^ NEGATE_XYZW;
-
-		tgt[2].Opcode = OPCODE_CMP;
-		tgt[2].DstReg = orig_inst->DstReg;
-		tgt[2].SrcReg[0].File = PROGRAM_TEMPORARY;
-		tgt[2].SrcReg[0].Index = tgt[1].DstReg.Index;
-
-		if (comparefunc == GL_LESS || comparefunc == GL_GREATER) {
-			pass = 1;
-			fail = 2;
-		} else {
-			pass = 2;
-			fail = 1;
-		}
-
-		tgt[2].SrcReg[pass].File = PROGRAM_BUILTIN;
-		tgt[2].SrcReg[pass].Swizzle = SWIZZLE_1111;
-		tgt[2].SrcReg[fail] = shadow_ambient(t->Program, inst.TexSrcUnit);
-	} else if (destredirect) {
-		tgt = radeonAppendInstructions(t->Program, 1);
-
-		tgt->Opcode = OPCODE_MOV;
-		tgt->DstReg = orig_inst->DstReg;
-		tgt->SrcReg[0].File = PROGRAM_TEMPORARY;
-		tgt->SrcReg[0].Index = inst.DstReg.Index;
-	}
-
-	return GL_TRUE;
-}
-
-
-static void update_params(r300ContextPtr r300, struct r500_fragment_program *fp)
-{
-	struct gl_fragment_program *mp = &fp->mesa_program;
-
-	/* Ask Mesa nicely to fill in ParameterValues for us */
-	if (mp->Base.Parameters)
-		_mesa_load_state_parameters(r300->radeon.glCtx, mp->Base.Parameters);
-}
-
-
-/**
- * Transform the program to support fragment.position.
- *
- * Introduce a small fragment at the start of the program that will be
- * the only code that directly reads the FRAG_ATTRIB_WPOS input.
- * All other code pieces that reference that input will be rewritten
- * to read from a newly allocated temporary.
- *
- * \todo if/when r5xx supports the radeon_program architecture, this is a
- * likely candidate for code sharing.
- */
-static void insert_WPOS_trailer(struct r500_fragment_program_compiler *compiler)
-{
-	GLuint InputsRead = compiler->fp->mesa_program.Base.InputsRead;
-
-	if (!(InputsRead & FRAG_BIT_WPOS))
-		return;
-
-	static gl_state_index tokens[STATE_LENGTH] = {
-		STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
-	};
-	struct prog_instruction *fpi;
-	GLuint window_index;
-	int i = 0;
-	GLuint tempregi = _mesa_find_free_register(compiler->program, PROGRAM_TEMPORARY);
-
-	_mesa_insert_instructions(compiler->program, 0, 3);
-	fpi = compiler->program->Instructions;
-
-	/* perspective divide */
-	fpi[i].Opcode = OPCODE_RCP;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_W;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	fpi[i].Opcode = OPCODE_MUL;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-
-	fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[1].Index = tempregi;
-	fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	/* viewport transformation */
-	window_index = _mesa_add_state_reference(compiler->program->Parameters, tokens);
-
-	fpi[i].Opcode = OPCODE_MAD;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[0].Index = tempregi;
-	fpi[i].SrcReg[0].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[1].Index = window_index;
-	fpi[i].SrcReg[1].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[2].Index = window_index;
-	fpi[i].SrcReg[2].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-	i++;
-
-	for (; i < compiler->program->NumInstructions; ++i) {
-		int reg;
-		for (reg = 0; reg < 3; reg++) {
-			if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT &&
-			    fpi[i].SrcReg[reg].Index == FRAG_ATTRIB_WPOS) {
-				fpi[i].SrcReg[reg].File = PROGRAM_TEMPORARY;
-				fpi[i].SrcReg[reg].Index = tempregi;
-			}
-		}
-	}
-}
-
-
-static void nqssadce_init(struct nqssadce_state* s)
-{
-	s->Outputs[FRAG_RESULT_COLR].Sourced = WRITEMASK_XYZW;
-	s->Outputs[FRAG_RESULT_DEPR].Sourced = WRITEMASK_W;
-}
-
-static GLboolean is_native_swizzle(GLuint opcode, struct prog_src_register reg)
-{
-	GLuint relevant;
-	int i;
-
-	if (opcode == OPCODE_TEX ||
-	    opcode == OPCODE_TXB ||
-	    opcode == OPCODE_TXP ||
-	    opcode == OPCODE_KIL) {
-		if (reg.Abs)
-			return GL_FALSE;
-
-		if (reg.NegateAbs)
-			reg.NegateBase ^= 15;
-
-		if (opcode == OPCODE_KIL) {
-			if (reg.Swizzle != SWIZZLE_NOOP)
-				return GL_FALSE;
-		} else {
-			for(i = 0; i < 4; ++i) {
-				GLuint swz = GET_SWZ(reg.Swizzle, i);
-				if (swz == SWIZZLE_NIL) {
-					reg.NegateBase &= ~(1 << i);
-					continue;
-				}
-				if (swz >= 4)
-					return GL_FALSE;
-			}
-		}
-
-		if (reg.NegateBase)
-			return GL_FALSE;
-
-		return GL_TRUE;
-	} else if (opcode == OPCODE_DDX || opcode == OPCODE_DDY) {
-		/* DDX/MDH and DDY/MDV explicitly ignore incoming swizzles;
-		 * if it doesn't fit perfectly into a .xyzw case... */
-		if (reg.Swizzle == SWIZZLE_NOOP && !reg.Abs
-				&& !reg.NegateBase && !reg.NegateAbs)
-			return GL_TRUE;
-
-		return GL_FALSE;
-	} else {
-		/* ALU instructions support almost everything */
-		if (reg.Abs)
-			return GL_TRUE;
-
-		relevant = 0;
-		for(i = 0; i < 3; ++i) {
-			GLuint swz = GET_SWZ(reg.Swizzle, i);
-			if (swz != SWIZZLE_NIL && swz != SWIZZLE_ZERO)
-				relevant |= 1 << i;
-		}
-		if ((reg.NegateBase & relevant) && ((reg.NegateBase & relevant) != relevant))
-			return GL_FALSE;
-
-		return GL_TRUE;
-	}
-}
-
-/**
- * Implement a MOV with a potentially non-native swizzle.
- *
- * The only thing we *cannot* do in an ALU instruction is per-component
- * negation. Therefore, we split the MOV into two instructions when necessary.
- */
-static void nqssadce_build_swizzle(struct nqssadce_state *s,
-	struct prog_dst_register dst, struct prog_src_register src)
-{
-	struct prog_instruction *inst;
-	GLuint negatebase[2] = { 0, 0 };
-	int i;
-
-	for(i = 0; i < 4; ++i) {
-		GLuint swz = GET_SWZ(src.Swizzle, i);
-		if (swz == SWIZZLE_NIL)
-			continue;
-		negatebase[GET_BIT(src.NegateBase, i)] |= 1 << i;
-	}
-
-	_mesa_insert_instructions(s->Program, s->IP, (negatebase[0] ? 1 : 0) + (negatebase[1] ? 1 : 0));
-	inst = s->Program->Instructions + s->IP;
-
-	for(i = 0; i <= 1; ++i) {
-		if (!negatebase[i])
-			continue;
-
-		inst->Opcode = OPCODE_MOV;
-		inst->DstReg = dst;
-		inst->DstReg.WriteMask = negatebase[i];
-		inst->SrcReg[0] = src;
-		inst++;
-		s->IP++;
-	}
-}
-
-static GLuint build_dtm(GLuint depthmode)
-{
-	switch(depthmode) {
-	default:
-	case GL_LUMINANCE: return 0;
-	case GL_INTENSITY: return 1;
-	case GL_ALPHA: return 2;
-	}
-}
-
-static GLuint build_func(GLuint comparefunc)
-{
-	return comparefunc - GL_NEVER;
-}
-
-
-/**
- * Collect all external state that is relevant for compiling the given
- * fragment program.
- */
-static void build_state(
-	r300ContextPtr r300,
-	struct r500_fragment_program *fp,
-	struct r500_fragment_program_external_state *state)
-{
-	int unit;
-
-	_mesa_bzero(state, sizeof(*state));
-
-	for(unit = 0; unit < 16; ++unit) {
-		if (fp->mesa_program.Base.ShadowSamplers & (1 << unit)) {
-			struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
-
-			state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
-			state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
-		}
-	}
-}
-
-static void dump_program(struct r500_fragment_program_code *code);
-
-void r500TranslateFragmentShader(r300ContextPtr r300,
-				 struct r500_fragment_program *fp)
-{
-	struct r500_fragment_program_external_state state;
-
-	build_state(r300, fp, &state);
-	if (_mesa_memcmp(&fp->state, &state, sizeof(state))) {
-		/* TODO: cache compiled programs */
-		fp->translated = GL_FALSE;
-		_mesa_memcpy(&fp->state, &state, sizeof(state));
-	}
-
-	if (!fp->translated) {
-		struct r500_fragment_program_compiler compiler;
-
-		compiler.r300 = r300;
-		compiler.fp = fp;
-		compiler.code = &fp->code;
-		compiler.program = _mesa_clone_program(r300->radeon.glCtx, &fp->mesa_program.Base);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Compiler: Initial program:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		insert_WPOS_trailer(&compiler);
-
-		struct radeon_program_transformation transformations[] = {
-			{ &transform_TEX, &compiler },
-			{ &radeonTransformALU, 0 },
-			{ &radeonTransformDeriv, 0 },
-			{ &radeonTransformTrigScale, 0 }
-		};
-		radeonLocalTransform(r300->radeon.glCtx, compiler.program,
-			4, transformations);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Compiler: after native rewrite:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		struct radeon_nqssadce_descr nqssadce = {
-			.Init = &nqssadce_init,
-			.IsNativeSwizzle = &is_native_swizzle,
-			.BuildSwizzle = &nqssadce_build_swizzle,
-			.RewriteDepthOut = GL_TRUE
-		};
-		radeonNqssaDce(r300->radeon.glCtx, compiler.program, &nqssadce);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Compiler: after NqSSA-DCE:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		fp->translated = r500FragmentProgramEmit(&compiler);
-
-		/* Subtle: Rescue any parameters that have been added during transformations */
-		_mesa_free_parameter_list(fp->mesa_program.Base.Parameters);
-		fp->mesa_program.Base.Parameters = compiler.program->Parameters;
-		compiler.program->Parameters = 0;
-
-		_mesa_reference_program(r300->radeon.glCtx, &compiler.program, 0);
-
-		r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			if (fp->translated) {
-				_mesa_printf("Machine-readable code:\n");
-				dump_program(&fp->code);
-			}
-		}
-
-	}
-
-	update_params(r300, fp);
-
-}
-
-static char *toswiz(int swiz_val) {
-  switch(swiz_val) {
-  case 0: return "R";
-  case 1: return "G";
-  case 2: return "B";
-  case 3: return "A";
-  case 4: return "0";
-  case 5: return "1/2";
-  case 6: return "1";
-  case 7: return "U";
-  }
-  return NULL;
-}
-
-static char *toop(int op_val)
-{
-  char *str = NULL;
-  switch (op_val) {
-  case 0: str = "MAD"; break;
-  case 1: str = "DP3"; break;
-  case 2: str = "DP4"; break;
-  case 3: str = "D2A"; break;
-  case 4: str = "MIN"; break;
-  case 5: str = "MAX"; break;
-  case 6: str = "Reserved"; break;
-  case 7: str = "CND"; break;
-  case 8: str = "CMP"; break;
-  case 9: str = "FRC"; break;
-  case 10: str = "SOP"; break;
-  case 11: str = "MDH"; break;
-  case 12: str = "MDV"; break;
-  }
-  return str;
-}
-
-static char *to_alpha_op(int op_val)
-{
-  char *str = NULL;
-  switch (op_val) {
-  case 0: str = "MAD"; break;
-  case 1: str = "DP"; break;
-  case 2: str = "MIN"; break;
-  case 3: str = "MAX"; break;
-  case 4: str = "Reserved"; break;
-  case 5: str = "CND"; break;
-  case 6: str = "CMP"; break;
-  case 7: str = "FRC"; break;
-  case 8: str = "EX2"; break;
-  case 9: str = "LN2"; break;
-  case 10: str = "RCP"; break;
-  case 11: str = "RSQ"; break;
-  case 12: str = "SIN"; break;
-  case 13: str = "COS"; break;
-  case 14: str = "MDH"; break;
-  case 15: str = "MDV"; break;
-  }
-  return str;
-}
-
-static char *to_mask(int val)
-{
-  char *str = NULL;
-  switch(val) {
-  case 0: str = "NONE"; break;
-  case 1: str = "R"; break;
-  case 2: str = "G"; break;
-  case 3: str = "RG"; break;
-  case 4: str = "B"; break;
-  case 5: str = "RB"; break;
-  case 6: str = "GB"; break;
-  case 7: str = "RGB"; break;
-  case 8: str = "A"; break;
-  case 9: str = "AR"; break;
-  case 10: str = "AG"; break;
-  case 11: str = "ARG"; break;
-  case 12: str = "AB"; break;
-  case 13: str = "ARB"; break;
-  case 14: str = "AGB"; break;
-  case 15: str = "ARGB"; break;
-  }
-  return str;
-}
-
-static char *to_texop(int val)
-{
-  switch(val) {
-  case 0: return "NOP";
-  case 1: return "LD";
-  case 2: return "TEXKILL";
-  case 3: return "PROJ";
-  case 4: return "LODBIAS";
-  case 5: return "LOD";
-  case 6: return "DXDY";
-  }
-  return NULL;
-}
-
-static void dump_program(struct r500_fragment_program_code *code)
-{
-
-  fprintf(stderr, "R500 Fragment Program:\n--------\n");
-
-  int n;
-  uint32_t inst;
-  uint32_t inst0;
-  char *str = NULL;
-
-  if (code->const_nr) {
-    fprintf(stderr, "--------\nConstants:\n");
-    for (n = 0; n < code->const_nr; n++) {
-      fprintf(stderr, "Constant %d: %i[%i]\n", n,
-        code->constant[n].File, code->constant[n].Index);
-    }
-    fprintf(stderr, "--------\n");
-  }
-
-  for (n = 0; n < code->inst_end+1; n++) {
-    inst0 = inst = code->inst[n].inst0;
-    fprintf(stderr,"%d\t0:CMN_INST   0x%08x:", n, inst);
-    switch(inst & 0x3) {
-    case R500_INST_TYPE_ALU: str = "ALU"; break;
-    case R500_INST_TYPE_OUT: str = "OUT"; break;
-    case R500_INST_TYPE_FC: str = "FC"; break;
-    case R500_INST_TYPE_TEX: str = "TEX"; break;
-    };
-    fprintf(stderr,"%s %s %s %s %s ", str,
-	    inst & R500_INST_TEX_SEM_WAIT ? "TEX_WAIT" : "",
-	    inst & R500_INST_LAST ? "LAST" : "",
-	    inst & R500_INST_NOP ? "NOP" : "",
-	    inst & R500_INST_ALU_WAIT ? "ALU WAIT" : "");
-    fprintf(stderr,"wmask: %s omask: %s\n", to_mask((inst >> 11) & 0xf),
-	    to_mask((inst >> 15) & 0xf));
-
-    switch(inst0 & 0x3) {
-    case 0:
-    case 1:
-      fprintf(stderr,"\t1:RGB_ADDR   0x%08x:", code->inst[n].inst1);
-      inst = code->inst[n].inst1;
-
-      fprintf(stderr,"Addr0: %d%c, Addr1: %d%c, Addr2: %d%c, srcp:%d\n",
-	      inst & 0xff, (inst & (1<<8)) ? 'c' : 't',
-	      (inst >> 10) & 0xff, (inst & (1<<18)) ? 'c' : 't',
-	      (inst >> 20) & 0xff, (inst & (1<<28)) ? 'c' : 't',
-	      (inst >> 30));
-
-      fprintf(stderr,"\t2:ALPHA_ADDR 0x%08x:", code->inst[n].inst2);
-      inst = code->inst[n].inst2;
-      fprintf(stderr,"Addr0: %d%c, Addr1: %d%c, Addr2: %d%c, srcp:%d\n",
-	      inst & 0xff, (inst & (1<<8)) ? 'c' : 't',
-	      (inst >> 10) & 0xff, (inst & (1<<18)) ? 'c' : 't',
-	      (inst >> 20) & 0xff, (inst & (1<<28)) ? 'c' : 't',
-	      (inst >> 30));
-      fprintf(stderr,"\t3 RGB_INST:  0x%08x:", code->inst[n].inst3);
-      inst = code->inst[n].inst3;
-      fprintf(stderr,"rgb_A_src:%d %s/%s/%s %d rgb_B_src:%d %s/%s/%s %d\n",
-	      (inst) & 0x3, toswiz((inst >> 2) & 0x7), toswiz((inst >> 5) & 0x7), toswiz((inst >> 8) & 0x7),
-	      (inst >> 11) & 0x3,
-	      (inst >> 13) & 0x3, toswiz((inst >> 15) & 0x7), toswiz((inst >> 18) & 0x7), toswiz((inst >> 21) & 0x7),
-	      (inst >> 24) & 0x3);
-
-
-      fprintf(stderr,"\t4 ALPHA_INST:0x%08x:", code->inst[n].inst4);
-      inst = code->inst[n].inst4;
-      fprintf(stderr,"%s dest:%d%s alp_A_src:%d %s %d alp_B_src:%d %s %d w:%d\n", to_alpha_op(inst & 0xf),
-	      (inst >> 4) & 0x7f, inst & (1<<11) ? "(rel)":"",
-	      (inst >> 12) & 0x3, toswiz((inst >> 14) & 0x7), (inst >> 17) & 0x3,
-	      (inst >> 19) & 0x3, toswiz((inst >> 21) & 0x7), (inst >> 24) & 0x3,
-	      (inst >> 31) & 0x1);
-
-      fprintf(stderr,"\t5 RGBA_INST: 0x%08x:", code->inst[n].inst5);
-      inst = code->inst[n].inst5;
-      fprintf(stderr,"%s dest:%d%s rgb_C_src:%d %s/%s/%s %d alp_C_src:%d %s %d\n", toop(inst & 0xf),
-	      (inst >> 4) & 0x7f, inst & (1<<11) ? "(rel)":"",
-	      (inst >> 12) & 0x3, toswiz((inst >> 14) & 0x7), toswiz((inst >> 17) & 0x7), toswiz((inst >> 20) & 0x7),
-	      (inst >> 23) & 0x3,
-	      (inst >> 25) & 0x3, toswiz((inst >> 27) & 0x7), (inst >> 30) & 0x3);
-      break;
-    case 2:
-      break;
-    case 3:
-      inst = code->inst[n].inst1;
-      fprintf(stderr,"\t1:TEX_INST:  0x%08x: id: %d op:%s, %s, %s %s\n", inst, (inst >> 16) & 0xf,
-	      to_texop((inst >> 22) & 0x7), (inst & (1<<25)) ? "ACQ" : "",
-	      (inst & (1<<26)) ? "IGNUNC" : "", (inst & (1<<27)) ? "UNSCALED" : "SCALED");
-      inst = code->inst[n].inst2;
-      fprintf(stderr,"\t2:TEX_ADDR:  0x%08x: src: %d%s %s/%s/%s/%s dst: %d%s %s/%s/%s/%s\n", inst,
-	      inst & 127, inst & (1<<7) ? "(rel)" : "",
-	      toswiz((inst >> 8) & 0x3), toswiz((inst >> 10) & 0x3),
-	      toswiz((inst >> 12) & 0x3), toswiz((inst >> 14) & 0x3),
-	      (inst >> 16) & 127, inst & (1<<23) ? "(rel)" : "",
-	      toswiz((inst >> 24) & 0x3), toswiz((inst >> 26) & 0x3),
-	      toswiz((inst >> 28) & 0x3), toswiz((inst >> 30) & 0x3));
-
-      fprintf(stderr,"\t3:TEX_DXDY:  0x%08x\n", code->inst[n].inst3);
-      break;
-    }
-    fprintf(stderr,"\n");
-  }
-
-}
diff --git a/src/mesa/drivers/dri/r300/radeon_bo_legacy.c b/src/mesa/drivers/dri/r300/radeon_bo_legacy.c
new file mode 120000
index 0000000000..79ad050e6b
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_bo_legacy.c
@@ -0,0 +1 @@
+../radeon/radeon_bo_legacy.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_bo_legacy.h b/src/mesa/drivers/dri/r300/radeon_bo_legacy.h
new file mode 120000
index 0000000000..83b0f7ffab
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_bo_legacy.h
@@ -0,0 +1 @@
+../radeon/radeon_bo_legacy.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_bocs_wrapper.h b/src/mesa/drivers/dri/r300/radeon_bocs_wrapper.h
new file mode 120000
index 0000000000..ca894b2443
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_bocs_wrapper.h
@@ -0,0 +1 @@
+../radeon/radeon_bocs_wrapper.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_buffer_objects.c b/src/mesa/drivers/dri/r300/radeon_buffer_objects.c
new file mode 120000
index 0000000000..f6a5f66470
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_buffer_objects.c
@@ -0,0 +1 @@
+../radeon/radeon_buffer_objects.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_buffer_objects.h b/src/mesa/drivers/dri/r300/radeon_buffer_objects.h
new file mode 120000
index 0000000000..2f134fd17b
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_buffer_objects.h
@@ -0,0 +1 @@
+../radeon/radeon_buffer_objects.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_chipset.h b/src/mesa/drivers/dri/r300/radeon_chipset.h
new file mode 120000
index 0000000000..eba99001ff
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_chipset.h
@@ -0,0 +1 @@
+../radeon/radeon_chipset.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_cmdbuf.h b/src/mesa/drivers/dri/r300/radeon_cmdbuf.h
new file mode 120000
index 0000000000..a799e1dc6d
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_cmdbuf.h
@@ -0,0 +1 @@
+../radeon/radeon_cmdbuf.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_common.c b/src/mesa/drivers/dri/r300/radeon_common.c
new file mode 120000
index 0000000000..67b19ba940
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_common.c
@@ -0,0 +1 @@
+../radeon/radeon_common.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_common.h b/src/mesa/drivers/dri/r300/radeon_common.h
new file mode 120000
index 0000000000..5bcb696a9f
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_common.h
@@ -0,0 +1 @@
+../radeon/radeon_common.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_common_context.c b/src/mesa/drivers/dri/r300/radeon_common_context.c
new file mode 120000
index 0000000000..86800f3819
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_common_context.c
@@ -0,0 +1 @@
+../radeon/radeon_common_context.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_common_context.h b/src/mesa/drivers/dri/r300/radeon_common_context.h
new file mode 120000
index 0000000000..4d66312550
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_common_context.h
@@ -0,0 +1 @@
+../radeon/radeon_common_context.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_context.c b/src/mesa/drivers/dri/r300/radeon_context.c
deleted file mode 100644
index 5267fe9a77..0000000000
--- a/src/mesa/drivers/dri/r300/radeon_context.c
+++ /dev/null
@@ -1,330 +0,0 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/**
- * \file radeon_context.c
- * Common context initialization.
- *
- * \author Keith Whitwell <keith@tungstengraphics.com>
- */
-
-#include <dlfcn.h>
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/context.h"
-#include "main/state.h"
-#include "main/matrix.h"
-#include "main/framebuffer.h"
-
-#include "drivers/common/driverfuncs.h"
-#include "swrast/swrast.h"
-
-#include "radeon_screen.h"
-#include "radeon_ioctl.h"
-#include "radeon_macros.h"
-#include "radeon_reg.h"
-
-#include "radeon_state.h"
-#include "r300_state.h"
-
-#include "utils.h"
-#include "vblank.h"
-#include "xmlpool.h"		/* for symbolic values of enum-type options */
-
-#define DRIVER_DATE "20060815"
-
-
-/* Return various strings for glGetString().
- */
-static const GLubyte *radeonGetString(GLcontext * ctx, GLenum name)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-	static char buffer[128];
-
-	switch (name) {
-	case GL_VENDOR:
-		if (IS_R300_CLASS(radeon->radeonScreen))
-			return (GLubyte *) "DRI R300 Project";
-		else
-			return (GLubyte *) "Tungsten Graphics, Inc.";
-
-	case GL_RENDERER:
-	{
-		unsigned offset;
-		GLuint agp_mode = (radeon->radeonScreen->card_type==RADEON_CARD_PCI) ? 0 :
-			radeon->radeonScreen->AGPMode;
-		const char* chipname;
-
-		if (IS_R300_CLASS(radeon->radeonScreen))
-			chipname = "R300";
-		else
-			chipname = "R200";
-
-		offset = driGetRendererString(buffer, chipname, DRIVER_DATE,
-					      agp_mode);
-
-		if (IS_R300_CLASS(radeon->radeonScreen)) {
-		sprintf(&buffer[offset], " %sTCL",
-			(radeon->radeonScreen->chip_flags & RADEON_CHIPSET_TCL)
-			? "" : "NO-");
-		} else {
-			sprintf(&buffer[offset], " %sTCL",
-			!(radeon->TclFallback & RADEON_TCL_FALLBACK_TCL_DISABLE)
-			? "" : "NO-");
-		}
-
-		return (GLubyte *) buffer;
-	}
-
-	default:
-		return NULL;
-	}
-}
-
-/* Initialize the driver's misc functions.
- */
-static void radeonInitDriverFuncs(struct dd_function_table *functions)
-{
-	functions->GetString = radeonGetString;
-}
-
-
-/**
- * Create and initialize all common fields of the context,
- * including the Mesa context itself.
- */
-GLboolean radeonInitContext(radeonContextPtr radeon,
-			    struct dd_function_table* functions,
-			    const __GLcontextModes * glVisual,
-			    __DRIcontextPrivate * driContextPriv,
-			    void *sharedContextPrivate)
-{
-	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
-	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
-	GLcontext* ctx;
-	GLcontext* shareCtx;
-	int fthrottle_mode;
-
-	/* Fill in additional standard functions. */
-	radeonInitDriverFuncs(functions);
-
-	radeon->radeonScreen = screen;
-	/* Allocate and initialize the Mesa context */
-	if (sharedContextPrivate)
-		shareCtx = ((radeonContextPtr)sharedContextPrivate)->glCtx;
-	else
-		shareCtx = NULL;
-	radeon->glCtx = _mesa_create_context(glVisual, shareCtx,
-					    functions, (void *)radeon);
-	if (!radeon->glCtx)
-		return GL_FALSE;
-
-	ctx = radeon->glCtx;
-	driContextPriv->driverPrivate = radeon;
-
-	/* DRI fields */
-	radeon->dri.context = driContextPriv;
-	radeon->dri.screen = sPriv;
-	radeon->dri.drawable = NULL;
-	radeon->dri.readable = NULL;
-	radeon->dri.hwContext = driContextPriv->hHWContext;
-	radeon->dri.hwLock = &sPriv->pSAREA->lock;
-	radeon->dri.fd = sPriv->fd;
-	radeon->dri.drmMinor = sPriv->drm_version.minor;
-
-	radeon->sarea = (drm_radeon_sarea_t *) ((GLubyte *) sPriv->pSAREA +
-					       screen->sarea_priv_offset);
-
-	/* Setup IRQs */
-	fthrottle_mode = driQueryOptioni(&radeon->optionCache, "fthrottle_mode");
-	radeon->iw.irq_seq = -1;
-	radeon->irqsEmitted = 0;
-	radeon->do_irqs = (fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS &&
-			  radeon->radeonScreen->irq);
-
-	radeon->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
-
-	if (!radeon->do_irqs)
-		fprintf(stderr,
-			"IRQ's not enabled, falling back to %s: %d %d\n",
-			radeon->do_usleeps ? "usleeps" : "busy waits",
-			fthrottle_mode, radeon->radeonScreen->irq);
-
-	(*sPriv->systemTime->getUST) (&radeon->swap_ust);
-
-	return GL_TRUE;
-}
-
-
-/**
- * Cleanup common context fields.
- * Called by r200DestroyContext/r300DestroyContext
- */
-void radeonCleanupContext(radeonContextPtr radeon)
-{
-	/* _mesa_destroy_context() might result in calls to functions that
-	 * depend on the DriverCtx, so don't set it to NULL before.
-	 *
-	 * radeon->glCtx->DriverCtx = NULL;
-	 */
-
-	/* free the Mesa context */
-	_mesa_destroy_context(radeon->glCtx);
-
-	if (radeon->state.scissor.pClipRects) {
-		FREE(radeon->state.scissor.pClipRects);
-		radeon->state.scissor.pClipRects = 0;
-	}
-}
-
-
-/**
- * Swap front and back buffer.
- */
-void radeonSwapBuffers(__DRIdrawablePrivate * dPriv)
-{
-	if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-		radeonContextPtr radeon;
-		GLcontext *ctx;
-
-		radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-		ctx = radeon->glCtx;
-
-		if (ctx->Visual.doubleBufferMode) {
-			_mesa_notifySwapBuffers(ctx);	/* flush pending rendering comands */
-			if (radeon->doPageFlip) {
-				radeonPageFlip(dPriv);
-			} else {
-			    radeonCopyBuffer(dPriv, NULL);
-			}
-		}
-	} else {
-		/* XXX this shouldn't be an error but we can't handle it for now */
-		_mesa_problem(NULL, "%s: drawable has no context!",
-			      __FUNCTION__);
-	}
-}
-
-void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
-			 int x, int y, int w, int h )
-{
-    if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-	radeonContextPtr radeon;
-	GLcontext *ctx;
-
-	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-	ctx = radeon->glCtx;
-
-	if (ctx->Visual.doubleBufferMode) {
-	    drm_clip_rect_t rect;
-	    rect.x1 = x + dPriv->x;
-	    rect.y1 = (dPriv->h - y - h) + dPriv->y;
-	    rect.x2 = rect.x1 + w;
-	    rect.y2 = rect.y1 + h;
-	    _mesa_notifySwapBuffers(ctx);	/* flush pending rendering comands */
-	    radeonCopyBuffer(dPriv, &rect);
-	}
-    } else {
-	/* XXX this shouldn't be an error but we can't handle it for now */
-	_mesa_problem(NULL, "%s: drawable has no context!",
-		      __FUNCTION__);
-    }
-}
-
-/* Force the context `c' to be the current context and associate with it
- * buffer `b'.
- */
-GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
-			    __DRIdrawablePrivate * driDrawPriv,
-			    __DRIdrawablePrivate * driReadPriv)
-{
-	if (driContextPriv) {
-		radeonContextPtr radeon =
-			(radeonContextPtr) driContextPriv->driverPrivate;
-
-		if (RADEON_DEBUG & DEBUG_DRI)
-			fprintf(stderr, "%s ctx %p\n", __FUNCTION__,
-				radeon->glCtx);
-
-		if (radeon->dri.drawable != driDrawPriv) {
-			if (driDrawPriv->swap_interval == (unsigned)-1) {
-				driDrawPriv->vblFlags =
-					(radeon->radeonScreen->irq != 0)
-					? driGetDefaultVBlankFlags(&radeon->
-								   optionCache)
-					: VBLANK_FLAG_NO_IRQ;
-
-				driDrawableInitVBlank(driDrawPriv);
-			}
-		}
-
-		radeon->dri.readable = driReadPriv;
-
-		if (radeon->dri.drawable != driDrawPriv ||
-		    radeon->lastStamp != driDrawPriv->lastStamp) {
-			radeon->dri.drawable = driDrawPriv;
-
-			radeonSetCliprects(radeon);
-			r300UpdateViewportOffset(radeon->glCtx);
-		}
-
-		_mesa_make_current(radeon->glCtx,
-				    (GLframebuffer *) driDrawPriv->
-				    driverPrivate,
-				    (GLframebuffer *) driReadPriv->
-				    driverPrivate);
-
-		_mesa_update_state(radeon->glCtx);		
-
-		radeonUpdatePageFlipping(radeon);
-	} else {
-		if (RADEON_DEBUG & DEBUG_DRI)
-			fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
-		_mesa_make_current(0, 0, 0);
-	}
-
-	if (RADEON_DEBUG & DEBUG_DRI)
-		fprintf(stderr, "End %s\n", __FUNCTION__);
-	return GL_TRUE;
-}
-
-/* Force the context `c' to be unbound from its buffer.
- */
-GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv)
-{
-	radeonContextPtr radeon = (radeonContextPtr) driContextPriv->driverPrivate;
-
-	if (RADEON_DEBUG & DEBUG_DRI)
-		fprintf(stderr, "%s ctx %p\n", __FUNCTION__,
-			radeon->glCtx);
-
-	return GL_TRUE;
-}
-
diff --git a/src/mesa/drivers/dri/r300/radeon_context.h b/src/mesa/drivers/dri/r300/radeon_context.h
index 47cbc22a72..250570f6b8 100644
--- a/src/mesa/drivers/dri/r300/radeon_context.h
+++ b/src/mesa/drivers/dri/r300/radeon_context.h
@@ -49,20 +49,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "drm.h"
 #include "dri_util.h"
 
-struct radeon_context;
-typedef struct radeon_context radeonContextRec;
-typedef struct radeon_context *radeonContextPtr;
-
-/* Rasterizing fallbacks */
-/* See correponding strings in r200_swtcl.c */
-#define RADEON_FALLBACK_TEXTURE		0x0001
-#define RADEON_FALLBACK_DRAW_BUFFER	0x0002
-#define RADEON_FALLBACK_STENCIL		0x0004
-#define RADEON_FALLBACK_RENDER_MODE	0x0008
-#define RADEON_FALLBACK_BLEND_EQ	0x0010
-#define RADEON_FALLBACK_BLEND_FUNC	0x0020
-#define RADEON_FALLBACK_DISABLE		0x0040
-#define RADEON_FALLBACK_BORDER_MODE	0x0080
+#include "radeon_screen.h"
 
 #if R200_MERGED
 extern void radeonFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
@@ -79,155 +66,11 @@ extern void radeonFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
 /* TCL fallbacks */
 extern void radeonTclFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
 
-#define RADEON_TCL_FALLBACK_RASTER		0x0001	/* rasterization */
-#define RADEON_TCL_FALLBACK_UNFILLED		0x0002	/* unfilled tris */
-#define RADEON_TCL_FALLBACK_LIGHT_TWOSIDE	0x0004	/* twoside tris */
-#define RADEON_TCL_FALLBACK_MATERIAL		0x0008	/* material in vb */
-#define RADEON_TCL_FALLBACK_TEXGEN_0		0x0010	/* texgen, unit 0 */
-#define RADEON_TCL_FALLBACK_TEXGEN_1		0x0020	/* texgen, unit 1 */
-#define RADEON_TCL_FALLBACK_TEXGEN_2		0x0040	/* texgen, unit 2 */
-#define RADEON_TCL_FALLBACK_TEXGEN_3		0x0080	/* texgen, unit 3 */
-#define RADEON_TCL_FALLBACK_TEXGEN_4		0x0100	/* texgen, unit 4 */
-#define RADEON_TCL_FALLBACK_TEXGEN_5		0x0200	/* texgen, unit 5 */
-#define RADEON_TCL_FALLBACK_TCL_DISABLE		0x0400	/* user disable */
-#define RADEON_TCL_FALLBACK_BITMAP		0x0800	/* draw bitmap with points */
-#define RADEON_TCL_FALLBACK_VERTEX_PROGRAM	0x1000	/* vertex program active */
-
 #if R200_MERGED
 #define TCL_FALLBACK( ctx, bit, mode )	radeonTclFallback( ctx, bit, mode )
 #else
 #define TCL_FALLBACK( ctx, bit, mode )	;
 #endif
 
-struct radeon_dri_mirror {
-	__DRIcontextPrivate *context;	/* DRI context */
-	__DRIscreenPrivate *screen;	/* DRI screen */
-	/**
-	 * DRI drawable bound to this context for drawing.
-	 */
-	__DRIdrawablePrivate *drawable;
-
-	/**
-	 * DRI drawable bound to this context for reading.
-	 */
-	__DRIdrawablePrivate *readable;
-
-	drm_context_t hwContext;
-	drm_hw_lock_t *hwLock;
-	int fd;
-	int drmMinor;
-};
-
-/**
- * Derived state for internal purposes.
- */
-struct radeon_scissor_state {
-	drm_clip_rect_t rect;
-	GLboolean enabled;
-
-	GLuint numClipRects;	/* Cliprects active */
-	GLuint numAllocedClipRects;	/* Cliprects available */
-	drm_clip_rect_t *pClipRects;
-};
-
-struct radeon_colorbuffer_state {
-	GLuint clear;
-	GLint drawOffset, drawPitch;
-};
-
-struct radeon_state {
-	struct radeon_colorbuffer_state color;
-	struct radeon_scissor_state scissor;
-};
-
-/**
- * Common per-context variables shared by R200 and R300.
- * R200- and R300-specific code "derive" their own context from this
- * structure.
- */
-struct radeon_context {
-	GLcontext *glCtx;	/* Mesa context */
-	radeonScreenPtr radeonScreen;	/* Screen private DRI data */
-
-	/* Fallback state */
-	GLuint Fallback;
-	GLuint TclFallback;
-
-	/* Page flipping */
-	GLuint doPageFlip;
-
-	/* Drawable, cliprect and scissor information */
-	GLuint numClipRects;	/* Cliprects for the draw buffer */
-	drm_clip_rect_t *pClipRects;
-	unsigned int lastStamp;
-	GLboolean lost_context;
-	drm_radeon_sarea_t *sarea;	/* Private SAREA data */
-
-	/* Mirrors of some DRI state */
-	struct radeon_dri_mirror dri;
-
-	/* Busy waiting */
-	GLuint do_usleeps;
-	GLuint do_irqs;
-	GLuint irqsEmitted;
-	drm_radeon_irq_wait_t iw;
-
-	/* buffer swap */
-	int64_t swap_ust;
-	int64_t swap_missed_ust;
-
-	GLuint swap_count;
-	GLuint swap_missed_count;
-
-	/* Derived state */
-	struct radeon_state state;
-
-	/* Configuration cache
-	 */
-	driOptionCache optionCache;
-};
-
-#define RADEON_CONTEXT(glctx) ((radeonContextPtr)(ctx->DriverCtx))
-
-extern void radeonSwapBuffers(__DRIdrawablePrivate * dPriv);
-extern void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
-				int x, int y, int w, int h);
-extern GLboolean radeonInitContext(radeonContextPtr radeon,
-				   struct dd_function_table *functions,
-				   const __GLcontextModes * glVisual,
-				   __DRIcontextPrivate * driContextPriv,
-				   void *sharedContextPrivate);
-extern void radeonCleanupContext(radeonContextPtr radeon);
-extern GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
-				   __DRIdrawablePrivate * driDrawPriv,
-				   __DRIdrawablePrivate * driReadPriv);
-extern GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv);
-
-/* ================================================================
- * Debugging:
- */
-#define DO_DEBUG		1
-
-#if DO_DEBUG
-extern int RADEON_DEBUG;
-#else
-#define RADEON_DEBUG		0
-#endif
-
-#define DEBUG_TEXTURE	0x0001
-#define DEBUG_STATE	0x0002
-#define DEBUG_IOCTL	0x0004
-#define DEBUG_PRIMS	0x0008
-#define DEBUG_VERTS	0x0010
-#define DEBUG_FALLBACKS	0x0020
-#define DEBUG_VFMT	0x0040
-#define DEBUG_CODEGEN	0x0080
-#define DEBUG_VERBOSE	0x0100
-#define DEBUG_DRI       0x0200
-#define DEBUG_DMA       0x0400
-#define DEBUG_SANITY    0x0800
-#define DEBUG_SYNC      0x1000
-#define DEBUG_PIXEL     0x2000
-#define DEBUG_MEMORY    0x4000
 
 #endif				/* __RADEON_CONTEXT_H__ */
diff --git a/src/mesa/drivers/dri/r300/radeon_cs_legacy.c b/src/mesa/drivers/dri/r300/radeon_cs_legacy.c
new file mode 120000
index 0000000000..006720f8a4
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_cs_legacy.c
@@ -0,0 +1 @@
+../radeon/radeon_cs_legacy.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_cs_legacy.h b/src/mesa/drivers/dri/r300/radeon_cs_legacy.h
new file mode 120000
index 0000000000..a5f95e0a3d
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_cs_legacy.h
@@ -0,0 +1 @@
+../radeon/radeon_cs_legacy.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_cs_space_drm.c b/src/mesa/drivers/dri/r300/radeon_cs_space_drm.c
new file mode 120000
index 0000000000..c248ea7d1a
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_cs_space_drm.c
@@ -0,0 +1 @@
+../radeon/radeon_cs_space_drm.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_debug.c b/src/mesa/drivers/dri/r300/radeon_debug.c
new file mode 120000
index 0000000000..c98c2e074c
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_debug.c
@@ -0,0 +1 @@
+../radeon/radeon_debug.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_debug.h b/src/mesa/drivers/dri/r300/radeon_debug.h
new file mode 120000
index 0000000000..bd8aa28e89
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_debug.h
@@ -0,0 +1 @@
+../radeon/radeon_debug.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_dma.c b/src/mesa/drivers/dri/r300/radeon_dma.c
new file mode 120000
index 0000000000..43be000625
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_dma.c
@@ -0,0 +1 @@
+../radeon/radeon_dma.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_dma.h b/src/mesa/drivers/dri/r300/radeon_dma.h
new file mode 120000
index 0000000000..82e50634e3
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_dma.h
@@ -0,0 +1 @@
+../radeon/radeon_dma.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_fbo.c b/src/mesa/drivers/dri/r300/radeon_fbo.c
new file mode 120000
index 0000000000..0d738d8d78
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_fbo.c
@@ -0,0 +1 @@
+../radeon/radeon_fbo.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_ioctl.c b/src/mesa/drivers/dri/r300/radeon_ioctl.c
deleted file mode 100644
index 36502eb42d..0000000000
--- a/src/mesa/drivers/dri/r300/radeon_ioctl.c
+++ /dev/null
@@ -1,396 +0,0 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell <keith@tungstengraphics.com>
- */
-
-#include <sched.h>
-#include <errno.h>
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/macros.h"
-#include "main/context.h"
-#include "swrast/swrast.h"
-#include "r300_context.h"
-#include "radeon_ioctl.h"
-#include "r300_ioctl.h"
-#include "r300_state.h"
-#include "radeon_reg.h"
-
-#include "drirenderbuffer.h"
-#include "vblank.h"
-
-static void radeonWaitForIdle(radeonContextPtr radeon);
-
-/* ================================================================
- * SwapBuffers with client-side throttling
- */
-
-static uint32_t radeonGetLastFrame(radeonContextPtr radeon)
-{
-	drm_radeon_getparam_t gp;
-	int ret;
-	uint32_t frame;
-
-	gp.param = RADEON_PARAM_LAST_FRAME;
-	gp.value = (int *)&frame;
-	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
-				  &gp, sizeof(gp));
-	if (ret) {
-		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__,
-			ret);
-		exit(1);
-	}
-
-	return frame;
-}
-
-uint32_t radeonGetAge(radeonContextPtr radeon)
-{
-	drm_radeon_getparam_t gp;
-	int ret;
-	uint32_t age;
-
-	gp.param = RADEON_PARAM_LAST_CLEAR;
-	gp.value = (int *)&age;
-	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
-				  &gp, sizeof(gp));
-	if (ret) {
-		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__,
-			ret);
-		exit(1);
-	}
-
-	return age;
-}
-
-static void radeonEmitIrqLocked(radeonContextPtr radeon)
-{
-	drm_radeon_irq_emit_t ie;
-	int ret;
-
-	ie.irq_seq = &radeon->iw.irq_seq;
-	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_IRQ_EMIT,
-				  &ie, sizeof(ie));
-	if (ret) {
-		fprintf(stderr, "%s: drmRadeonIrqEmit: %d\n", __FUNCTION__,
-			ret);
-		exit(1);
-	}
-}
-
-static void radeonWaitIrq(radeonContextPtr radeon)
-{
-	int ret;
-
-	do {
-		ret = drmCommandWrite(radeon->dri.fd, DRM_RADEON_IRQ_WAIT,
-				      &radeon->iw, sizeof(radeon->iw));
-	} while (ret && (errno == EINTR || errno == EBUSY));
-
-	if (ret) {
-		fprintf(stderr, "%s: drmRadeonIrqWait: %d\n", __FUNCTION__,
-			ret);
-		exit(1);
-	}
-}
-
-static void radeonWaitForFrameCompletion(radeonContextPtr radeon)
-{
-	drm_radeon_sarea_t *sarea = radeon->sarea;
-
-	if (radeon->do_irqs) {
-		if (radeonGetLastFrame(radeon) < sarea->last_frame) {
-			if (!radeon->irqsEmitted) {
-				while (radeonGetLastFrame(radeon) <
-				       sarea->last_frame) ;
-			} else {
-				UNLOCK_HARDWARE(radeon);
-				radeonWaitIrq(radeon);
-				LOCK_HARDWARE(radeon);
-			}
-			radeon->irqsEmitted = 10;
-		}
-
-		if (radeon->irqsEmitted) {
-			radeonEmitIrqLocked(radeon);
-			radeon->irqsEmitted--;
-		}
-	} else {
-		while (radeonGetLastFrame(radeon) < sarea->last_frame) {
-			UNLOCK_HARDWARE(radeon);
-			if (radeon->do_usleeps)
-				DO_USLEEP(1);
-			LOCK_HARDWARE(radeon);
-		}
-	}
-}
-
-/* Copy the back color buffer to the front color buffer.
- */
-void radeonCopyBuffer(__DRIdrawablePrivate * dPriv,
-		      const drm_clip_rect_t	 * rect)
-{
-	radeonContextPtr radeon;
-	GLint nbox, i, ret;
-	GLboolean missed_target;
-	int64_t ust;
-	__DRIscreenPrivate *psp = dPriv->driScreenPriv;
-
-	assert(dPriv);
-	assert(dPriv->driContextPriv);
-	assert(dPriv->driContextPriv->driverPrivate);
-
-	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-
-	if (RADEON_DEBUG & DEBUG_IOCTL) {
-		fprintf(stderr, "\n%s( %p )\n\n", __FUNCTION__,
-			(void *)radeon->glCtx);
-	}
-
-	r300Flush(radeon->glCtx);
-
-	LOCK_HARDWARE(radeon);
-
-	/* Throttle the frame rate -- only allow one pending swap buffers
-	 * request at a time.
-	 */
-	radeonWaitForFrameCompletion(radeon);
-	if (!rect)
-	{
-	    UNLOCK_HARDWARE(radeon);
-	    driWaitForVBlank(dPriv, &missed_target);
-	    LOCK_HARDWARE(radeon);
-	}
-
-	nbox = dPriv->numClipRects;	/* must be in locked region */
-
-	for (i = 0; i < nbox;) {
-		GLint nr = MIN2(i + RADEON_NR_SAREA_CLIPRECTS, nbox);
-		drm_clip_rect_t *box = dPriv->pClipRects;
-		drm_clip_rect_t *b = radeon->sarea->boxes;
-		GLint n = 0;
-
-		for ( ; i < nr ; i++ ) {
-
-		    *b = box[i];
-
-		    if (rect)
-		    {
-			if (rect->x1 > b->x1)
-			    b->x1 = rect->x1;
-			if (rect->y1 > b->y1)
-			    b->y1 = rect->y1;
-			if (rect->x2 < b->x2)
-			    b->x2 = rect->x2;
-			if (rect->y2 < b->y2)
-			    b->y2 = rect->y2;
-
-			if (b->x1 >= b->x2 || b->y1 >= b->y2)
-			    continue;
-		    }
-
-		    b++;
-		    n++;
-		}
-		radeon->sarea->nbox = n;
-
-		if (!n)
-		   continue;
-
-		ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_SWAP);
-
-		if (ret) {
-			fprintf(stderr, "DRM_RADEON_SWAP: return = %d\n",
-				ret);
-			UNLOCK_HARDWARE(radeon);
-			exit(1);
-		}
-	}
-
-	UNLOCK_HARDWARE(radeon);
-	if (!rect)
-	{
-	    ((r300ContextPtr)radeon)->hw.all_dirty = GL_TRUE;
-
-	    radeon->swap_count++;
-	    (*psp->systemTime->getUST) (&ust);
-	    if (missed_target) {
-		radeon->swap_missed_count++;
-		radeon->swap_missed_ust = ust - radeon->swap_ust;
-	    }
-
-	    radeon->swap_ust = ust;
-
-	    sched_yield();
-	}
-}
-
-void radeonPageFlip(__DRIdrawablePrivate * dPriv)
-{
-	radeonContextPtr radeon;
-	GLint ret;
-	GLboolean missed_target;
-	__DRIscreenPrivate *psp = dPriv->driScreenPriv;
-
-	assert(dPriv);
-	assert(dPriv->driContextPriv);
-	assert(dPriv->driContextPriv->driverPrivate);
-
-	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-
-	if (RADEON_DEBUG & DEBUG_IOCTL) {
-		fprintf(stderr, "%s: pfCurrentPage: %d\n", __FUNCTION__,
-			radeon->sarea->pfCurrentPage);
-	}
-
-	r300Flush(radeon->glCtx);
-	LOCK_HARDWARE(radeon);
-
-	if (!dPriv->numClipRects) {
-		UNLOCK_HARDWARE(radeon);
-		usleep(10000);	/* throttle invisible client 10ms */
-		return;
-	}
-
-	/* Need to do this for the perf box placement:
-	 */
-	{
-		drm_clip_rect_t *box = dPriv->pClipRects;
-		drm_clip_rect_t *b = radeon->sarea->boxes;
-		b[0] = box[0];
-		radeon->sarea->nbox = 1;
-	}
-
-	/* Throttle the frame rate -- only allow a few pending swap buffers
-	 * request at a time.
-	 */
-	radeonWaitForFrameCompletion(radeon);
-	UNLOCK_HARDWARE(radeon);
-	driWaitForVBlank(dPriv, &missed_target);
-	if (missed_target) {
-		radeon->swap_missed_count++;
-		(void)(*psp->systemTime->getUST) (&radeon->swap_missed_ust);
-	}
-	LOCK_HARDWARE(radeon);
-
-	ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_FLIP);
-
-	UNLOCK_HARDWARE(radeon);
-
-	if (ret) {
-		fprintf(stderr, "DRM_RADEON_FLIP: return = %d\n", ret);
-		exit(1);
-	}
-
-	radeon->swap_count++;
-	(void)(*psp->systemTime->getUST) (&radeon->swap_ust);
-
-        driFlipRenderbuffers(radeon->glCtx->WinSysDrawBuffer, 
-                             radeon->sarea->pfCurrentPage);
-
-	if (radeon->sarea->pfCurrentPage == 1) {
-		radeon->state.color.drawOffset = radeon->radeonScreen->frontOffset;
-		radeon->state.color.drawPitch = radeon->radeonScreen->frontPitch;
-	} else {
-		radeon->state.color.drawOffset = radeon->radeonScreen->backOffset;
-		radeon->state.color.drawPitch = radeon->radeonScreen->backPitch;
-	}
-
-	if (IS_R300_CLASS(radeon->radeonScreen)) {
-		r300ContextPtr r300 = (r300ContextPtr)radeon;
-		R300_STATECHANGE(r300, cb);
-		r300->hw.cb.cmd[R300_CB_OFFSET] = r300->radeon.state.color.drawOffset + 
-						r300->radeon.radeonScreen->fbLocation;
-		r300->hw.cb.cmd[R300_CB_PITCH] = r300->radeon.state.color.drawPitch;
-		
-		if (r300->radeon.radeonScreen->cpp == 4)
-			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
-		else
-			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
-	
-		if (r300->radeon.sarea->tiling_enabled)
-			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
-	}
-}
-
-void radeonWaitForIdleLocked(radeonContextPtr radeon)
-{
-	int ret;
-	int i = 0;
-
-	do {
-		ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_CP_IDLE);
-		if (ret)
-			DO_USLEEP(1);
-	} while (ret && ++i < 100);
-
-	if (ret < 0) {
-		UNLOCK_HARDWARE(radeon);
-		fprintf(stderr, "Error: R300 timed out... exiting\n");
-		exit(-1);
-	}
-}
-
-static void radeonWaitForIdle(radeonContextPtr radeon)
-{
-	LOCK_HARDWARE(radeon);
-	radeonWaitForIdleLocked(radeon);
-	UNLOCK_HARDWARE(radeon);
-}
-
-void radeonFlush(GLcontext * ctx)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-
-	if (IS_R300_CLASS(radeon->radeonScreen))
-		r300Flush(ctx);
-}
-
-
-/* Make sure all commands have been sent to the hardware and have
- * completed processing.
- */
-void radeonFinish(GLcontext * ctx)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-
-	radeonFlush(ctx);
-
-	if (radeon->do_irqs) {
-		LOCK_HARDWARE(radeon);
-		radeonEmitIrqLocked(radeon);
-		UNLOCK_HARDWARE(radeon);
-		radeonWaitIrq(radeon);
-	} else
-		radeonWaitForIdle(radeon);
-}
diff --git a/src/mesa/drivers/dri/r300/radeon_lock.c b/src/mesa/drivers/dri/r300/radeon_lock.c
index 4f47afd5dc..af4108a8e3 100644..120000
--- a/src/mesa/drivers/dri/r300/radeon_lock.c
+++ b/src/mesa/drivers/dri/r300/radeon_lock.c
@@ -1,137 +1 @@
-/**************************************************************************
-
-Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
-                     VA Linux Systems Inc., Fremont, California.
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-All Rights Reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Gareth Hughes <gareth@valinux.com>
- *   Keith Whitwell <keith@tungstengraphics.com>
- *   Kevin E. Martin <martin@valinux.com>
- */
-
-#include "radeon_lock.h"
-#include "radeon_ioctl.h"
-#include "radeon_state.h"
-#include "r300_context.h"
-#include "r300_state.h"
-
-#include "main/framebuffer.h"
-
-#include "drirenderbuffer.h"
-
-#if DEBUG_LOCKING
-char *prevLockFile = NULL;
-int prevLockLine = 0;
-#endif
-
-/* Turn on/off page flipping according to the flags in the sarea:
- */
-void radeonUpdatePageFlipping(radeonContextPtr rmesa)
-{
-	int use_back;
-
-	rmesa->doPageFlip = rmesa->sarea->pfState;
-	if (rmesa->glCtx->WinSysDrawBuffer) {
-		driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
-				     rmesa->sarea->pfCurrentPage);
-		r300UpdateDrawBuffer(rmesa->glCtx);
-	}
-
-	use_back = rmesa->glCtx->DrawBuffer ?
-	    (rmesa->glCtx->DrawBuffer->_ColorDrawBufferIndexes[0] ==
-	     BUFFER_BACK_LEFT) : 1;
-	use_back ^= (rmesa->sarea->pfCurrentPage == 1);
-
-	if (use_back) {
-		rmesa->state.color.drawOffset =
-		    rmesa->radeonScreen->backOffset;
-		rmesa->state.color.drawPitch = rmesa->radeonScreen->backPitch;
-	} else {
-		rmesa->state.color.drawOffset =
-		    rmesa->radeonScreen->frontOffset;
-		rmesa->state.color.drawPitch =
-		    rmesa->radeonScreen->frontPitch;
-	}
-}
-
-/* Update the hardware state.  This is called if another context has
- * grabbed the hardware lock, which includes the X server.  This
- * function also updates the driver's window state after the X server
- * moves, resizes or restacks a window -- the change will be reflected
- * in the drawable position and clip rects.  Since the X server grabs
- * the hardware lock when it changes the window state, this routine will
- * automatically be called after such a change.
- */
-void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
-{
-	__DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
-	__DRIdrawablePrivate *const readable = rmesa->dri.readable;
-	__DRIscreenPrivate *sPriv = rmesa->dri.screen;
-	drm_radeon_sarea_t *sarea = rmesa->sarea;
-	r300ContextPtr r300 = (r300ContextPtr) rmesa;
-
-	assert(drawable != NULL);
-
-	drmGetLock(rmesa->dri.fd, rmesa->dri.hwContext, flags);
-
-	/* The window might have moved, so we might need to get new clip
-	 * rects.
-	 *
-	 * NOTE: This releases and regrabs the hw lock to allow the X server
-	 * to respond to the DRI protocol request for new drawable info.
-	 * Since the hardware state depends on having the latest drawable
-	 * clip rects, all state checking must be done _after_ this call.
-	 */
-	DRI_VALIDATE_DRAWABLE_INFO(sPriv, drawable);
-	if (drawable != readable) {
-		DRI_VALIDATE_DRAWABLE_INFO(sPriv, readable);
-	}
-
-	if (rmesa->lastStamp != drawable->lastStamp) {
-		radeonUpdatePageFlipping(rmesa);
-		radeonSetCliprects(rmesa);
-		r300UpdateViewportOffset(rmesa->glCtx);
-		driUpdateFramebufferSize(rmesa->glCtx, drawable);
-	}
-
-	if (sarea->ctx_owner != rmesa->dri.hwContext) {
-		int i;
-
-		sarea->ctx_owner = rmesa->dri.hwContext;
-		for (i = 0; i < r300->nr_heaps; i++) {
-			DRI_AGE_TEXTURES(r300->texture_heaps[i]);
-		}
-	}
-
-	rmesa->lost_context = GL_TRUE;
-}
+../radeon/radeon_lock.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_lock.h b/src/mesa/drivers/dri/r300/radeon_lock.h
index a344837f47..64bdf94ee7 100644..120000
--- a/src/mesa/drivers/dri/r300/radeon_lock.h
+++ b/src/mesa/drivers/dri/r300/radeon_lock.h
@@ -1,115 +1 @@
-/**************************************************************************
-
-Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
-                     VA Linux Systems Inc., Fremont, California.
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-All Rights Reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Gareth Hughes <gareth@valinux.com>
- *   Keith Whitwell <keith@tungstengraphics.com>
- *   Kevin E. Martin <martin@valinux.com>
- */
-
-#ifndef __RADEON_LOCK_H__
-#define __RADEON_LOCK_H__
-
-#include "radeon_context.h"
-
-extern void radeonGetLock(radeonContextPtr rmesa, GLuint flags);
-extern void radeonUpdatePageFlipping(radeonContextPtr rmesa);
-
-/* Turn DEBUG_LOCKING on to find locking conflicts.
- */
-#define DEBUG_LOCKING	0
-
-#if DEBUG_LOCKING
-extern char *prevLockFile;
-extern int prevLockLine;
-
-#define DEBUG_LOCK()							\
-   do {									\
-      prevLockFile = (__FILE__);					\
-      prevLockLine = (__LINE__);					\
-   } while (0)
-
-#define DEBUG_RESET()							\
-   do {									\
-      prevLockFile = 0;							\
-      prevLockLine = 0;							\
-   } while (0)
-
-#define DEBUG_CHECK_LOCK()						\
-   do {									\
-      if (prevLockFile) {						\
-	 fprintf(stderr,						\
-		  "LOCK SET!\n\tPrevious %s:%d\n\tCurrent: %s:%d\n",	\
-		  prevLockFile, prevLockLine, __FILE__, __LINE__);	\
-	 exit(1);							\
-      }									\
-   } while (0)
-
-#else
-
-#define DEBUG_LOCK()
-#define DEBUG_RESET()
-#define DEBUG_CHECK_LOCK()
-
-#endif
-
-/*
- * !!! We may want to separate locks from locks with validation.  This
- * could be used to improve performance for those things commands that
- * do not do any drawing !!!
- */
-
-/* Lock the hardware and validate our state.
- */
-#define LOCK_HARDWARE( rmesa )						\
-	do {								\
-		char __ret = 0;						\
-		DEBUG_CHECK_LOCK();					\
-		DRM_CAS((rmesa)->dri.hwLock, (rmesa)->dri.hwContext,	\
-			(DRM_LOCK_HELD | (rmesa)->dri.hwContext), __ret); \
-		if (__ret)						\
-			radeonGetLock((rmesa), 0);			\
-		DEBUG_LOCK();						\
-	} while (0)
-
-#define UNLOCK_HARDWARE( rmesa )					\
-	do {								\
-		DRM_UNLOCK((rmesa)->dri.fd,				\
-			(rmesa)->dri.hwLock,				\
-			(rmesa)->dri.hwContext);			\
-		DEBUG_RESET();						\
-	} while (0)
-
-#endif				/* __RADEON_LOCK_H__ */
+../radeon/radeon_lock.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_mipmap_tree.c b/src/mesa/drivers/dri/r300/radeon_mipmap_tree.c
new file mode 120000
index 0000000000..31c0cfbe94
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_mipmap_tree.c
@@ -0,0 +1 @@
+../radeon/radeon_mipmap_tree.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_mipmap_tree.h b/src/mesa/drivers/dri/r300/radeon_mipmap_tree.h
new file mode 120000
index 0000000000..254d50cf8c
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_mipmap_tree.h
@@ -0,0 +1 @@
+../radeon/radeon_mipmap_tree.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_program.c b/src/mesa/drivers/dri/r300/radeon_program.c
deleted file mode 100644
index da5e7aefce..0000000000
--- a/src/mesa/drivers/dri/r300/radeon_program.c
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (C) 2008 Nicolai Haehnle.
- *
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial
- * portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-#include "radeon_program.h"
-
-#include "shader/prog_print.h"
-
-
-/**
- * Transform the given clause in the following way:
- *  1. Replace it with an empty clause
- *  2. For every instruction in the original clause, try the given
- *     transformations in order.
- *  3. If one of the transformations returns GL_TRUE, assume that it
- *     has emitted the appropriate instruction(s) into the new clause;
- *     otherwise, copy the instruction verbatim.
- *
- * \note The transformation is currently not recursive; in other words,
- * instructions emitted by transformations are not transformed.
- *
- * \note The transform is called 'local' because it can only look at
- * one instruction at a time.
- */
-void radeonLocalTransform(
-	GLcontext *Ctx,
-	struct gl_program *program,
-	int num_transformations,
-	struct radeon_program_transformation* transformations)
-{
-	struct radeon_transform_context ctx;
-	int ip;
-
-	ctx.Ctx = Ctx;
-	ctx.Program = program;
-	ctx.OldInstructions = program->Instructions;
-	ctx.OldNumInstructions = program->NumInstructions;
-
-	program->Instructions = 0;
-	program->NumInstructions = 0;
-
-	for(ip = 0; ip < ctx.OldNumInstructions; ++ip) {
-		struct prog_instruction *instr = ctx.OldInstructions + ip;
-		int i;
-
-		for(i = 0; i < num_transformations; ++i) {
-			struct radeon_program_transformation* t = transformations + i;
-
-			if (t->function(&ctx, instr, t->userData))
-				break;
-		}
-
-		if (i >= num_transformations) {
-			struct prog_instruction* dest = radeonAppendInstructions(program, 1);
-			_mesa_copy_instructions(dest, instr, 1);
-		}
-	}
-
-	_mesa_free_instructions(ctx.OldInstructions, ctx.OldNumInstructions);
-}
-
-
-static void scan_instructions(GLboolean* used, const struct prog_instruction* insts, GLuint count)
-{
-	GLuint i;
-	for (i = 0; i < count; i++) {
-		const struct prog_instruction *inst = insts + i;
-		const GLuint n = _mesa_num_inst_src_regs(inst->Opcode);
-		GLuint k;
-
-		for (k = 0; k < n; k++) {
-			if (inst->SrcReg[k].File == PROGRAM_TEMPORARY)
-				used[inst->SrcReg[k].Index] = GL_TRUE;
-		}
-	}
-}
-
-GLint radeonFindFreeTemporary(struct radeon_transform_context *t)
-{
-	GLboolean used[MAX_PROGRAM_TEMPS];
-	GLuint i;
-
-	_mesa_memset(used, 0, sizeof(used));
-	scan_instructions(used, t->Program->Instructions, t->Program->NumInstructions);
-	scan_instructions(used, t->OldInstructions, t->OldNumInstructions);
-
-	for (i = 0; i < MAX_PROGRAM_TEMPS; i++) {
-		if (!used[i])
-			return i;
-	}
-
-	return -1;
-}
-
-
-/**
- * Append the given number of instructions to the program and return a
- * pointer to the first new instruction.
- */
-struct prog_instruction *radeonAppendInstructions(struct gl_program *program, int count)
-{
-	int oldnum = program->NumInstructions;
-	_mesa_insert_instructions(program, oldnum, count);
-	return program->Instructions + oldnum;
-}
diff --git a/src/mesa/drivers/dri/r300/radeon_program_alu.c b/src/mesa/drivers/dri/r300/radeon_program_alu.c
deleted file mode 100644
index 1ef71e74dc..0000000000
--- a/src/mesa/drivers/dri/r300/radeon_program_alu.c
+++ /dev/null
@@ -1,658 +0,0 @@
-/*
- * Copyright (C) 2008 Nicolai Haehnle.
- *
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial
- * portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-/**
- * @file
- *
- * Shareable transformations that transform "special" ALU instructions
- * into ALU instructions that are supported by hardware.
- *
- */
-
-#include "radeon_program_alu.h"
-
-#include "shader/prog_parameter.h"
-
-
-static struct prog_instruction *emit1(struct gl_program* p,
-	gl_inst_opcode Opcode, GLuint Saturate, struct prog_dst_register DstReg,
-	struct prog_src_register SrcReg)
-{
-	struct prog_instruction *fpi = radeonAppendInstructions(p, 1);
-
-	fpi->Opcode = Opcode;
-	fpi->SaturateMode = Saturate;
-	fpi->DstReg = DstReg;
-	fpi->SrcReg[0] = SrcReg;
-	return fpi;
-}
-
-static struct prog_instruction *emit2(struct gl_program* p,
-	gl_inst_opcode Opcode, GLuint Saturate, struct prog_dst_register DstReg,
-	struct prog_src_register SrcReg0, struct prog_src_register SrcReg1)
-{
-	struct prog_instruction *fpi = radeonAppendInstructions(p, 1);
-
-	fpi->Opcode = Opcode;
-	fpi->SaturateMode = Saturate;
-	fpi->DstReg = DstReg;
-	fpi->SrcReg[0] = SrcReg0;
-	fpi->SrcReg[1] = SrcReg1;
-	return fpi;
-}
-
-static struct prog_instruction *emit3(struct gl_program* p,
-	gl_inst_opcode Opcode, GLuint Saturate, struct prog_dst_register DstReg,
-	struct prog_src_register SrcReg0, struct prog_src_register SrcReg1,
-	struct prog_src_register SrcReg2)
-{
-	struct prog_instruction *fpi = radeonAppendInstructions(p, 1);
-
-	fpi->Opcode = Opcode;
-	fpi->SaturateMode = Saturate;
-	fpi->DstReg = DstReg;
-	fpi->SrcReg[0] = SrcReg0;
-	fpi->SrcReg[1] = SrcReg1;
-	fpi->SrcReg[2] = SrcReg2;
-	return fpi;
-}
-
-static void set_swizzle(struct prog_src_register *SrcReg, int coordinate, int swz)
-{
-	SrcReg->Swizzle &= ~(7 << (3*coordinate));
-	SrcReg->Swizzle |= swz << (3*coordinate);
-}
-
-static void set_negate_base(struct prog_src_register *SrcReg, int coordinate, int negate)
-{
-	SrcReg->NegateBase &= ~(1 << coordinate);
-	SrcReg->NegateBase |= (negate << coordinate);
-}
-
-static struct prog_dst_register dstreg(int file, int index)
-{
-	struct prog_dst_register dst;
-	dst.File = file;
-	dst.Index = index;
-	dst.WriteMask = WRITEMASK_XYZW;
-	dst.CondMask = COND_TR;
-	dst.CondSwizzle = SWIZZLE_NOOP;
-	dst.CondSrc = 0;
-	dst.pad = 0;
-	return dst;
-}
-
-static struct prog_dst_register dstregtmpmask(int index, int mask)
-{
-	struct prog_dst_register dst;
-	dst.File = PROGRAM_TEMPORARY;
-	dst.Index = index;
-	dst.WriteMask = mask;
-	dst.CondMask = COND_TR;
-	dst.CondSwizzle = SWIZZLE_NOOP;
-	dst.CondSrc = 0;
-	dst.pad = 0;
-	return dst;
-}
-
-static const struct prog_src_register builtin_zero = {
-	.File = PROGRAM_BUILTIN,
-	.Index = 0,
-	.Swizzle = SWIZZLE_0000
-};
-static const struct prog_src_register builtin_one = {
-	.File = PROGRAM_BUILTIN,
-	.Index = 0,
-	.Swizzle = SWIZZLE_1111
-};
-static const struct prog_src_register srcreg_undefined = {
-	.File = PROGRAM_UNDEFINED,
-	.Index = 0,
-	.Swizzle = SWIZZLE_NOOP
-};
-
-static struct prog_src_register srcreg(int file, int index)
-{
-	struct prog_src_register src = srcreg_undefined;
-	src.File = file;
-	src.Index = index;
-	return src;
-}
-
-static struct prog_src_register srcregswz(int file, int index, int swz)
-{
-	struct prog_src_register src = srcreg_undefined;
-	src.File = file;
-	src.Index = index;
-	src.Swizzle = swz;
-	return src;
-}
-
-static struct prog_src_register absolute(struct prog_src_register reg)
-{
-	struct prog_src_register newreg = reg;
-	newreg.Abs = 1;
-	newreg.NegateBase = 0;
-	newreg.NegateAbs = 0;
-	return newreg;
-}
-
-static struct prog_src_register negate(struct prog_src_register reg)
-{
-	struct prog_src_register newreg = reg;
-	newreg.NegateAbs = !newreg.NegateAbs;
-	return newreg;
-}
-
-static struct prog_src_register swizzle(struct prog_src_register reg, GLuint x, GLuint y, GLuint z, GLuint w)
-{
-	struct prog_src_register swizzled = reg;
-	swizzled.Swizzle = MAKE_SWIZZLE4(
-		x >= 4 ? x : GET_SWZ(reg.Swizzle, x),
-		y >= 4 ? y : GET_SWZ(reg.Swizzle, y),
-		z >= 4 ? z : GET_SWZ(reg.Swizzle, z),
-		w >= 4 ? w : GET_SWZ(reg.Swizzle, w));
-	return swizzled;
-}
-
-static struct prog_src_register scalar(struct prog_src_register reg)
-{
-	return swizzle(reg, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X);
-}
-
-static void transform_ABS(struct radeon_transform_context* t,
-	struct prog_instruction* inst)
-{
-	struct prog_src_register src = inst->SrcReg[0];
-	src.Abs = 1;
-	src.NegateBase = 0;
-	src.NegateAbs = 0;
-	emit1(t->Program, OPCODE_MOV, inst->SaturateMode, inst->DstReg, src);
-}
-
-static void transform_DPH(struct radeon_transform_context* t,
-	struct prog_instruction* inst)
-{
-	struct prog_src_register src0 = inst->SrcReg[0];
-	if (src0.NegateAbs) {
-		if (src0.Abs) {
-			int tempreg = radeonFindFreeTemporary(t);
-			emit1(t->Program, OPCODE_MOV, 0, dstreg(PROGRAM_TEMPORARY, tempreg), src0);
-			src0 = srcreg(src0.File, src0.Index);
-		} else {
-			src0.NegateAbs = 0;
-			src0.NegateBase ^= NEGATE_XYZW;
-		}
-	}
-	set_swizzle(&src0, 3, SWIZZLE_ONE);
-	set_negate_base(&src0, 3, 0);
-	emit2(t->Program, OPCODE_DP4, inst->SaturateMode, inst->DstReg, src0, inst->SrcReg[1]);
-}
-
-/**
- * [1, src0.y*src1.y, src0.z, src1.w]
- * So basically MUL with lotsa swizzling.
- */
-static void transform_DST(struct radeon_transform_context* t,
-	struct prog_instruction* inst)
-{
-	emit2(t->Program, OPCODE_MUL, inst->SaturateMode, inst->DstReg,
-		swizzle(inst->SrcReg[0], SWIZZLE_ONE, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE),
-		swizzle(inst->SrcReg[1], SWIZZLE_ONE, SWIZZLE_Y, SWIZZLE_ONE, SWIZZLE_W));
-}
-
-static void transform_FLR(struct radeon_transform_context* t,
-	struct prog_instruction* inst)
-{
-	int tempreg = radeonFindFreeTemporary(t);
-	emit1(t->Program, OPCODE_FRC, 0, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0]);
-	emit2(t->Program, OPCODE_ADD, inst->SaturateMode, inst->DstReg,
-		inst->SrcReg[0], negate(srcreg(PROGRAM_TEMPORARY, tempreg)));
-}
-
-/**
- * Definition of LIT (from ARB_fragment_program):
- *
- *  tmp = VectorLoad(op0);
- *  if (tmp.x < 0) tmp.x = 0;
- *  if (tmp.y < 0) tmp.y = 0;
- *  if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
- *  else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
- *  result.x = 1.0;
- *  result.y = tmp.x;
- *  result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
- *  result.w = 1.0;
- *
- * The longest path of computation is the one leading to result.z,
- * consisting of 5 operations. This implementation of LIT takes
- * 5 slots, if the subsequent optimization passes are clever enough
- * to pair instructions correctly.
- */
-static void transform_LIT(struct radeon_transform_context* t,
-	struct prog_instruction* inst)
-{
-	static const GLfloat LitConst[4] = { -127.999999 };
-
-	GLuint constant;
-	GLuint constant_swizzle;
-	GLuint temp;
-	int needTemporary = 0;
-	struct prog_src_register srctemp;
-
-	constant = _mesa_add_unnamed_constant(t->Program->Parameters, LitConst, 1, &constant_swizzle);
-
-	if (inst->DstReg.WriteMask != WRITEMASK_XYZW) {
-		needTemporary = 1;
-	} else if (inst->DstReg.File != PROGRAM_TEMPORARY) {
-		// LIT is typically followed by DP3/DP4, so there's no point
-		// in creating special code for this case
-		needTemporary = 1;
-	}
-
-	if (needTemporary) {
-		temp = radeonFindFreeTemporary(t);
-	} else {
-		temp = inst->DstReg.Index;
-	}
-	srctemp = srcreg(PROGRAM_TEMPORARY, temp);
-
-	// tmp.x = max(0.0, Src.x);
-	// tmp.y = max(0.0, Src.y);
-	// tmp.w = clamp(Src.z, -128+eps, 128-eps);
-	emit2(t->Program, OPCODE_MAX, 0,
-		dstregtmpmask(temp, WRITEMASK_XYW),
-		inst->SrcReg[0],
-		swizzle(srcreg(PROGRAM_CONSTANT, constant),
-			SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, constant_swizzle&3));
-	emit2(t->Program, OPCODE_MIN, 0,
-		dstregtmpmask(temp, WRITEMASK_Z),
-		swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-		negate(srcregswz(PROGRAM_CONSTANT, constant, constant_swizzle)));
-
-	// tmp.w = Pow(tmp.y, tmp.w)
-	emit1(t->Program, OPCODE_LG2, 0,
-		dstregtmpmask(temp, WRITEMASK_W),
-		swizzle(srctemp, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y));
-	emit2(t->Program, OPCODE_MUL, 0,
-		dstregtmpmask(temp, WRITEMASK_W),
-		swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-		swizzle(srctemp, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z));
-	emit1(t->Program, OPCODE_EX2, 0,
-		dstregtmpmask(temp, WRITEMASK_W),
-		swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W));
-
-	// tmp.z = (tmp.x > 0) ? tmp.w : 0.0
-	emit3(t->Program, OPCODE_CMP, inst->SaturateMode,
-		dstregtmpmask(temp, WRITEMASK_Z),
-		negate(swizzle(srctemp, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)),
-		swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-		builtin_zero);
-
-	// tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0
-	emit1(t->Program, OPCODE_MOV, inst->SaturateMode,
-		dstregtmpmask(temp, WRITEMASK_XYW),
-		swizzle(srctemp, SWIZZLE_ONE, SWIZZLE_X, SWIZZLE_ONE, SWIZZLE_ONE));
-
-	if (needTemporary)
-		emit1(t->Program, OPCODE_MOV, 0, inst->DstReg, srctemp);
-}
-
-static void transform_LRP(struct radeon_transform_context* t,
-	struct prog_instruction* inst)
-{
-	int tempreg = radeonFindFreeTemporary(t);
-
-	emit2(t->Program, OPCODE_ADD, 0,
-		dstreg(PROGRAM_TEMPORARY, tempreg),
-		inst->SrcReg[1], negate(inst->SrcReg[2]));
-	emit3(t->Program, OPCODE_MAD, inst->SaturateMode,
-		inst->DstReg,
-		inst->SrcReg[0], srcreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[2]);
-}
-
-static void transform_POW(struct radeon_transform_context* t,
-	struct prog_instruction* inst)
-{
-	int tempreg = radeonFindFreeTemporary(t);
-	struct prog_dst_register tempdst = dstreg(PROGRAM_TEMPORARY, tempreg);
-	struct prog_src_register tempsrc = srcreg(PROGRAM_TEMPORARY, tempreg);
-	tempdst.WriteMask = WRITEMASK_W;
-	tempsrc.Swizzle = SWIZZLE_WWWW;
-
-	emit1(t->Program, OPCODE_LG2, 0, tempdst, scalar(inst->SrcReg[0]));
-	emit2(t->Program, OPCODE_MUL, 0, tempdst, tempsrc, scalar(inst->SrcReg[1]));
-	emit1(t->Program, OPCODE_EX2, inst->SaturateMode, inst->DstReg, tempsrc);
-}
-
-static void transform_RSQ(struct radeon_transform_context* t,
-	struct prog_instruction* inst)
-{
-	emit1(t->Program, OPCODE_RSQ, inst->SaturateMode, inst->DstReg, absolute(inst->SrcReg[0]));
-}
-
-static void transform_SGE(struct radeon_transform_context* t,
-	struct prog_instruction* inst)
-{
-	int tempreg = radeonFindFreeTemporary(t);
-
-	emit2(t->Program, OPCODE_ADD, 0, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1]));
-	emit3(t->Program, OPCODE_CMP, inst->SaturateMode, inst->DstReg,
-		srcreg(PROGRAM_TEMPORARY, tempreg), builtin_zero, builtin_one);
-}
-
-static void transform_SLT(struct radeon_transform_context* t,
-	struct prog_instruction* inst)
-{
-	int tempreg = radeonFindFreeTemporary(t);
-
-	emit2(t->Program, OPCODE_ADD, 0, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1]));
-	emit3(t->Program, OPCODE_CMP, inst->SaturateMode, inst->DstReg,
-		srcreg(PROGRAM_TEMPORARY, tempreg), builtin_one, builtin_zero);
-}
-
-static void transform_SUB(struct radeon_transform_context* t,
-	struct prog_instruction* inst)
-{
-	emit2(t->Program, OPCODE_ADD, inst->SaturateMode, inst->DstReg, inst->SrcReg[0], negate(inst->SrcReg[1]));
-}
-
-static void transform_SWZ(struct radeon_transform_context* t,
-	struct prog_instruction* inst)
-{
-	emit1(t->Program, OPCODE_MOV, inst->SaturateMode, inst->DstReg, inst->SrcReg[0]);
-}
-
-static void transform_XPD(struct radeon_transform_context* t,
-	struct prog_instruction* inst)
-{
-	int tempreg = radeonFindFreeTemporary(t);
-
-	emit2(t->Program, OPCODE_MUL, 0, dstreg(PROGRAM_TEMPORARY, tempreg),
-		swizzle(inst->SrcReg[0], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W),
-		swizzle(inst->SrcReg[1], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W));
-	emit3(t->Program, OPCODE_MAD, inst->SaturateMode, inst->DstReg,
-		swizzle(inst->SrcReg[0], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W),
-		swizzle(inst->SrcReg[1], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W),
-		negate(srcreg(PROGRAM_TEMPORARY, tempreg)));
-}
-
-
-/**
- * Can be used as a transformation for @ref radeonClauseLocalTransform,
- * no userData necessary.
- *
- * Eliminates the following ALU instructions:
- *  ABS, DPH, DST, FLR, LIT, LRP, POW, SGE, SLT, SUB, SWZ, XPD
- * using:
- *  MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
- *
- * Transforms RSQ to Radeon's native RSQ by explicitly setting
- * absolute value.
- *
- * @note should be applicable to R300 and R500 fragment programs.
- */
-GLboolean radeonTransformALU(struct radeon_transform_context* t,
-	struct prog_instruction* inst,
-	void* unused)
-{
-	switch(inst->Opcode) {
-	case OPCODE_ABS: transform_ABS(t, inst); return GL_TRUE;
-	case OPCODE_DPH: transform_DPH(t, inst); return GL_TRUE;
-	case OPCODE_DST: transform_DST(t, inst); return GL_TRUE;
-	case OPCODE_FLR: transform_FLR(t, inst); return GL_TRUE;
-	case OPCODE_LIT: transform_LIT(t, inst); return GL_TRUE;
-	case OPCODE_LRP: transform_LRP(t, inst); return GL_TRUE;
-	case OPCODE_POW: transform_POW(t, inst); return GL_TRUE;
-	case OPCODE_RSQ: transform_RSQ(t, inst); return GL_TRUE;
-	case OPCODE_SGE: transform_SGE(t, inst); return GL_TRUE;
-	case OPCODE_SLT: transform_SLT(t, inst); return GL_TRUE;
-	case OPCODE_SUB: transform_SUB(t, inst); return GL_TRUE;
-	case OPCODE_SWZ: transform_SWZ(t, inst); return GL_TRUE;
-	case OPCODE_XPD: transform_XPD(t, inst); return GL_TRUE;
-	default:
-		return GL_FALSE;
-	}
-}
-
-
-static void sincos_constants(struct radeon_transform_context* t, GLuint *constants)
-{
-	static const GLfloat SinCosConsts[2][4] = {
-		{
-			1.273239545,		// 4/PI
-			-0.405284735,		// -4/(PI*PI)
-			3.141592654,		// PI
-			0.2225			// weight
-		},
-		{
-			0.75,
-			0.5,
-			0.159154943,		// 1/(2*PI)
-			6.283185307		// 2*PI
-		}
-	};
-	int i;
-
-	for(i = 0; i < 2; ++i) {
-		GLuint swz;
-		constants[i] = _mesa_add_unnamed_constant(t->Program->Parameters, SinCosConsts[i], 4, &swz);
-		ASSERT(swz == SWIZZLE_NOOP);
-	}
-}
-
-/**
- * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
- *
- * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
- * MAD tmp.x, tmp.y, |src|, tmp.x
- * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
- * MAD dest, tmp.y, weight, tmp.x
- */
-static void sin_approx(struct radeon_transform_context* t,
-	struct prog_dst_register dst, struct prog_src_register src, const GLuint* constants)
-{
-	GLuint tempreg = radeonFindFreeTemporary(t);
-
-	emit2(t->Program, OPCODE_MUL, 0, dstregtmpmask(tempreg, WRITEMASK_XY),
-		swizzle(src, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
-		srcreg(PROGRAM_CONSTANT, constants[0]));
-	emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_X),
-		swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
-		absolute(swizzle(src, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)),
-		swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X));
-	emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_Y),
-		swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
-		absolute(swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)),
-		negate(swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)));
-	emit3(t->Program, OPCODE_MAD, 0, dst,
-		swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
-		swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-		swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X));
-}
-
-/**
- * Translate the trigonometric functions COS, SIN, and SCS
- * using only the basic instructions
- *  MOV, ADD, MUL, MAD, FRC
- */
-GLboolean radeonTransformTrigSimple(struct radeon_transform_context* t,
-	struct prog_instruction* inst,
-	void* unused)
-{
-	if (inst->Opcode != OPCODE_COS &&
-	    inst->Opcode != OPCODE_SIN &&
-	    inst->Opcode != OPCODE_SCS)
-		return GL_FALSE;
-
-	GLuint constants[2];
-	GLuint tempreg = radeonFindFreeTemporary(t);
-
-	sincos_constants(t, constants);
-
-	if (inst->Opcode == OPCODE_COS) {
-		// MAD tmp.x, src, 1/(2*PI), 0.75
-		// FRC tmp.x, tmp.x
-		// MAD tmp.z, tmp.x, 2*PI, -PI
-		emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W),
-			swizzle(inst->SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X));
-		emit1(t->Program, OPCODE_FRC, 0, dstregtmpmask(tempreg, WRITEMASK_W),
-			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W));
-		emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W),
-			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-			negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z)));
-
-		sin_approx(t, inst->DstReg,
-			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-			constants);
-	} else if (inst->Opcode == OPCODE_SIN) {
-		emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W),
-			swizzle(inst->SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y));
-		emit1(t->Program, OPCODE_FRC, 0, dstregtmpmask(tempreg, WRITEMASK_W),
-			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W));
-		emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W),
-			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-			negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z)));
-
-		sin_approx(t, inst->DstReg,
-			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-			constants);
-	} else {
-		emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_XY),
-			swizzle(inst->SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W));
-		emit1(t->Program, OPCODE_FRC, 0, dstregtmpmask(tempreg, WRITEMASK_XY),
-			srcreg(PROGRAM_TEMPORARY, tempreg));
-		emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_XY),
-			srcreg(PROGRAM_TEMPORARY, tempreg),
-			swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
-			negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z)));
-
-		struct prog_dst_register dst = inst->DstReg;
-
-		dst.WriteMask = inst->DstReg.WriteMask & WRITEMASK_X;
-		sin_approx(t, dst,
-			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
-			constants);
-
-		dst.WriteMask = inst->DstReg.WriteMask & WRITEMASK_Y;
-		sin_approx(t, dst,
-			swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
-			constants);
-	}
-
-	return GL_TRUE;
-}
-
-
-/**
- * Transform the trigonometric functions COS, SIN, and SCS
- * to include pre-scaling by 1/(2*PI) and taking the fractional
- * part, so that the input to COS and SIN is always in the range [0,1).
- * SCS is replaced by one COS and one SIN instruction.
- *
- * @warning This transformation implicitly changes the semantics of SIN and COS!
- */
-GLboolean radeonTransformTrigScale(struct radeon_transform_context* t,
-	struct prog_instruction* inst,
-	void* unused)
-{
-	if (inst->Opcode != OPCODE_COS &&
-	    inst->Opcode != OPCODE_SIN &&
-	    inst->Opcode != OPCODE_SCS)
-		return GL_FALSE;
-
-	static const GLfloat RCP_2PI[] = { 0.15915494309189535 };
-	GLuint temp;
-	GLuint constant;
-	GLuint constant_swizzle;
-
-	temp = radeonFindFreeTemporary(t);
-	constant = _mesa_add_unnamed_constant(t->Program->Parameters, RCP_2PI, 1, &constant_swizzle);
-
-	emit2(t->Program, OPCODE_MUL, 0, dstregtmpmask(temp, WRITEMASK_W),
-		swizzle(inst->SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
-		srcregswz(PROGRAM_CONSTANT, constant, constant_swizzle));
-	emit1(t->Program, OPCODE_FRC, 0, dstregtmpmask(temp, WRITEMASK_W),
-		srcreg(PROGRAM_TEMPORARY, temp));
-
-	if (inst->Opcode == OPCODE_COS) {
-		emit1(t->Program, OPCODE_COS, inst->SaturateMode, inst->DstReg,
-			srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW));
-	} else if (inst->Opcode == OPCODE_SIN) {
-		emit1(t->Program, OPCODE_SIN, inst->SaturateMode,
-			inst->DstReg, srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW));
-	} else if (inst->Opcode == OPCODE_SCS) {
-		struct prog_dst_register moddst = inst->DstReg;
-
-		if (inst->DstReg.WriteMask & WRITEMASK_X) {
-			moddst.WriteMask = WRITEMASK_X;
-			emit1(t->Program, OPCODE_COS, inst->SaturateMode, moddst,
-				srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW));
-		}
-		if (inst->DstReg.WriteMask & WRITEMASK_Y) {
-			moddst.WriteMask = WRITEMASK_Y;
-			emit1(t->Program, OPCODE_SIN, inst->SaturateMode, moddst,
-				srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW));
-		}
-	}
-
-	return GL_TRUE;
-}
-
-/**
- * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
- * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
- * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
- *
- * @warning This explicitly changes the form of DDX and DDY!
- */
-
-GLboolean radeonTransformDeriv(struct radeon_transform_context* t,
-	struct prog_instruction* inst,
-	void* unused)
-{
-	if (inst->Opcode != OPCODE_DDX && inst->Opcode != OPCODE_DDY)
-		return GL_FALSE;
-
-	struct prog_src_register B = inst->SrcReg[1];
-
-	B.Swizzle = MAKE_SWIZZLE4(SWIZZLE_ONE, SWIZZLE_ONE,
-						SWIZZLE_ONE, SWIZZLE_ONE);
-	B.NegateBase = NEGATE_XYZW;
-
-	emit2(t->Program, inst->Opcode, inst->SaturateMode, inst->DstReg,
-		inst->SrcReg[0], B);
-
-	return GL_TRUE;
-}
diff --git a/src/mesa/drivers/dri/r300/radeon_queryobj.c b/src/mesa/drivers/dri/r300/radeon_queryobj.c
new file mode 120000
index 0000000000..1d6ebc1c48
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_queryobj.c
@@ -0,0 +1 @@
+../radeon/radeon_queryobj.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_queryobj.h b/src/mesa/drivers/dri/r300/radeon_queryobj.h
new file mode 120000
index 0000000000..8f6f842b0a
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_queryobj.h
@@ -0,0 +1 @@
+../radeon/radeon_queryobj.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_screen.c b/src/mesa/drivers/dri/r300/radeon_screen.c
new file mode 120000
index 0000000000..86161118dd
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_screen.c
@@ -0,0 +1 @@
+../radeon/radeon_screen.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_screen.h b/src/mesa/drivers/dri/r300/radeon_screen.h
new file mode 120000
index 0000000000..23bb6bd459
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_screen.h
@@ -0,0 +1 @@
+../radeon/radeon_screen.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_span.c b/src/mesa/drivers/dri/r300/radeon_span.c
index 16f9fb99e6..232868c4c9 100644..120000
--- a/src/mesa/drivers/dri/r300/radeon_span.c
+++ b/src/mesa/drivers/dri/r300/radeon_span.c
@@ -1,349 +1 @@
-/**************************************************************************
-
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
-                     VA Linux Systems Inc., Fremont, California.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-All Rights Reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Kevin E. Martin <martin@valinux.com>
- *   Gareth Hughes <gareth@valinux.com>
- *   Keith Whitwell <keith@tungstengraphics.com>
- *
- */
-
-#include "main/glheader.h"
-#include "swrast/swrast.h"
-
-#include "r300_state.h"
-#include "radeon_ioctl.h"
-#include "r300_ioctl.h"
-#include "radeon_span.h"
-
-#include "drirenderbuffer.h"
-
-#define DBG 0
-
-/*
- * Note that all information needed to access pixels in a renderbuffer
- * should be obtained through the gl_renderbuffer parameter, not per-context
- * information.
- */
-#define LOCAL_VARS						\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;		\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;		\
-   const GLuint bottom = dPriv->h - 1;				\
-   GLubyte *buf = (GLubyte *) drb->flippedData			\
-      + (dPriv->y * drb->flippedPitch + dPriv->x) * drb->cpp;	\
-   GLuint p;							\
-   (void) p;
-
-#define LOCAL_DEPTH_VARS				\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;	\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
-   const GLuint bottom = dPriv->h - 1;			\
-   GLuint xo = dPriv->x;				\
-   GLuint yo = dPriv->y;				\
-   GLubyte *buf = (GLubyte *) drb->Base.Data;
-
-#define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
-
-#define Y_FLIP(Y) (bottom - (Y))
-
-#define HW_LOCK()
-
-#define HW_UNLOCK()
-
-/* ================================================================
- * Color buffer
- */
-
-/* 16 bit, RGB565 color spanline and pixel functions
- */
-#define SPANTMP_PIXEL_FMT GL_RGB
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
-
-#define TAG(x)    radeon##x##_RGB565
-#define TAG2(x,y) radeon##x##_RGB565##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 2)
-#include "spantmp2.h"
-
-/* 32 bit, ARGB8888 color spanline and pixel functions
- */
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    radeon##x##_ARGB8888
-#define TAG2(x,y) radeon##x##_ARGB8888##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 4)
-#include "spantmp2.h"
-
-/* ================================================================
- * Depth buffer
- */
-
-/* The Radeon family has depth tiling on all the time, so we have to convert
- * the x,y coordinates into the memory bus address (mba) in the same
- * manner as the engine.  In each case, the linear block address (ba)
- * is calculated, and then wired with x and y to produce the final
- * memory address.
- * The chip will do address translation on its own if the surface registers
- * are set up correctly. It is not quite enough to get it working with hyperz
- * too...
- */
-
-static GLuint radeon_mba_z32(const driRenderbuffer * drb, GLint x, GLint y)
-{
-	GLuint pitch = drb->pitch;
-	if (drb->depthHasSurface) {
-		return 4 * (x + y * pitch);
-	} else {
-		GLuint ba, address = 0;	/* a[0..1] = 0           */
-
-#ifdef COMPILE_R300
-		ba = (y / 8) * (pitch / 8) + (x / 8);
-#else
-		ba = (y / 16) * (pitch / 16) + (x / 16);
-#endif
-
-		address |= (x & 0x7) << 2;	/* a[2..4] = x[0..2]     */
-		address |= (y & 0x3) << 5;	/* a[5..6] = y[0..1]     */
-		address |= (((x & 0x10) >> 2) ^ (y & 0x4)) << 5;	/* a[7]    = x[4] ^ y[2] */
-		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
-
-		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
-		address |= (((x & 0x8) << 1) ^ (y & 0x10)) << 7;	/* a[11]   = x[3] ^ y[4] */
-		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
-
-		return address;
-	}
-}
-
-static INLINE GLuint
-radeon_mba_z16(const driRenderbuffer * drb, GLint x, GLint y)
-{
-	GLuint pitch = drb->pitch;
-	if (drb->depthHasSurface) {
-		return 2 * (x + y * pitch);
-	} else {
-		GLuint ba, address = 0;	/* a[0]    = 0           */
-
-		ba = (y / 16) * (pitch / 32) + (x / 32);
-
-		address |= (x & 0x7) << 1;	/* a[1..3] = x[0..2]     */
-		address |= (y & 0x7) << 4;	/* a[4..6] = y[0..2]     */
-		address |= (x & 0x8) << 4;	/* a[7]    = x[3]        */
-		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
-		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
-		address |= ((x & 0x10) ^ (y & 0x10)) << 7;	/* a[11]   = x[4] ^ y[4] */
-		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
-
-		return address;
-	}
-}
-
-/* 16-bit depth buffer functions
- */
-#define VALUE_TYPE GLushort
-
-#define WRITE_DEPTH( _x, _y, d )					\
-   *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo )) = d;
-
-#define READ_DEPTH( d, _x, _y )						\
-   d = *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo ));
-
-#define TAG(x) radeon##x##_z16
-#include "depthtmp.h"
-
-/* 24 bit depth, 8 bit stencil depthbuffer functions
- *
- * Careful: It looks like the R300 uses ZZZS byte order while the R200
- * uses SZZZ for 24 bit depth, 8 bit stencil mode.
- */
-#define VALUE_TYPE GLuint
-
-#ifdef COMPILE_R300
-#define WRITE_DEPTH( _x, _y, d )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0x000000ff;							\
-   tmp |= ((d << 8) & 0xffffff00);					\
-   *(GLuint *)(buf + offset) = tmp;					\
-} while (0)
-#else
-#define WRITE_DEPTH( _x, _y, d )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0xff000000;							\
-   tmp |= ((d) & 0x00ffffff);						\
-   *(GLuint *)(buf + offset) = tmp;					\
-} while (0)
-#endif
-
-#ifdef COMPILE_R300
-#define READ_DEPTH( d, _x, _y )						\
-  do { \
-    d = (*(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,		\
-					 _y + yo )) & 0xffffff00) >> 8; \
-  }while(0)
-#else
-#define READ_DEPTH( d, _x, _y )						\
-   d = *(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,			\
-					 _y + yo )) & 0x00ffffff;
-#endif
-
-#define TAG(x) radeon##x##_z24_s8
-#include "depthtmp.h"
-
-/* ================================================================
- * Stencil buffer
- */
-
-/* 24 bit depth, 8 bit stencil depthbuffer functions
- */
-#ifdef COMPILE_R300
-#define WRITE_STENCIL( _x, _y, d )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0xffffff00;							\
-   tmp |= (d) & 0xff;							\
-   *(GLuint *)(buf + offset) = tmp;					\
-} while (0)
-#else
-#define WRITE_STENCIL( _x, _y, d )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0x00ffffff;							\
-   tmp |= (((d) & 0xff) << 24);						\
-   *(GLuint *)(buf + offset) = tmp;					\
-} while (0)
-#endif
-
-#ifdef COMPILE_R300
-#define READ_STENCIL( d, _x, _y )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   d = tmp & 0x000000ff;						\
-} while (0)
-#else
-#define READ_STENCIL( d, _x, _y )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   d = (tmp & 0xff000000) >> 24;					\
-} while (0)
-#endif
-
-#define TAG(x) radeon##x##_z24_s8
-#include "stenciltmp.h"
-
-/* Move locking out to get reasonable span performance (10x better
- * than doing this in HW_LOCK above).  WaitForIdle() is the main
- * culprit.
- */
-
-static void radeonSpanRenderStart(GLcontext * ctx)
-{
-	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-#ifdef COMPILE_R300
-	r300ContextPtr r300 = (r300ContextPtr) rmesa;
-	R300_FIREVERTICES(r300);
-#else
-	RADEON_FIREVERTICES(rmesa);
-#endif
-	LOCK_HARDWARE(rmesa);
-	radeonWaitForIdleLocked(rmesa);
-
-	/* Read the first pixel in the frame buffer.  This should
-	 * be a noop, right?  In fact without this conform fails as reading
-	 * from the framebuffer sometimes produces old results -- the
-	 * on-card read cache gets mixed up and doesn't notice that the
-	 * framebuffer has been updated.
-	 *
-	 * Note that we should probably be reading some otherwise unused
-	 * region of VRAM, otherwise we might get incorrect results when
-	 * reading pixels from the top left of the screen.
-	 *
-	 * I found this problem on an R420 with glean's texCube test.
-	 * Note that the R200 span code also *writes* the first pixel in the
-	 * framebuffer, but I've found this to be unnecessary.
-	 *  -- Nicolai Hähnle, June 2008
-	 */
-	{
-		int p;
-		driRenderbuffer *drb =
-			(driRenderbuffer *) ctx->WinSysDrawBuffer->_ColorDrawBuffers[0];
-		volatile int *buf =
-			(volatile int *)(rmesa->dri.screen->pFB + drb->offset);
-		p = *buf;
-	}
-}
-
-static void radeonSpanRenderFinish(GLcontext * ctx)
-{
-	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-	_swrast_flush(ctx);
-	UNLOCK_HARDWARE(rmesa);
-}
-
-void radeonInitSpanFuncs(GLcontext * ctx)
-{
-	struct swrast_device_driver *swdd =
-	    _swrast_GetDeviceDriverReference(ctx);
-	swdd->SpanRenderStart = radeonSpanRenderStart;
-	swdd->SpanRenderFinish = radeonSpanRenderFinish;
-}
-
-/**
- * Plug in the Get/Put routines for the given driRenderbuffer.
- */
-void radeonSetSpanFunctions(driRenderbuffer * drb, const GLvisual * vis)
-{
-	if (drb->Base.InternalFormat == GL_RGBA) {
-		if (vis->redBits == 5 && vis->greenBits == 6
-		    && vis->blueBits == 5) {
-			radeonInitPointers_RGB565(&drb->Base);
-		} else {
-			radeonInitPointers_ARGB8888(&drb->Base);
-		}
-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT16) {
-		radeonInitDepthPointers_z16(&drb->Base);
-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT24) {
-		radeonInitDepthPointers_z24_s8(&drb->Base);
-	} else if (drb->Base.InternalFormat == GL_STENCIL_INDEX8_EXT) {
-		radeonInitStencilPointers_z24_s8(&drb->Base);
-	}
-}
+../radeon/radeon_span.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_span.h b/src/mesa/drivers/dri/r300/radeon_span.h
new file mode 120000
index 0000000000..f9d634508c
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_span.h
@@ -0,0 +1 @@
+../radeon/radeon_span.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_state.c b/src/mesa/drivers/dri/r300/radeon_state.c
deleted file mode 100644
index c401da6c54..0000000000
--- a/src/mesa/drivers/dri/r300/radeon_state.c
+++ /dev/null
@@ -1,244 +0,0 @@
-/**************************************************************************
-
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell <keith@tungstengraphics.com>
- */
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/api_arrayelt.h"
-#include "main/enums.h"
-#include "main/framebuffer.h"
-#include "main/colormac.h"
-#include "main/light.h"
-
-#include "swrast/swrast.h"
-#include "vbo/vbo.h"
-#include "tnl/tnl.h"
-#include "tnl/t_pipeline.h"
-#include "swrast_setup/swrast_setup.h"
-
-#include "radeon_ioctl.h"
-#include "radeon_state.h"
-#include "r300_ioctl.h"
-
-
-/* =============================================================
- * Scissoring
- */
-
-static GLboolean intersect_rect(drm_clip_rect_t * out,
-				drm_clip_rect_t * a, drm_clip_rect_t * b)
-{
-	*out = *a;
-	if (b->x1 > out->x1)
-		out->x1 = b->x1;
-	if (b->y1 > out->y1)
-		out->y1 = b->y1;
-	if (b->x2 < out->x2)
-		out->x2 = b->x2;
-	if (b->y2 < out->y2)
-		out->y2 = b->y2;
-	if (out->x1 >= out->x2)
-		return GL_FALSE;
-	if (out->y1 >= out->y2)
-		return GL_FALSE;
-	return GL_TRUE;
-}
-
-void radeonRecalcScissorRects(radeonContextPtr radeon)
-{
-	drm_clip_rect_t *out;
-	int i;
-
-	/* Grow cliprect store?
-	 */
-	if (radeon->state.scissor.numAllocedClipRects < radeon->numClipRects) {
-		while (radeon->state.scissor.numAllocedClipRects <
-		       radeon->numClipRects) {
-			radeon->state.scissor.numAllocedClipRects += 1;	/* zero case */
-			radeon->state.scissor.numAllocedClipRects *= 2;
-		}
-
-		if (radeon->state.scissor.pClipRects)
-			FREE(radeon->state.scissor.pClipRects);
-
-		radeon->state.scissor.pClipRects =
-		    MALLOC(radeon->state.scissor.numAllocedClipRects *
-			   sizeof(drm_clip_rect_t));
-
-		if (radeon->state.scissor.pClipRects == NULL) {
-			radeon->state.scissor.numAllocedClipRects = 0;
-			return;
-		}
-	}
-
-	out = radeon->state.scissor.pClipRects;
-	radeon->state.scissor.numClipRects = 0;
-
-	for (i = 0; i < radeon->numClipRects; i++) {
-		if (intersect_rect(out,
-				   &radeon->pClipRects[i],
-				   &radeon->state.scissor.rect)) {
-			radeon->state.scissor.numClipRects++;
-			out++;
-		}
-	}
-}
-
-void radeonUpdateScissor(GLcontext* ctx)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-
-	if (radeon->dri.drawable) {
-		__DRIdrawablePrivate *dPriv = radeon->dri.drawable;
-		int x1 = dPriv->x + ctx->Scissor.X;
-		int y1 = dPriv->y + dPriv->h - (ctx->Scissor.Y + ctx->Scissor.Height);
-
-		radeon->state.scissor.rect.x1 = x1;
-		radeon->state.scissor.rect.y1 = y1;
-		radeon->state.scissor.rect.x2 = x1 + ctx->Scissor.Width;
-		radeon->state.scissor.rect.y2 = y1 + ctx->Scissor.Height;
-
-		radeonRecalcScissorRects(radeon);
-	}
-}
-
-static void radeonScissor(GLcontext* ctx, GLint x, GLint y, GLsizei w, GLsizei h)
-{
-	if (ctx->Scissor.Enabled) {
-		/* We don't pipeline cliprect changes */
-		r300Flush(ctx);
-		radeonUpdateScissor(ctx);
-	}
-}
-
-
-/**
- * Update cliprects and scissors.
- */
-void radeonSetCliprects(radeonContextPtr radeon)
-{
-	__DRIdrawablePrivate *const drawable = radeon->dri.drawable;
-	__DRIdrawablePrivate *const readable = radeon->dri.readable;
-	GLframebuffer *const draw_fb = (GLframebuffer*)drawable->driverPrivate;
-	GLframebuffer *const read_fb = (GLframebuffer*)readable->driverPrivate;
-
-	if (draw_fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
-		/* Can't ignore 2d windows if we are page flipping. */
-		if (drawable->numBackClipRects == 0 || radeon->doPageFlip ||
-		    radeon->sarea->pfCurrentPage == 1) {
-			radeon->numClipRects = drawable->numClipRects;
-			radeon->pClipRects = drawable->pClipRects;
-		} else {
-			radeon->numClipRects = drawable->numBackClipRects;
-			radeon->pClipRects = drawable->pBackClipRects;
-		}
-	} else {
-		/* front buffer (or none, or multiple buffers */
-		radeon->numClipRects = drawable->numClipRects;
-		radeon->pClipRects = drawable->pClipRects;
-	}
-
-	if ((draw_fb->Width != drawable->w) ||
-	    (draw_fb->Height != drawable->h)) {
-		_mesa_resize_framebuffer(radeon->glCtx, draw_fb,
-					 drawable->w, drawable->h);
-		draw_fb->Initialized = GL_TRUE;
-	}
-
-	if (drawable != readable) {
-		if ((read_fb->Width != readable->w) ||
-		    (read_fb->Height != readable->h)) {
-			_mesa_resize_framebuffer(radeon->glCtx, read_fb,
-						 readable->w, readable->h);
-			read_fb->Initialized = GL_TRUE;
-		}
-	}
-
-	if (radeon->state.scissor.enabled)
-		radeonRecalcScissorRects(radeon);
-
-	radeon->lastStamp = drawable->lastStamp;
-}
-
-
-/**
- * Handle common enable bits.
- * Called as a fallback by r200Enable/r300Enable.
- */
-void radeonEnable(GLcontext* ctx, GLenum cap, GLboolean state)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-
-	switch(cap) {
-	case GL_SCISSOR_TEST:
-		/* We don't pipeline cliprect & scissor changes */
-		r300Flush(ctx);
-
-		radeon->state.scissor.enabled = state;
-		radeonUpdateScissor(ctx);
-		break;
-
-	default:
-		return;
-	}
-}
-
-
-/**
- * Initialize default state.
- * This function is called once at context init time from
- * r200InitState/r300InitState
- */
-void radeonInitState(radeonContextPtr radeon)
-{
-	radeon->Fallback = 0;
-
-	if (radeon->glCtx->Visual.doubleBufferMode && radeon->sarea->pfCurrentPage == 0) {
-		radeon->state.color.drawOffset = radeon->radeonScreen->backOffset;
-		radeon->state.color.drawPitch = radeon->radeonScreen->backPitch;
-	} else {
-		radeon->state.color.drawOffset = radeon->radeonScreen->frontOffset;
-		radeon->state.color.drawPitch = radeon->radeonScreen->frontPitch;
-	}
-}
-
-
-/**
- * Initialize common state functions.
- * Called by r200InitStateFuncs/r300InitStateFuncs
- */
-void radeonInitStateFuncs(struct dd_function_table *functions)
-{
-	functions->Scissor = radeonScissor;
-}
diff --git a/src/mesa/drivers/dri/r300/radeon_state.h b/src/mesa/drivers/dri/r300/radeon_state.h
deleted file mode 100644
index 821cb40c7e..0000000000
--- a/src/mesa/drivers/dri/r300/radeon_state.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-Copyright (C) 2004 Nicolai Haehnle.  All Rights Reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Nicolai Haehnle <prefect_@gmx.net>
- */
-
-#ifndef __RADEON_STATE_H__
-#define __RADEON_STATE_H__
-
-extern void radeonRecalcScissorRects(radeonContextPtr radeon);
-extern void radeonSetCliprects(radeonContextPtr radeon);
-extern void radeonUpdateScissor(GLcontext* ctx);
-
-extern void radeonEnable(GLcontext* ctx, GLenum cap, GLboolean state);
-
-extern void radeonInitState(radeonContextPtr radeon);
-extern void radeonInitStateFuncs(struct dd_function_table* functions);
-
-#endif
diff --git a/src/mesa/drivers/dri/r300/radeon_texture.c b/src/mesa/drivers/dri/r300/radeon_texture.c
new file mode 120000
index 0000000000..a822710915
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_texture.c
@@ -0,0 +1 @@
+../radeon/radeon_texture.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/radeon_texture.h b/src/mesa/drivers/dri/r300/radeon_texture.h
new file mode 120000
index 0000000000..17fac3d5ea
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/radeon_texture.h
@@ -0,0 +1 @@
+../radeon/radeon_texture.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/server/radeon.h b/src/mesa/drivers/dri/r300/server/radeon.h
new file mode 120000
index 0000000000..81274a54f1
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/server/radeon.h
@@ -0,0 +1 @@
+../../radeon/server/radeon.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/server/radeon_dri.c b/src/mesa/drivers/dri/r300/server/radeon_dri.c
new file mode 120000
index 0000000000..d05847d650
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/server/radeon_dri.c
@@ -0,0 +1 @@
+../../radeon/server/radeon_dri.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/server/radeon_dri.h b/src/mesa/drivers/dri/r300/server/radeon_dri.h
new file mode 120000
index 0000000000..27c591d3c9
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/server/radeon_dri.h
@@ -0,0 +1 @@
+../../radeon/server/radeon_dri.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/server/radeon_egl.c b/src/mesa/drivers/dri/r300/server/radeon_egl.c
new file mode 120000
index 0000000000..d7735a7643
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/server/radeon_egl.c
@@ -0,0 +1 @@
+../../radeon/server/radeon_egl.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/server/radeon_macros.h b/src/mesa/drivers/dri/r300/server/radeon_macros.h
new file mode 120000
index 0000000000..c56cd735b8
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/server/radeon_macros.h
@@ -0,0 +1 @@
+../../radeon/server/radeon_macros.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r300/server/radeon_reg.h b/src/mesa/drivers/dri/r300/server/radeon_reg.h
new file mode 120000
index 0000000000..e2349dcb68
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/server/radeon_reg.h
@@ -0,0 +1 @@
+../../radeon/server/radeon_reg.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/Lindent b/src/mesa/drivers/dri/r600/Lindent
new file mode 100755
index 0000000000..7d8d8896e3
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/Lindent
@@ -0,0 +1,2 @@
+#!/bin/sh
+indent -npro -kr -i8 -ts8 -sob -l80 -ss -ncs "$@"
diff --git a/src/mesa/drivers/dri/r600/Makefile b/src/mesa/drivers/dri/r600/Makefile
new file mode 100644
index 0000000000..d925a2dfe3
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/Makefile
@@ -0,0 +1,79 @@
+# src/mesa/drivers/dri/r600/Makefile
+
+TOP = ../../../../..
+include $(TOP)/configs/current
+
+CFLAGS += $(RADEON_CFLAGS)
+
+LIBNAME = r600_dri.so
+
+MINIGLX_SOURCES = server/radeon_dri.c
+
+ifeq ($(USING_EGL), 1)
+EGL_SOURCES = server/radeon_egl.c
+endif
+
+ifeq ($(RADEON_LDFLAGS),)
+CS_SOURCES = radeon_cs_space_drm.c
+endif
+
+COMMON_SOURCES = \
+	../../common/driverfuncs.c \
+	../common/mm.c \
+	../common/utils.c \
+	../common/texmem.c \
+	../common/vblank.c \
+	../common/xmlconfig.c \
+	../common/dri_util.c
+
+RADEON_COMMON_SOURCES = \
+	radeon_bo_legacy.c \
+	radeon_common_context.c \
+	radeon_common.c \
+	radeon_cs_legacy.c \
+	radeon_dma.c \
+	radeon_debug.c \
+	radeon_fbo.c \
+	radeon_lock.c \
+	radeon_mipmap_tree.c \
+	radeon_span.c \
+	radeon_texture.c \
+	radeon_queryobj.c
+
+DRIVER_SOURCES = \
+		 radeon_screen.c \
+		 r600_context.c \
+		 r600_cmdbuf.c \
+		 r600_emit.c       \
+		 r700_assembler.c  \
+		 r700_fragprog.c \
+		 r700_vertprog.c \
+		 r700_shader.c \
+		 r700_shaderinst.c \
+		 r700_ioctl.c \
+		 r700_oglprog.c \
+		 r700_chip.c     \
+		 r700_state.c    \
+		 r700_clear.c    \
+		 r700_render.c   \
+		 r600_tex.c      \
+		 r600_texstate.c      \
+		 r700_debug.c    \
+		 $(RADEON_COMMON_SOURCES) \
+		 $(EGL_SOURCES) \
+		 $(CS_SOURCES)
+
+C_SOURCES = $(COMMON_SOURCES) $(DRIVER_SOURCES)
+
+DRIVER_DEFINES = -DCOMPILE_R600 -DR200_MERGED=0 \
+	-DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R600 \
+	-Wall
+# Disabled by default: add -DRADEON_BO_TRACK to DRIVER_DEFINES above to enable it (a '#' inside the continued assignment would swallow every flag after it).
+
+DRI_LIB_DEPS += $(RADEON_LDFLAGS)
+
+##### TARGETS #####
+
+include ../Makefile.template
+
+symlinks:
diff --git a/src/mesa/drivers/dri/r600/defaultendian.h b/src/mesa/drivers/dri/r600/defaultendian.h
new file mode 100644
index 0000000000..32caf32cd2
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/defaultendian.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+
+#ifndef _DEFINEENDIAN_H_
+#define _DEFINEENDIAN_H_
+
+/* Pick a register bit orientation when no compile flag selected one:
+ * if neither LITTLEENDIAN_CPU nor BIGENDIAN_CPU is defined, assume the
+ * little-endian layout. */
+#if !defined(LITTLEENDIAN_CPU) && !defined(BIGENDIAN_CPU)
+#define LITTLEENDIAN_CPU
+#endif
+
+#endif /* _DEFINEENDIAN_H_ */
diff --git a/src/mesa/drivers/dri/r600/r600_cmdbuf.c b/src/mesa/drivers/dri/r600/r600_cmdbuf.c
new file mode 100644
index 0000000000..3cfe03a45f
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_cmdbuf.c
@@ -0,0 +1,514 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * Mostly copied from radeon/radeon_cs_legacy.c
+ */
+
+#include <errno.h>
+
+#include "main/glheader.h"
+#include "main/state.h"
+#include "main/imports.h"
+#include "main/macros.h"
+#include "main/context.h"
+#include "main/simple_list.h"
+#include "swrast/swrast.h"
+
+#include "drm.h"
+#include "radeon_drm.h"
+
+#include "r600_context.h"
+#include "radeon_reg.h"
+#include "r600_cmdbuf.h"
+#include "r600_emit.h"
+#include "radeon_bocs_wrapper.h"
+#include "radeon_mipmap_tree.h"
+#include "radeon_reg.h"
+
+
+
+/* Allocate a zeroed CS whose packet buffer holds ndw dwords rounded up to a
+ * multiple of 0x400; returns NULL if either allocation fails. */
+static struct radeon_cs * r600_cs_create(struct radeon_cs_manager *csm,
+                                   uint32_t ndw)
+{
+    struct radeon_cs *new_cs = calloc(1, sizeof(*new_cs));
+
+    if (!new_cs)
+        return NULL;
+    new_cs->csm = csm;
+    new_cs->ndw = (ndw + 0x3FF) & (~0x3FF);
+    new_cs->relocs_total_size = 0;
+    new_cs->packets = malloc(4 * new_cs->ndw);
+    if (!new_cs->packets) {
+        free(new_cs);
+        new_cs = NULL;
+    }
+    return new_cs;
+}
+
+/**
+ * Record a relocation for bo at the current CS position and reserve the two
+ * dwords that r600_cs_process_relocs() fills in later.  Returns 0, -EINVAL
+ * on a domain violation, or -ENOMEM when an allocation fails.
+ */
+static int r600_cs_write_reloc(struct radeon_cs *cs,
+			       struct radeon_bo *bo,
+			       uint32_t read_domain,
+			       uint32_t write_domain,
+			       uint32_t flags)
+{
+    struct r600_cs_reloc_legacy *relocs, *entry;
+    uint32_t *grown;
+    int i;
+
+    relocs = (struct r600_cs_reloc_legacy *)cs->relocs;
+    /* in one CS a bo can only be in read or write domain but not
+     * in read & write domain at the same time */
+    if ((read_domain && write_domain) || (!read_domain && !write_domain)) {
+        return -EINVAL;
+    }
+    if (read_domain == RADEON_GEM_DOMAIN_CPU ||
+        write_domain == RADEON_GEM_DOMAIN_CPU) {
+        return -EINVAL;
+    }
+    /* check if bo is already referenced */
+    for (i = 0; i < cs->crelocs; i++) {
+        entry = &relocs[i];
+        if (entry->base.bo->handle != bo->handle) {
+            continue;
+        }
+        /* Exactly one domain is set in the arguments (checked above), so
+         * it only has to be the same kind as the one already recorded. */
+        if (entry->base.read_domain && !read_domain) {
+            return -EINVAL;
+        }
+        if (entry->base.write_domain && !write_domain) {
+            return -EINVAL;
+        }
+        /* Store each realloc result as soon as it succeeds: realloc may
+         * move the block, so keeping the old pointer when the second
+         * call fails would leave entry->indices dangling. */
+        grown = (uint32_t*)realloc(entry->indices,
+                                   (entry->cindices + 1) * 4);
+        if (grown == NULL) {
+            return -ENOMEM;
+        }
+        entry->indices = grown;
+        grown = (uint32_t*)realloc(entry->reloc_indices,
+                                   (entry->cindices + 1) * 4);
+        if (grown == NULL) {
+            return -ENOMEM;
+        }
+        entry->reloc_indices = grown;
+        entry->base.read_domain |= read_domain;
+        entry->base.write_domain |= write_domain;
+        entry->indices[entry->cindices] = cs->cdw;
+        entry->reloc_indices[entry->cindices] = cs->cdw;
+        entry->cindices++;
+        cs->section_cdw += 2;
+        cs->cdw += 2;
+        return 0;
+    }
+    /* add bo to reloc */
+    relocs = (struct r600_cs_reloc_legacy*)
+             realloc(cs->relocs,
+                     sizeof(struct r600_cs_reloc_legacy) * (cs->crelocs + 1));
+    if (relocs == NULL) {
+        return -ENOMEM;
+    }
+    cs->relocs = relocs;
+    entry = &relocs[cs->crelocs];
+    entry->base.bo = bo;
+    entry->base.read_domain = read_domain;
+    entry->base.write_domain = write_domain;
+    entry->base.flags = flags;
+    entry->indices = (uint32_t*)malloc(4);
+    entry->reloc_indices = (uint32_t*)malloc(4);
+    if (entry->indices == NULL || entry->reloc_indices == NULL) {
+        /* entry is not counted in crelocs yet: drop whichever half exists */
+        free(entry->indices);
+        free(entry->reloc_indices);
+        return -ENOMEM;
+    }
+    entry->indices[0] = cs->cdw;
+    entry->reloc_indices[0] = cs->cdw;
+    entry->cindices = 1;
+    cs->section_cdw += 2;
+    cs->cdw += 2;
+    cs->relocs_total_size += radeon_bo_legacy_relocs_size(bo);
+    cs->crelocs++;
+    radeon_bo_ref(bo);
+    return 0;
+}
+
+/* Open a section of ndw dwords, growing the packet buffer when needed.
+ * Returns 0, -EPIPE if a section is already open, -ENOMEM if realloc fails. */
+static int r600_cs_begin(struct radeon_cs *cs,
+                    uint32_t ndw,
+                    const char *file,
+                    const char *func,
+                    int line)
+{
+    uint32_t grown_ndw, *grown_packets;
+    int extra;
+
+    if (cs->section) {
+        fprintf(stderr, "CS already in a section(%s,%s,%d)\n",
+                cs->section_file, cs->section_func, cs->section_line);
+        fprintf(stderr, "CS can't start section(%s,%s,%d)\n",
+                file, func, line);
+        return -EPIPE;
+    }
+    cs->section_file = file;
+    cs->section_func = func;
+    cs->section_line = line;
+    cs->section_ndw = ndw;
+    cs->section_cdw = 0;
+    cs->section = 1;
+
+    if (cs->cdw + ndw <= cs->ndw)
+        return 0;
+    extra = (ndw > 0x400) ? ndw : 0x400;
+    grown_ndw = (cs->cdw + extra + 0x3FF) & (~0x3FF);
+    grown_packets = (uint32_t*)realloc(cs->packets, 4 * grown_ndw);
+    if (grown_packets == NULL)
+        return -ENOMEM;
+    cs->packets = grown_packets;
+    cs->ndw = grown_ndw;
+    return 0;
+}
+
+/* Close the current section.  Returns -EPIPE when no section is open or when
+ * the dwords written differ from the count passed to r600_cs_begin(). */
+static int r600_cs_end(struct radeon_cs *cs,
+                  const char *file,
+                  const char *func,
+                  int line)
+{
+    if (!cs->section) {
+        fprintf(stderr, "CS no section to end at (%s,%s,%d)\n",
+                file, func, line);
+        return -EPIPE;
+    }
+    cs->section = 0;
+
+    if (cs->section_ndw == cs->section_cdw) {
+        /* sizes agree; only a buffer overrun can still be wrong */
+        if (cs->cdw > cs->ndw) {
+            fprintf(stderr, "CS section overflow at (%s,%s,%d) cdw %d ndw %d\n",
+                    cs->section_file, cs->section_func, cs->section_line,
+                    cs->cdw, cs->ndw);
+            fprintf(stderr, "CS section end at (%s,%s,%d)\n", file, func, line);
+            assert(0);
+        }
+        return 0;
+    }
+    fprintf(stderr, "CS section size missmatch start at (%s,%s,%d) %d vs %d\n",
+            cs->section_file, cs->section_func, cs->section_line,
+            cs->section_ndw, cs->section_cdw);
+    fprintf(stderr, "cs->section_ndw = %d, cs->cdw = %d, cs->section_cdw = %d \n",
+            cs->section_ndw, cs->cdw, cs->section_cdw);
+    fprintf(stderr, "CS section end at (%s,%s,%d)\n", file, func, line);
+    return -EPIPE;
+}
+
+/**
+ * Validate every bo referenced by cs and fill reloc_chunk, which must hold
+ * 4 dwords per reloc entry (dword 0 = validated start offset, dword 3 = 0).
+ * Each recorded packet position gets a PKT3 NOP header plus the entry's
+ * dword offset.  Returns 0 and sets *length_dw_reloc_chunk, or the error
+ * from radeon_bo_legacy_validate().
+ */
+static int r600_cs_process_relocs(struct radeon_cs *cs, 
+                                  uint32_t * reloc_chunk,
+                                  uint32_t * length_dw_reloc_chunk) 
+{
+    struct r600_cs_reloc_legacy *relocs;
+    int i, j, r;
+    uint32_t offset_dw;
+
+    relocs = (struct r600_cs_reloc_legacy *)cs->relocs;
+restart:
+    /* a retry rescans from entry 0, so the chunk write position resets too */
+    offset_dw = 0;
+    for (i = 0; i < cs->crelocs; i++) {
+        uint32_t soffset = 0, eoffset = 0, asicoffset;
+
+        r = radeon_bo_legacy_validate(relocs[i].base.bo,
+                                      &soffset, &eoffset);
+        if (r == -EAGAIN) {
+            goto restart;
+        }
+        if (r) {
+            fprintf(stderr, "validated %p [0x%08X, 0x%08X]\n",
+                    relocs[i].base.bo, soffset, eoffset);
+            return r;
+        }
+        asicoffset = soffset;
+
+        for (j = 0; j < relocs[i].cindices; j++) {
+            if (asicoffset >= eoffset) {
+                fprintf(stderr, "validated %p [0x%08X, 0x%08X]\n",
+                        relocs[i].base.bo, soffset, eoffset);
+                fprintf(stderr, "above end: %p 0x%08X 0x%08X\n",
+                        relocs[i].base.bo, cs->packets[relocs[i].indices[j]],
+                        eoffset);
+                exit(0); /* NOTE(review): ends the whole process with status 0 */
+                return -EINVAL;
+            }
+            /* pkt3 nop header in ib chunk */
+            cs->packets[relocs[i].reloc_indices[j]] = 0xC0001000;
+            /* reloc index in ib chunk */
+            cs->packets[relocs[i].reloc_indices[j] + 1] = offset_dw;
+        }
+        /* asic offset in reloc chunk */ /* see alex drm r600_nomm_relocate */
+        reloc_chunk[offset_dw] = asicoffset;
+        reloc_chunk[offset_dw + 3] = 0;
+        offset_dw += 4;
+    }
+    *length_dw_reloc_chunk = offset_dw;
+    return 0;
+}
+
+/* Mark each bo of this CS pending at csm->pending_age, then unref it. */
+static int r600_cs_set_age(struct radeon_cs *cs) /* -------------- */
+{
+    struct r600_cs_manager_legacy *mgr = (struct r600_cs_manager_legacy*)cs->csm;
+    struct r600_cs_reloc_legacy *entries = (struct r600_cs_reloc_legacy *)cs->relocs;
+    int idx;
+
+    for (idx = 0; idx < cs->crelocs; idx++) {
+        radeon_bo_legacy_pending(entries[idx].base.bo, mgr->pending_age);
+        radeon_bo_unref(entries[idx].base.bo);
+    }
+    return 0;
+}
+
+#if 0
+/* Debug helper (compiled out): print every dword written so far to stderr. */
+static void dump_cmdbuf(struct radeon_cs *cs)
+{
+	int dw = 0;
+
+	fprintf(stderr,"--start--\n");
+	while (dw < cs->cdw)
+		fprintf(stderr,"0x%08x\n", cs->packets[dw++]);
+	fprintf(stderr,"--end--\n");
+}
+#endif
+
+/**
+ * Submit cs to the kernel (DRM_RADEON_CS) as two chunks: the raw packets and
+ * the relocation table from r600_cs_process_relocs().  The ioctl is retried
+ * on -EAGAIN, at most 1000 times.  Returns 0 on success or when relocation
+ * processing fails (that error is not propagated), -ENOMEM when the table
+ * cannot be allocated, otherwise the drmCommandWriteRead() error.
+ */
+static int r600_cs_emit(struct radeon_cs *cs)
+{
+    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)cs->csm;
+    struct drm_radeon_cs       cs_cmd = {0}; /* no stale stack data in the ioctl */
+    struct drm_radeon_cs_chunk cs_chunk[2];
+    uint32_t length_dw_reloc_chunk;
+    uint64_t chunk_ptrs[2];
+    uint32_t *reloc_chunk;
+    int r;
+    int retry = 0;
+
+    /* TODO : put chip level things here if need. */
+    /* csm->ctx->vtbl.emit_cs_header(cs, csm->ctx); */
+    csm->pending_count = 1;
+
+    /* 4 dwords per reloc entry.  calloc(1, 0) may return NULL, so a NULL
+     * result is only an error when there are relocations to store. */
+    reloc_chunk = (uint32_t*)calloc(1, cs->crelocs * 4 * 4);
+    if (reloc_chunk == NULL && cs->crelocs > 0) {
+        return -ENOMEM;
+    }
+    r = r600_cs_process_relocs(cs, reloc_chunk, &length_dw_reloc_chunk);
+    if (r) {
+        free(reloc_chunk);
+        return 0;
+    }
+
+    /* raw ib chunk */
+    cs_chunk[0].chunk_id   = RADEON_CHUNK_ID_IB;
+    cs_chunk[0].length_dw  = cs->cdw;
+    cs_chunk[0].chunk_data = (unsigned long)(cs->packets);
+
+    /* reloc chunk */
+    cs_chunk[1].chunk_id   = RADEON_CHUNK_ID_RELOCS;
+    cs_chunk[1].length_dw  = length_dw_reloc_chunk;
+    cs_chunk[1].chunk_data = (unsigned long)reloc_chunk;
+
+    chunk_ptrs[0] = (uint64_t)(unsigned long)&(cs_chunk[0]);
+    chunk_ptrs[1] = (uint64_t)(unsigned long)&(cs_chunk[1]);
+    cs_cmd.num_chunks = 2;
+    cs_cmd.chunks     = (uint64_t)(unsigned long)chunk_ptrs;
+
+    do {
+        r = drmCommandWriteRead(cs->csm->fd, DRM_RADEON_CS, &cs_cmd, sizeof(cs_cmd));
+        retry++;
+    } while (r == -EAGAIN && retry < 1000);
+
+    if (r) {
+        free(reloc_chunk);
+        return r;
+    }
+    csm->pending_age = cs_cmd.cs_id;
+    r600_cs_set_age(cs);
+    cs->csm->read_used = 0;
+    cs->csm->vram_write_used = 0;
+    cs->csm->gart_write_used = 0;
+    free(reloc_chunk);
+    return 0;
+}
+
+/* Free the per-entry index arrays of a reloc table; the table itself is not freed. */
+static void inline r600_cs_free_reloc(void *relocs_p, int crelocs)
+{
+    struct r600_cs_reloc_legacy *entry = relocs_p;
+
+    if (entry == NULL)
+        return;
+    for (; crelocs > 0; crelocs--, entry++) {
+        free(entry->indices);
+        free(entry->reloc_indices);
+    }
+}
+
+static int r600_cs_destroy(struct radeon_cs *cs)
+{
+    free(cs->packets);
+    r600_cs_free_reloc(cs->relocs, cs->crelocs); /* per-entry index arrays */
+    free(cs->relocs);                            /* then the table itself */
+    free(cs);
+    return 0;
+}
+
+/* Reset cs for reuse: drop all relocs and rewind cdw; cs->packets is kept. */
+static int r600_cs_erase(struct radeon_cs *cs)
+{
+    r600_cs_free_reloc(cs->relocs, cs->crelocs);
+    free(cs->relocs);
+    cs->relocs = NULL;
+    cs->crelocs = 0;
+    cs->relocs_total_size = 0;
+    cs->section = cs->cdw = 0;
+    return 0;
+}
+
+static int r600_cs_need_flush(struct radeon_cs *cs)
+{
+    /* Never requests a flush (always 0): this function used to flush when
+     * the BO usage got to a certain size, now the higher levels handle it. */
+    return 0;
+}
+
+static void r600_cs_print(struct radeon_cs *cs, FILE *file)
+{
+} /* intentionally empty: nothing is written to file */
+
+static struct radeon_cs_funcs  r600_cs_funcs = { /* positional: entries must follow the member order of struct radeon_cs_funcs */
+    r600_cs_create,
+    r600_cs_write_reloc,
+    r600_cs_begin,
+    r600_cs_end,
+    r600_cs_emit,
+    r600_cs_destroy,
+    r600_cs_erase,
+    r600_cs_need_flush,
+    r600_cs_print
+};
+
+/* Build the legacy CS manager for ctx: zero-allocated, wired to r600_cs_funcs
+ * and ctx->dri.fd, with pending_age starting at 1.  NULL if calloc fails. */
+struct radeon_cs_manager * r600_radeon_cs_manager_legacy_ctor(struct radeon_context *ctx)
+{
+    struct r600_cs_manager_legacy *mgr = calloc(1, sizeof(*mgr));
+
+    if (!mgr)
+        return NULL;
+
+    mgr->ctx = ctx;
+    mgr->pending_age = 1;
+    mgr->base.fd = ctx->dri.fd;
+    mgr->base.funcs = &r600_cs_funcs;
+    return (struct radeon_cs_manager*)mgr;
+}
+
+/* Set up the command buffer: atoms, the CS manager (GEM when kernel_mm is
+ * set, legacy otherwise), the CS, its flush callback and the VRAM/GTT
+ * limits.  Returns early if no CS manager can be created. */
+void r600InitCmdBuf(context_t *r600) /* from rcommonInitCmdBuf */
+{
+	radeonContextPtr rmesa = &r600->radeon;
+	GLuint size;
+
+	r600InitAtoms(r600);
+	/* Initialize command buffer */
+	size = 256 * driQueryOptioni(&rmesa->optionCache,
+				     "command_buffer_size");
+	if (size < 2 * rmesa->hw.max_state_size)
+		size = 2 * rmesa->hw.max_state_size + 65535;
+	if (size > 64 * 256)
+		size = 64 * 256;
+
+	if (!rmesa->radeonScreen->kernel_mm) {
+		rmesa->cmdbuf.csm = r600_radeon_cs_manager_legacy_ctor(rmesa);
+	} else {
+		int fd = rmesa->radeonScreen->driScreen->fd;
+		rmesa->cmdbuf.csm = radeon_cs_manager_gem_ctor(fd);
+	}
+	if (rmesa->cmdbuf.csm == NULL) {
+		/* FIXME: fatal error */
+		return;
+	}
+	rmesa->cmdbuf.size = size;
+	rmesa->cmdbuf.cs = radeon_cs_create(rmesa->cmdbuf.csm, size);
+	assert(rmesa->cmdbuf.cs != NULL);
+
+	radeon_cs_space_set_flush(rmesa->cmdbuf.cs,
+				  (void (*)(void *))rmesa->glCtx->Driver.Flush, rmesa->glCtx);
+
+	if (rmesa->radeonScreen->kernel_mm) {
+		struct drm_radeon_gem_info mminfo;
+
+		if (drmCommandWriteRead(rmesa->dri.fd, DRM_RADEON_GEM_INFO, &mminfo, sizeof(mminfo)) == 0) {
+			radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_VRAM, mminfo.vram_visible);
+			radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_GTT, mminfo.gart_size);
+		}
+	} else {
+		radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_VRAM, rmesa->radeonScreen->texSize[0]);
+		radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_GTT, rmesa->radeonScreen->gartTextures.size);
+	}
+}
+
diff --git a/src/mesa/drivers/dri/r600/r600_cmdbuf.h b/src/mesa/drivers/dri/r600/r600_cmdbuf.h
new file mode 100644
index 0000000000..eba43d37b6
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_cmdbuf.h
@@ -0,0 +1,212 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __R600_CMDBUF_H__
+#define __R600_CMDBUF_H__
+
+#include "r600_context.h"
+#include "r600_emit.h"
+
+#define RADEON_CP_PACKET3_NOP                       0xC0001000 /* same value r600_cs_process_relocs() writes as the reloc placeholder header */
+#define RADEON_CP_PACKET3_NEXT_CHAR                 0xC0001900
+#define RADEON_CP_PACKET3_PLY_NEXTSCAN              0xC0001D00
+#define RADEON_CP_PACKET3_SET_SCISSORS              0xC0001E00
+#define RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM     0xC0002300
+#define RADEON_CP_PACKET3_LOAD_MICROCODE            0xC0002400
+#define RADEON_CP_PACKET3_WAIT_FOR_IDLE             0xC0002600
+#define RADEON_CP_PACKET3_3D_DRAW_VBUF              0xC0002800
+#define RADEON_CP_PACKET3_3D_DRAW_IMMD              0xC0002900
+#define RADEON_CP_PACKET3_3D_DRAW_INDX              0xC0002A00
+#define RADEON_CP_PACKET3_LOAD_PALETTE              0xC0002C00
+#define RADEON_CP_PACKET3_3D_LOAD_VBPNTR            0xC0002F00
+#define RADEON_CP_PACKET3_CNTL_PAINT                0xC0009100
+#define RADEON_CP_PACKET3_CNTL_BITBLT               0xC0009200
+#define RADEON_CP_PACKET3_CNTL_SMALLTEXT            0xC0009300
+#define RADEON_CP_PACKET3_CNTL_HOSTDATA_BLT         0xC0009400
+#define RADEON_CP_PACKET3_CNTL_POLYLINE             0xC0009500
+#define RADEON_CP_PACKET3_CNTL_POLYSCANLINES        0xC0009800
+#define RADEON_CP_PACKET3_CNTL_PAINT_MULTI          0xC0009A00
+#define RADEON_CP_PACKET3_CNTL_BITBLT_MULTI         0xC0009B00
+#define RADEON_CP_PACKET3_CNTL_TRANS_BITBLT         0xC0009C00
+
+/* r6xx/r7xx register ranges [OFFSET, END): R600_OUT_BATCH_REGS picks the SET_* packet from the range that contains reg */
+#define R600_SET_CONFIG_REG_OFFSET                  0x00008000
+#define R600_SET_CONFIG_REG_END                     0x0000ac00
+#define R600_SET_CONTEXT_REG_OFFSET                 0x00028000
+#define R600_SET_CONTEXT_REG_END                    0x00029000
+#define R600_SET_ALU_CONST_OFFSET                   0x00030000
+#define R600_SET_ALU_CONST_END                      0x00032000
+#define R600_SET_RESOURCE_OFFSET                    0x00038000
+#define R600_SET_RESOURCE_END                       0x0003c000
+#define R600_SET_SAMPLER_OFFSET                     0x0003c000
+#define R600_SET_SAMPLER_END                        0x0003cff0
+#define R600_SET_CTL_CONST_OFFSET                   0x0003cff0
+#define R600_SET_CTL_CONST_END                      0x0003e200
+#define R600_SET_LOOP_CONST_OFFSET                  0x0003e200
+#define R600_SET_LOOP_CONST_END                     0x0003e380
+#define R600_SET_BOOL_CONST_OFFSET                  0x0003e380
+#define R600_SET_BOOL_CONST_END                     0x00040000
+
+/* r6xx/r7xx packet 3 types, used as the first argument of CP_PACKET3() (list is not strictly sorted by value) */
+#define R600_IT_INDIRECT_BUFFER_END               0x00001700
+#define R600_IT_SET_PREDICATION                   0x00002000
+#define R600_IT_REG_RMW                           0x00002100
+#define R600_IT_COND_EXEC                         0x00002200
+#define R600_IT_PRED_EXEC                         0x00002300
+#define R600_IT_START_3D_CMDBUF                   0x00002400
+#define R600_IT_DRAW_INDEX_2                      0x00002700
+#define R600_IT_CONTEXT_CONTROL                   0x00002800
+#define R600_IT_DRAW_INDEX_IMMD_BE                0x00002900
+#define R600_IT_INDEX_TYPE                        0x00002A00
+#define R600_IT_DRAW_INDEX                        0x00002B00
+#define R600_IT_DRAW_INDEX_AUTO                   0x00002D00
+#define R600_IT_DRAW_INDEX_IMMD                   0x00002E00
+#define R600_IT_NUM_INSTANCES                     0x00002F00
+#define R600_IT_STRMOUT_BUFFER_UPDATE             0x00003400
+#define R600_IT_INDIRECT_BUFFER_MP                0x00003800
+#define R600_IT_MEM_SEMAPHORE                     0x00003900
+#define R600_IT_MPEG_INDEX                        0x00003A00
+#define R600_IT_WAIT_REG_MEM                      0x00003C00
+#define R600_IT_MEM_WRITE                         0x00003D00
+#define R600_IT_INDIRECT_BUFFER                   0x00003200
+#define R600_IT_CP_INTERRUPT                      0x00004000
+#define R600_IT_SURFACE_SYNC                      0x00004300
+#define R600_IT_ME_INITIALIZE                     0x00004400
+#define R600_IT_COND_WRITE                        0x00004500
+#define R600_IT_EVENT_WRITE                       0x00004600
+#define R600_IT_EVENT_WRITE_EOP                   0x00004700
+#define R600_IT_ONE_REG_WRITE                     0x00005700
+#define R600_IT_SET_CONFIG_REG                    0x00006800
+#define R600_IT_SET_CONTEXT_REG                   0x00006900
+#define R600_IT_SET_ALU_CONST                     0x00006A00
+#define R600_IT_SET_BOOL_CONST                    0x00006B00
+#define R600_IT_SET_LOOP_CONST                    0x00006C00
+#define R600_IT_SET_RESOURCE                      0x00006D00
+#define R600_IT_SET_SAMPLER                       0x00006E00
+#define R600_IT_SET_CTL_CONST                     0x00006F00
+#define R600_IT_SURFACE_BASE_UPDATE               0x00007300
+
+struct r600_cs_manager_legacy
+{
+    struct radeon_cs_manager    base;   /* first member: the ctor returns this object cast to the base type */
+    struct radeon_context       *ctx;   /* context passed to the ctor */
+    /* hack for scratch stuff */
+    uint32_t                    pending_age;    /* 1 at construction, then the cs_id of the last successful submit */
+    uint32_t                    pending_count;  /* set to 1 by every r600_cs_emit() call */
+};
+
+struct r600_cs_reloc_legacy {
+    struct radeon_cs_reloc  base;           /* bo, read/write domains and flags */
+    uint32_t                cindices;       /* number of entries in each array below */
+    uint32_t                *indices;       /* cs->cdw recorded at each reference to the bo */
+    uint32_t                *reloc_indices; /* same positions; patched with NOP header + reloc offset at emit */
+};
+
+struct radeon_cs_manager * r600_radeon_cs_manager_legacy_ctor(struct radeon_context *ctx);
+
+/**
+ * Append one dword to the CS of b_l_rmesa, which the caller must have in scope.
+ */
+#define R600_OUT_BATCH(data) \
+	do { \
+		radeon_cs_write_dword(b_l_rmesa->cmdbuf.cs, (data)); \
+	} while (0)
+
+/**
+ * Append n dwords read from ptr to the CS of b_l_rmesa (must be in scope).
+ */
+#define R600_OUT_BATCH_TABLE(ptr,n) \
+	do { \
+		radeon_cs_write_table(b_l_rmesa->cmdbuf.cs, (ptr), (n)); \
+	} while (0)
+
+/**
+ * Record a relocation for bo; 'data' is unused, 'offset' is debug-print only.
+ */
+#define R600_OUT_BATCH_RELOC(data, bo, offset, rd, wd, flags) \
+	do { \
+		if (0 && offset) { \
+			fprintf(stderr, "(%s:%s:%d) offset : %d\n", \
+				__FILE__, __FUNCTION__, __LINE__, (offset)); \
+		} \
+		radeon_cs_write_reloc(b_l_rmesa->cmdbuf.cs, \
+				      (bo), (rd), (wd), (flags)); \
+	} while (0)
+
+/* R600/R700: header for writing num registers from reg: a SET_* PACKET3 plus ((reg) - range start) >> 2, or one PACKET0 dword when reg is in no range */
+#define R600_OUT_BATCH_REGS(reg, num)					\
+do {								\
+	if ((reg) >= R600_SET_CONFIG_REG_OFFSET && (reg) < R600_SET_CONFIG_REG_END) { \
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CONFIG_REG, (num)));	\
+		R600_OUT_BATCH(((reg) - R600_SET_CONFIG_REG_OFFSET) >> 2);	\
+	} else if ((reg) >= R600_SET_CONTEXT_REG_OFFSET && (reg) < R600_SET_CONTEXT_REG_END) { \
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CONTEXT_REG, (num)));	\
+		R600_OUT_BATCH(((reg) - R600_SET_CONTEXT_REG_OFFSET) >> 2);	\
+	} else if ((reg) >= R600_SET_ALU_CONST_OFFSET && (reg) < R600_SET_ALU_CONST_END) { \
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_ALU_CONST, (num)));	\
+		R600_OUT_BATCH(((reg) - R600_SET_ALU_CONST_OFFSET) >> 2);	\
+	} else if ((reg) >= R600_SET_RESOURCE_OFFSET && (reg) < R600_SET_RESOURCE_END) { \
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_RESOURCE, (num)));	\
+		R600_OUT_BATCH(((reg) - R600_SET_RESOURCE_OFFSET) >> 2);	\
+	} else if ((reg) >= R600_SET_SAMPLER_OFFSET && (reg) < R600_SET_SAMPLER_END) { \
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_SAMPLER, (num)));	\
+		R600_OUT_BATCH(((reg) - R600_SET_SAMPLER_OFFSET) >> 2);	\
+	} else if ((reg) >= R600_SET_CTL_CONST_OFFSET && (reg) < R600_SET_CTL_CONST_END) { \
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, (num)));	\
+		R600_OUT_BATCH(((reg) - R600_SET_CTL_CONST_OFFSET) >> 2);	\
+	} else if ((reg) >= R600_SET_LOOP_CONST_OFFSET && (reg) < R600_SET_LOOP_CONST_END) { \
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_LOOP_CONST, (num)));	\
+		R600_OUT_BATCH(((reg) - R600_SET_LOOP_CONST_OFFSET) >> 2);	\
+	} else if ((reg) >= R600_SET_BOOL_CONST_OFFSET && (reg) < R600_SET_BOOL_CONST_END) { \
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_BOOL_CONST, (num)));	\
+		R600_OUT_BATCH(((reg) - R600_SET_BOOL_CONST_OFFSET) >> 2);	\
+	} else {							\
+		R600_OUT_BATCH(CP_PACKET0((reg), (num))); \
+	}								\
+} while (0)
+
+/** Single register write to command buffer; requires 3 dwords for most things.
+ * do/while(0) keeps both writes together when used as an unbraced if/else body. */
+#define R600_OUT_BATCH_REGVAL(reg, val)		\
+	do { R600_OUT_BATCH_REGS((reg), 1); R600_OUT_BATCH((val)); } while (0)
+
+/** Continuous register range write to command buffer; the header is 2 dwords for the
+ * SET_* ranges (1 for the PACKET0 fallback), expects count dwords afterwards for register contents. */
+#define R600_OUT_BATCH_REGSEQ(reg, count)	\
+	R600_OUT_BATCH_REGS((reg), (count))
+
+extern void r600InitCmdBuf(context_t *r600);
+
+#endif				/* __R600_CMDBUF_H__ */
diff --git a/src/mesa/drivers/dri/r600/r600_context.c b/src/mesa/drivers/dri/r600/r600_context.c
new file mode 100644
index 0000000000..07a7bcf11f
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_context.c
@@ -0,0 +1,396 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#include "main/glheader.h"
+#include "main/api_arrayelt.h"
+#include "main/context.h"
+#include "main/simple_list.h"
+#include "main/imports.h"
+#include "main/matrix.h"
+#include "main/extensions.h"
+#include "main/state.h"
+#include "main/bufferobj.h"
+#include "main/texobj.h"
+
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "vbo/vbo.h"
+
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+#include "tnl/t_vp_build.h"
+
+#include "drivers/common/driverfuncs.h"
+
+#include "radeon_debug.h"
+#include "r600_context.h"
+#include "radeon_common_context.h"
+#include "radeon_span.h"
+#include "r600_cmdbuf.h"
+#include "r600_emit.h"
+#include "radeon_bocs_wrapper.h"
+
+#include "r700_state.h"
+#include "r700_ioctl.h"
+
+
+#include "vblank.h"
+#include "utils.h"
+#include "xmlpool.h"		/* for symbolic values of enum-type options */
+
+/* hw_tcl_on derives from future_hw_tcl_on when it's safe to change it. */
+int future_hw_tcl_on = 1;
+int hw_tcl_on = 1;
+
+#define need_GL_VERSION_2_0
+#define need_GL_ARB_point_parameters
+#define need_GL_ARB_vertex_program
+#define need_GL_EXT_blend_equation_separate
+#define need_GL_EXT_blend_func_separate
+#define need_GL_EXT_blend_minmax
+#define need_GL_EXT_framebuffer_object
+#define need_GL_EXT_fog_coord
+#define need_GL_EXT_gpu_program_parameters
+#define need_GL_EXT_provoking_vertex
+#define need_GL_EXT_secondary_color
+#define need_GL_EXT_stencil_two_side
+#define need_GL_ATI_separate_stencil
+#define need_GL_NV_vertex_program
+
+#include "extension_helper.h"
+
+extern const struct tnl_pipeline_stage *r700_pipeline[];
+
+const struct dri_extension card_extensions[] = {
+  /* *INDENT-OFF* */
+  {"GL_ARB_depth_texture",		NULL},
+  {"GL_ARB_fragment_program",		NULL},
+  {"GL_ARB_multitexture",		NULL},
+  {"GL_ARB_point_parameters",		GL_ARB_point_parameters_functions},
+  {"GL_ARB_shadow",			NULL},
+  {"GL_ARB_shadow_ambient",		NULL},
+  {"GL_ARB_texture_border_clamp",	NULL},
+  {"GL_ARB_texture_cube_map",		NULL},
+  {"GL_ARB_texture_env_add",		NULL},
+  {"GL_ARB_texture_env_combine",	NULL},
+  {"GL_ARB_texture_env_crossbar",	NULL},
+  {"GL_ARB_texture_env_dot3",		NULL},
+  {"GL_ARB_texture_mirrored_repeat",	NULL},
+  {"GL_ARB_vertex_program",		GL_ARB_vertex_program_functions},
+  {"GL_EXT_blend_equation_separate",	GL_EXT_blend_equation_separate_functions},
+  {"GL_EXT_blend_func_separate",	GL_EXT_blend_func_separate_functions},
+  {"GL_EXT_blend_minmax",		GL_EXT_blend_minmax_functions},
+  {"GL_EXT_blend_subtract",		NULL},
+  {"GL_EXT_packed_depth_stencil",	NULL},
+  {"GL_EXT_fog_coord",			GL_EXT_fog_coord_functions },
+  {"GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions},
+  {"GL_EXT_provoking_vertex",           GL_EXT_provoking_vertex_functions },
+  {"GL_EXT_secondary_color", 		GL_EXT_secondary_color_functions},
+  {"GL_EXT_shadow_funcs",		NULL},
+  {"GL_EXT_stencil_two_side",		GL_EXT_stencil_two_side_functions},
+  {"GL_EXT_stencil_wrap",		NULL},
+  {"GL_EXT_texture_edge_clamp",		NULL},
+  {"GL_EXT_texture_env_combine", 	NULL},
+  {"GL_EXT_texture_env_dot3", 		NULL},
+  {"GL_EXT_texture_filter_anisotropic",	NULL},
+  {"GL_EXT_texture_lod_bias",		NULL},
+  {"GL_EXT_texture_mirror_clamp",	NULL},
+  {"GL_EXT_texture_rectangle",		NULL},
+  {"GL_EXT_texture_sRGB",               NULL},
+  {"GL_ATI_separate_stencil",		GL_ATI_separate_stencil_functions},
+  {"GL_ATI_texture_env_combine3",	NULL},
+  {"GL_ATI_texture_mirror_once",	NULL},
+  {"GL_MESA_pack_invert",		NULL},
+  {"GL_MESA_ycbcr_texture",		NULL},
+  {"GL_MESAX_texture_float",		NULL},
+  {"GL_NV_blend_square",		NULL},
+  {"GL_NV_vertex_program",		GL_NV_vertex_program_functions},
+  {"GL_SGIS_generate_mipmap",		NULL},
+  {NULL,				NULL}
+  /* *INDENT-ON* */
+};
+
+
+const struct dri_extension mm_extensions[] = {
+  { "GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions },
+  { NULL, NULL }
+};
+
+/**
+ * The GL 2.0 functions are needed to make display lists work with functions
+ * added by GL_ATI_separate_stencil.  Single entry, no {NULL, NULL} terminator.
+ */
+const struct dri_extension gl_20_extension[] = {
+  {"GL_VERSION_2_0",			GL_VERSION_2_0_functions },
+};
+
+/* TNL RunPipeline hook: validates pending state and runs the pipeline with context textures locked. */
+static void r600RunPipeline(GLcontext * ctx)
+{
+    _mesa_lock_context_textures(ctx);
+
+    if (ctx->NewState)	/* state is validated while the textures are still locked */
+        _mesa_update_state_locked(ctx);
+    
+    _tnl_run_pipeline(ctx);
+    _mesa_unlock_context_textures(ctx);
+}
+/* get_lock vtbl hook: record this context as the SAREA context owner when it was not already. */
+static void r600_get_lock(radeonContextPtr rmesa)
+{
+	drm_radeon_sarea_t *sarea = rmesa->sarea;
+
+	if (sarea->ctx_owner != rmesa->dri.hwContext) {
+		sarea->ctx_owner = rmesa->dri.hwContext;
+		if (!rmesa->radeonScreen->kernel_mm)	/* texture aging only runs when kernel_mm is not set */
+			radeon_bo_legacy_texture_age(rmesa->radeonScreen->bom);
+	}
+}		  
+/* emit_cs_header vtbl hook; the body is empty, so nothing is emitted yet. */
+static void r600_vtbl_emit_cs_header(struct radeon_cs *cs, radeonContextPtr rmesa)
+{
+    /* TODO: flush the pipe so that all pending work is done */
+    /* to be enabled */
+}
+/* pre_emit_atoms vtbl hook: calls r700Start3D() on the enclosing r600 context. */
+static void r600_vtbl_pre_emit_atoms(radeonContextPtr radeon)
+{
+	r700Start3D((context_t *)radeon);	/* cast relies on 'radeon' being the first member of struct r600_context */
+}
+/* fallback vtbl hook: set (mode true) or clear (mode false) 'bit' in radeon.Fallback. */
+static void r600_fallback(GLcontext *ctx, GLuint bit, GLboolean mode)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	if (mode)
+		context->radeon.Fallback |= bit;	/* mark this fallback reason active */
+	else
+		context->radeon.Fallback &= ~bit;	/* drop this fallback reason */
+}
+/* Fill the radeon vtbl with the r600/r700 hooks; swtcl_flush is explicitly left NULL. */
+static void r600_init_vtbl(radeonContextPtr radeon)
+{
+	radeon->vtbl.get_lock = r600_get_lock;
+	radeon->vtbl.update_viewport_offset = r700UpdateViewportOffset;
+	radeon->vtbl.emit_cs_header = r600_vtbl_emit_cs_header;
+	radeon->vtbl.swtcl_flush = NULL;	/* no swtcl flush hook is installed */
+	radeon->vtbl.pre_emit_atoms = r600_vtbl_pre_emit_atoms;
+	radeon->vtbl.fallback = r600_fallback;
+}
+
+/* Create the device specific rendering context.
+ * Returns GL_TRUE on success, GL_FALSE if allocation or radeonInitContext() fails. */
+GLboolean r600CreateContext(const __GLcontextModes * glVisual,
+			    __DRIcontextPrivate * driContextPriv,
+			    void *sharedContextPrivate)
+{
+	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
+	struct dd_function_table functions;
+	context_t *r600;
+	GLcontext *ctx;
+
+	assert(glVisual);
+	assert(driContextPriv);	/* NOTE(review): already dereferenced in the initializer above */
+	assert(screen);
+
+	/* Allocate the R600 context */
+	r600 = (context_t*) CALLOC(sizeof(*r600));
+	if (!r600) {
+		radeon_error("Failed to allocate memory for context.\n");
+		return GL_FALSE;
+	}
+
+	if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
+		hw_tcl_on = future_hw_tcl_on = 0;	/* these are file-scope globals, not per-context state */
+
+	r600_init_vtbl(&r600->radeon);
+	/* Parse configuration files.
+	 * Do this here so that initialMaxAnisotropy is set before we create
+	 * the default textures.
+	 */
+	driParseConfigFiles(&r600->radeon.optionCache, &screen->optionCache,
+			    screen->driScreen->myNum, "r600");
+
+	r600->radeon.initialMaxAnisotropy = driQueryOptionf(&r600->radeon.optionCache,
+						     "def_max_anisotropy");
+
+	/* Init default driver functions then plug in our R600-specific functions
+	 * (the texture functions are especially important)
+	 */
+	_mesa_init_driver_functions(&functions);
+
+	r700InitStateFuncs(&functions);
+	r600InitTextureFuncs(&functions);
+	r700InitShaderFuncs(&functions);
+	r700InitIoctlFuncs(&functions);
+
+	if (!radeonInitContext(&r600->radeon, &functions,
+			       glVisual, driContextPriv,
+			       sharedContextPrivate)) {
+		radeon_error("Initializing context failed.\n");
+		FREE(r600);	/* NOTE(review): the option cache parsed above is not torn down here -- confirm who owns it */
+		return GL_FALSE;
+	}
+
+	/* Init r600 context data */
+	/* Set the maximum texture size small enough that we can guarantee that
+	 * all texture units can bind a maximal texture and have them all in
+	 * texturable memory at once.
+	 */
+
+	ctx = r600->radeon.glCtx;
+
+	ctx->Const.MaxTextureImageUnits =
+	    driQueryOptioni(&r600->radeon.optionCache, "texture_image_units");
+	ctx->Const.MaxTextureCoordUnits =
+	    driQueryOptioni(&r600->radeon.optionCache, "texture_coord_units");
+	ctx->Const.MaxTextureUnits =
+	    MIN2(ctx->Const.MaxTextureImageUnits,
+		 ctx->Const.MaxTextureCoordUnits);
+	ctx->Const.MaxTextureMaxAnisotropy = 16.0;
+	ctx->Const.MaxTextureLodBias = 16.0;
+
+	ctx->Const.MaxTextureLevels = 13;
+	ctx->Const.MaxTextureRectSize = 4096;
+
+	ctx->Const.MinPointSize   = 0x0001 / 8.0;
+	ctx->Const.MinPointSizeAA = 0x0001 / 8.0;
+	ctx->Const.MaxPointSize   = 0xffff / 8.0;
+	ctx->Const.MaxPointSizeAA = 0xffff / 8.0;
+
+	ctx->Const.MinLineWidth   = 0x0001 / 8.0;
+	ctx->Const.MinLineWidthAA = 0x0001 / 8.0;
+	ctx->Const.MaxLineWidth   = 0xffff / 8.0;
+	ctx->Const.MaxLineWidthAA = 0xffff / 8.0;
+
+	/* Needs further modifications */
+#if 0
+	ctx->Const.MaxArrayLockSize =
+	    ( /*512 */ RADEON_BUFFER_SIZE * 16 * 1024) / (4 * 4);
+#endif
+
+	ctx->Const.MaxDrawBuffers = 1;
+
+	/* Initialize the software rasterizer and helper modules.
+	 */
+	_swrast_CreateContext(ctx);
+	_vbo_CreateContext(ctx);
+	_tnl_CreateContext(ctx);
+	_swsetup_CreateContext(ctx);
+	_swsetup_Wakeup(ctx);
+	_ae_create_context(ctx);
+
+	/* Install the customized pipeline:
+	 */
+	_tnl_destroy_pipeline(ctx);
+	_tnl_install_pipeline(ctx, r700_pipeline);
+
+	/* Try and keep materials and vertices separate:
+	 */
+/* 	_tnl_isolate_materials(ctx, GL_TRUE); */
+
+	/* Configure swrast and TNL to match hardware characteristics:
+	 */
+	_swrast_allow_pixel_fog(ctx, GL_FALSE);
+	_swrast_allow_vertex_fog(ctx, GL_TRUE);
+	_tnl_allow_pixel_fog(ctx, GL_FALSE);
+	_tnl_allow_vertex_fog(ctx, GL_TRUE);
+
+	/* currently bogus data */
+	ctx->Const.VertexProgram.MaxInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
+	ctx->Const.VertexProgram.MaxNativeInstructions =
+		VSF_MAX_FRAGMENT_LENGTH / 4;
+	ctx->Const.VertexProgram.MaxNativeAttribs = 16;	/* r420 */
+	ctx->Const.VertexProgram.MaxTemps = 32;
+	ctx->Const.VertexProgram.MaxNativeTemps =
+		/*VSF_MAX_FRAGMENT_TEMPS */ 32;
+	ctx->Const.VertexProgram.MaxNativeParameters = 256;	/* r420 */
+	ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
+
+	ctx->Const.FragmentProgram.MaxNativeTemps = PFS_NUM_TEMP_REGS;
+	ctx->Const.FragmentProgram.MaxNativeAttribs = 11;	/* copy i915... */
+	ctx->Const.FragmentProgram.MaxNativeParameters = PFS_NUM_CONST_REGS;
+	ctx->Const.FragmentProgram.MaxNativeAluInstructions = PFS_MAX_ALU_INST;
+	ctx->Const.FragmentProgram.MaxNativeTexInstructions = PFS_MAX_TEX_INST;
+	ctx->Const.FragmentProgram.MaxNativeInstructions =
+	    PFS_MAX_ALU_INST + PFS_MAX_TEX_INST;
+	ctx->Const.FragmentProgram.MaxNativeTexIndirections =
+	    PFS_MAX_TEX_INDIRECT;
+	ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;	/* and these are?? */
+	ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
+	ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
+
+	radeon_init_debug();
+
+	driInitExtensions(ctx, card_extensions, GL_TRUE);
+	if (r600->radeon.radeonScreen->kernel_mm)	/* mm_extensions (framebuffer objects) only with kernel_mm */
+	  driInitExtensions(ctx, mm_extensions, GL_FALSE);
+
+	if (driQueryOptionb
+	    (&r600->radeon.optionCache, "disable_stencil_two_side"))
+		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
+
+	if (r600->radeon.glCtx->Mesa_DXTn
+	    && !driQueryOptionb(&r600->radeon.optionCache, "disable_s3tc")) {
+		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+		_mesa_enable_extension(ctx, "GL_S3_s3tc");
+	} else
+	    if (driQueryOptionb(&r600->radeon.optionCache, "force_s3tc_enable"))	/* reached only when the test above failed */
+	{
+		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+	}
+
+	radeon_fbo_init(&r600->radeon);
+   	radeonInitSpanFuncs( ctx );
+
+	r600InitCmdBuf(r600);
+
+	r700InitState(r600->radeon.glCtx);
+
+	TNL_CONTEXT(ctx)->Driver.RunPipeline = r600RunPipeline;
+
+	if (driQueryOptionb(&r600->radeon.optionCache, "no_rast")) {
+		radeon_warning("disabling 3D acceleration\n");
+#if R200_MERGED	/* without R200_MERGED only the warning above is emitted */
+		FALLBACK(&r600->radeon, RADEON_FALLBACK_DISABLE, 1);
+#endif
+	}
+
+	return GL_TRUE;
+}
+
+
diff --git a/src/mesa/drivers/dri/r600/r600_context.h b/src/mesa/drivers/dri/r600/r600_context.h
new file mode 100644
index 0000000000..c59df7505a
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_context.h
@@ -0,0 +1,208 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __R600_CONTEXT_H__
+#define __R600_CONTEXT_H__
+
+#include "tnl/t_vertex.h"
+#include "drm.h"
+#include "radeon_drm.h"
+#include "dri_util.h"
+#include "texmem.h"
+#include "radeon_common.h"
+
+#include "main/macros.h"
+#include "main/mtypes.h"
+#include "main/colormac.h"
+
+#include "r700_chip.h"
+#include "r600_tex.h"
+#include "r700_oglprog.h"
+#include "r700_vertprog.h"
+
+struct r600_context;
+typedef struct r600_context context_t;
+
+#include "main/mm.h"
+
+/************ DMA BUFFERS **************/
+
+/* The blit width for texture uploads
+ */
+#define R600_BLIT_WIDTH_BYTES 1024
+#define R600_MAX_TEXTURE_UNITS 8
+
+struct r600_texture_state {
+	int tc_count;		/* number of incoming texture coordinates from VAP */
+};
+
+/* Perhaps more if we store programs in vmem? */
+/* drm_r600_cmd_header_t->vpu->count is unsigned char */
+#define VSF_MAX_FRAGMENT_LENGTH (255*4)
+
+/* Can be tested with colormat currently. */
+#define VSF_MAX_FRAGMENT_TEMPS (14)
+
+#define STATE_R600_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0)
+#define STATE_R600_TEXRECT_FACTOR (STATE_INTERNAL_DRIVER+1)
+
+extern int hw_tcl_on;
+
+#define COLOR_IS_RGBA
+#define TAG(x) r600##x
+#include "tnl_dd/t_dd_vertex.h"
+#undef TAG
+
+#define PFS_MAX_ALU_INST	64
+#define PFS_MAX_TEX_INST	64
+#define PFS_MAX_TEX_INDIRECT 4
+#define PFS_NUM_TEMP_REGS	32
+#define PFS_NUM_CONST_REGS	16
+
+#define R600_MAX_AOS_ARRAYS		16
+
+#define REG_COORDS	0
+#define REG_COLOR0	1
+#define REG_TEX0	2
+
+#define R600_FALLBACK_NONE 0
+#define R600_FALLBACK_TCL 1
+#define R600_FALLBACK_RAST 2
+
+enum 
+{
+    NO_SHIFT    = 0,
+    LEFT_SHIFT  = 1,
+    RIGHT_SHIFT = 2,
+};
+/* One radeon_state_atom per hardware state group; R600_STATECHANGE() sets .dirty on a member by name. */
+struct r600_hw_state {
+	struct radeon_state_atom sq;
+	struct radeon_state_atom db;
+	struct radeon_state_atom stencil;
+	struct radeon_state_atom db_target;
+	struct radeon_state_atom sc;
+	struct radeon_state_atom scissor;
+	struct radeon_state_atom aa;
+	struct radeon_state_atom cl;
+	struct radeon_state_atom gb;
+	struct radeon_state_atom ucp;
+	struct radeon_state_atom su;
+	struct radeon_state_atom poly;
+	struct radeon_state_atom cb;
+	struct radeon_state_atom clrcmp;
+	struct radeon_state_atom blnd;
+	struct radeon_state_atom blnd_clr;
+	struct radeon_state_atom cb_target;
+	struct radeon_state_atom sx;
+	struct radeon_state_atom vgt;
+	struct radeon_state_atom spi;
+	struct radeon_state_atom vpt;
+
+	struct radeon_state_atom fs;
+	struct radeon_state_atom vs;
+	struct radeon_state_atom ps;
+
+	struct radeon_state_atom vs_consts;
+	struct radeon_state_atom ps_consts;
+
+	struct radeon_state_atom vtx;
+	struct radeon_state_atom tx;
+	struct radeon_state_atom tx_smplr;
+	struct radeon_state_atom tx_brdr_clr;
+};
+
+/**
+ * \brief R600 context structure.
+ */
+struct r600_context {
+	struct radeon_context radeon;	/* parent class, must be first */
+
+	/* ------ */
+	R700_CHIP_CONTEXT hw;	/* reached through R700_CONTEXT_STATES() */
+
+	struct r600_hw_state atoms;	/* per-atom dirty flags, set by R600_STATECHANGE() */
+
+	struct r700_vertex_program *selected_vp;
+
+	/* Vertex buffers
+	 */
+	GLvector4f dummy_attrib[_TNL_ATTRIB_MAX];
+	GLvector4f *temp_attrib[_TNL_ATTRIB_MAX];
+
+};
+/* Context casts; the argument is parenthesized so any pointer expression (e.g. &x, a ? b : c) expands correctly. */
+#define R700_CONTEXT(ctx)		((context_t *)((ctx)->DriverCtx))
+#define GL_CONTEXT(context)     ((GLcontext *)((context)->radeon.glCtx))
+
+extern GLboolean r600CreateContext(const __GLcontextModes * glVisual,
+				   __DRIcontextPrivate * driContextPriv,
+				   void *sharedContextPrivate);
+/* Address of the context's R700_CHIP_CONTEXT member 'hw'; argument parenthesized for safe expansion. */
+#define R700_CONTEXT_STATES(context) ((R700_CHIP_CONTEXT *)(&(context)->hw))
+/* Call the dma.flush hook if one is set. Note: rmesa is evaluated more than once. */
+#define R600_NEWPRIM( rmesa )			\
+do {						\
+	if ( (rmesa)->radeon.dma.flush )			\
+		(rmesa)->radeon.dma.flush( (rmesa)->radeon.glCtx );	\
+} while (0)
+/* Run R600_NEWPRIM, then mark atom ATOM and the hw state dirty. Note: r600 is evaluated more than once. */
+#define R600_STATECHANGE(r600, ATOM)			\
+do {							\
+	R600_NEWPRIM(r600);					\
+	(r600)->atoms.ATOM.dirty = GL_TRUE;					\
+	(r600)->radeon.hw.is_dirty = GL_TRUE;			\
+} while(0)
+
+extern GLboolean r700SyncSurf(context_t *context,
+			      struct radeon_bo *pbo,
+			      uint32_t read_domain,
+			      uint32_t write_domain,
+			      uint32_t sync_type);
+
+extern void r700SetupStreams(GLcontext * ctx);
+extern void r700Start3D(context_t *context);
+extern void r600InitAtoms(context_t *context);
+
+#define RADEON_D_CAPTURE 0
+#define RADEON_D_PLAYBACK 1
+#define RADEON_D_PLAYBACK_RAW 2
+#define RADEON_D_T 3
+
+#define r600PackFloat32 radeonPackFloat32
+#define r600PackFloat24 radeonPackFloat24
+
+#endif				/* __R600_CONTEXT_H__ */
diff --git a/src/mesa/drivers/dri/r600/r600_emit.c b/src/mesa/drivers/dri/r600/r600_emit.c
new file mode 100644
index 0000000000..5c250c2418
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_emit.c
@@ -0,0 +1,117 @@
+/**************************************************************************
+
+Copyright 2008, 2009 Advanced Micro Devices Inc. (AMD)
+
+Copyright (C) Advanced Micro Devices Inc. (AMD)  2009.  All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ *   CooperYuan <cooper.yuan@amd.com>, <cooperyuan@gmail.com>
+ */
+
+#include "main/glheader.h"
+#include "main/mtypes.h"
+#include "main/colormac.h"
+#include "main/imports.h"
+#include "main/macros.h"
+#include "main/image.h"
+
+#include "swrast_setup/swrast_setup.h"
+#include "math/m_translate.h"
+#include "tnl/tnl.h"
+#include "tnl/t_context.h"
+
+#include "r600_context.h"
+#include "r600_emit.h"
+
+void r600EmitCacheFlush(context_t *rmesa)
+{	/* empty body: this function currently emits nothing */
+}
+/* Allocate a GTT buffer object, copy sizeinDWORD dwords from data into it and return it via *shaderbo. */
+GLboolean r600EmitShader(GLcontext * ctx,
+                         void ** shaderbo,
+			 GLvoid * data,
+                         int sizeinDWORD,
+                         char * szShaderUsage)	/* szShaderUsage is only used in the debug print below */
+{
+	radeonContextPtr radeonctx = RADEON_CONTEXT(ctx);
+	struct radeon_bo * pbo;
+	uint32_t *out;
+shader_again_alloc:	/* re-entered after flushing the command buffer when allocation fails */
+	pbo = radeon_bo_open(radeonctx->radeonScreen->bom,
+			0,
+			sizeinDWORD * 4,	/* size in bytes (4 bytes per dword) */
+			256,	/* NOTE(review): presumably the alignment -- confirm against radeon_bo_open() */
+			RADEON_GEM_DOMAIN_GTT,
+			0);
+
+	radeon_print(RADEON_SHADER, RADEON_NORMAL, "%s %p size %d: %s\n", __func__, pbo, sizeinDWORD, szShaderUsage);
+
+	if (!pbo) {
+		radeon_print(RADEON_MEMORY | RADEON_CS, RADEON_IMPORTANT, "No memory for buffer object. Flushing command buffer.\n");
+		rcommonFlushCmdBuf(radeonctx, __FUNCTION__);
+		goto shader_again_alloc;	/* NOTE(review): no retry limit -- loops for as long as allocation keeps failing */
+	}
+
+	radeon_cs_space_add_persistent_bo(radeonctx->cmdbuf.cs,
+			pbo,
+			RADEON_GEM_DOMAIN_GTT, 0);
+
+	if (radeon_cs_space_check_with_bo(radeonctx->cmdbuf.cs,
+				pbo,
+				RADEON_GEM_DOMAIN_GTT, 0)) {
+		radeon_error("failure to revalidate BOs - badness\n");
+		return GL_FALSE;	/* NOTE(review): pbo is not released and *shaderbo is not written on this path -- confirm */
+	}
+
+	radeon_bo_map(pbo, 1);	/* NOTE(review): return value is not checked before pbo->ptr is used */
+
+	out = (uint32_t*)(pbo->ptr);
+
+	memcpy(out, data, sizeinDWORD * 4);	/* copy the shader into the mapped buffer object */
+
+	radeon_bo_unmap(pbo);
+
+	*shaderbo = (void*)pbo;	/* handed back as an opaque pointer; r600DeleteShader() casts it back */
+
+	return GL_TRUE;
+}
+/* Unmap (when pbo->ptr is set) and unreference the given shader buffer object; always returns GL_TRUE. */
+GLboolean r600DeleteShader(GLcontext * ctx,
+                           void * shaderbo)
+{
+    struct radeon_bo * pbo = (struct radeon_bo *)shaderbo;
+
+    radeon_print(RADEON_SHADER, RADEON_NORMAL, "%s: %p\n", __func__, pbo);
+
+    if (pbo) {	/* a NULL shaderbo is accepted and ignored */
+	    if (pbo->ptr)	/* NOTE(review): a non-NULL ptr is taken to mean "still mapped" -- confirm */
+		radeon_bo_unmap(pbo);
+	    radeon_bo_unref(pbo); /* when bo->cref <= 0, bo will be bo_free */
+    }
+
+    return GL_TRUE;
+}
diff --git a/src/mesa/drivers/dri/r200/r200_span.h b/src/mesa/drivers/dri/r600/r600_emit.h
index bae5644309..661774d11e 100644
--- a/src/mesa/drivers/dri/r200/r200_span.h
+++ b/src/mesa/drivers/dri/r600/r600_emit.h
@@ -1,9 +1,8 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+/**************************************************************************
+
+Copyright 2008, 2009 Advanced Micro Devices Inc. (AMD)
 
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
+Copyright (C) Advanced Micro Devices Inc. (AMD)  2009.  All Rights Reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
@@ -29,17 +28,28 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 /*
  * Authors:
- *   Keith Whitwell <keith@tungstengraphics.com>
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ *   CooperYuan <cooper.yuan@amd.com>, <cooperyuan@gmail.com>
  */
 
-#ifndef __R200_SPAN_H__
-#define __R200_SPAN_H__
 
-#include "drirenderbuffer.h"
+#ifndef __R600_EMIT_H__
+#define __R600_EMIT_H__
+
+#include "main/glheader.h"
+#include "r600_context.h"
+#include "r600_cmdbuf.h"
+#include "radeon_reg.h"
+
+void r600EmitCacheFlush(context_t *rmesa);
 
-extern void r200InitSpanFuncs( GLcontext *ctx );
+extern GLboolean r600EmitShader(GLcontext * ctx, 
+                                void ** shaderbo,
+			                    GLvoid * data, 
+                                int sizeinDWORD,
+                                char * szShaderUsage); 
 
-extern void
-radeonSetSpanFunctions(driRenderbuffer *rb, const GLvisual *vis);
+extern GLboolean r600DeleteShader(GLcontext * ctx, 
+                                 void * shaderbo);
 
 #endif
diff --git a/src/mesa/drivers/dri/r600/r600_reg.h b/src/mesa/drivers/dri/r600/r600_reg.h
new file mode 100644
index 0000000000..ffe5ee4f74
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_reg.h
@@ -0,0 +1,121 @@
+/*
+ * RadeonHD R6xx, R7xx Register documentation
+ *
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ * Copyright (C) 2008-2009  Matthias Hopf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _R600_REG_H_
+#define _R600_REG_H_
+
+/*
+ * Register definitions
+ */
+
+#include "r600_reg_auto_r6xx.h"
+#include "r600_reg_r6xx.h"
+#include "r600_reg_r7xx.h"
+
+
+/* SET_*_REG offsets + ends; from SET_RESOURCE onwards each range's end equals the next range's offset */
+enum 
+{
+    SET_CONFIG_REG_offset                = 0x00008000,
+    SET_CONFIG_REG_end                   = 0x0000ac00,
+    SET_CONTEXT_REG_offset               = 0x00028000,
+    SET_CONTEXT_REG_end                  = 0x00029000,
+    SET_ALU_CONST_offset                 = 0x00030000,
+    SET_ALU_CONST_end                    = 0x00032000,
+    SET_RESOURCE_offset                  = 0x00038000,
+    SET_RESOURCE_end                     = 0x0003c000,
+    SET_SAMPLER_offset                   = 0x0003c000,
+    SET_SAMPLER_end                      = 0x0003cff0,
+    SET_CTL_CONST_offset                 = 0x0003cff0,
+    SET_CTL_CONST_end                    = 0x0003e200,
+    SET_LOOP_CONST_offset                = 0x0003e200,
+    SET_LOOP_CONST_end                   = 0x0003e380,
+    SET_BOOL_CONST_offset                = 0x0003e380,
+    SET_BOOL_CONST_end                   = 0x00040000,
+};
+
+/* packet3 IT_SURFACE_BASE_UPDATE bits */
+enum 
+{
+    DEPTH_BASE                           = (1 << 0),
+    COLOR0_BASE                          = (1 << 1),
+    COLOR1_BASE                          = (1 << 2),
+    COLOR2_BASE                          = (1 << 3),
+    COLOR3_BASE                          = (1 << 4),
+    COLOR4_BASE                          = (1 << 5),
+    COLOR5_BASE                          = (1 << 6),
+    COLOR6_BASE                          = (1 << 7),
+    COLOR7_BASE                          = (1 << 8),
+    STRMOUT_BASE0                        = (1 << 9),
+    STRMOUT_BASE1                        = (1 << 10),
+    STRMOUT_BASE2                        = (1 << 11),
+    STRMOUT_BASE3                        = (1 << 12),
+    COHER_BASE0                          = (1 << 13),
+    COHER_BASE1                          = (1 << 14),
+};
+
+/* Packet3 commands */
+enum 
+{
+    IT_NOP                               = 0x10,
+    IT_INDIRECT_BUFFER_END               = 0x17,
+    IT_SET_PREDICATION                   = 0x20,
+    IT_REG_RMW                           = 0x21,
+    IT_COND_EXEC                         = 0x22,
+    IT_PRED_EXEC                         = 0x23,
+    IT_START_3D_CMDBUF                   = 0x24,
+    IT_DRAW_INDEX_2                      = 0x27,
+    IT_CONTEXT_CONTROL                   = 0x28,
+    IT_DRAW_INDEX_IMMD_BE                = 0x29,
+    IT_INDEX_TYPE                        = 0x2A,
+    IT_DRAW_INDEX                        = 0x2B,
+    IT_DRAW_INDEX_AUTO                   = 0x2D,
+    IT_DRAW_INDEX_IMMD                   = 0x2E,
+    IT_NUM_INSTANCES                     = 0x2F,
+    IT_STRMOUT_BUFFER_UPDATE             = 0x34,
+    IT_INDIRECT_BUFFER_MP                = 0x38,
+    IT_MEM_SEMAPHORE                     = 0x39,
+    IT_MPEG_INDEX                        = 0x3A,
+    IT_WAIT_REG_MEM                      = 0x3C,
+    IT_MEM_WRITE                         = 0x3D,
+    IT_INDIRECT_BUFFER                   = 0x32,
+    IT_CP_INTERRUPT                      = 0x40,
+    IT_SURFACE_SYNC                      = 0x43,
+    IT_ME_INITIALIZE                     = 0x44,
+    IT_COND_WRITE                        = 0x45,
+    IT_EVENT_WRITE                       = 0x46,
+    IT_EVENT_WRITE_EOP                   = 0x47,
+    IT_ONE_REG_WRITE                     = 0x57,
+    IT_SET_CONFIG_REG                    = 0x68,
+    IT_SET_CONTEXT_REG                   = 0x69,
+    IT_SET_ALU_CONST                     = 0x6A,
+    IT_SET_BOOL_CONST                    = 0x6B,
+    IT_SET_LOOP_CONST                    = 0x6C,
+    IT_SET_RESOURCE                      = 0x6D,
+    IT_SET_SAMPLER                       = 0x6E,
+    IT_SET_CTL_CONST                     = 0x6F,
+    IT_SURFACE_BASE_UPDATE               = 0x73,
+};
+
+#endif
diff --git a/src/mesa/drivers/dri/r600/r600_reg_auto_r6xx.h b/src/mesa/drivers/dri/r600/r600_reg_auto_r6xx.h
new file mode 100644
index 0000000000..9d5aa3c7e4
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_reg_auto_r6xx.h
@@ -0,0 +1,3087 @@
+/*
+ * RadeonHD R6xx, R7xx Register documentation
+ *
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ * Copyright (C) 2008-2009  Matthias Hopf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _AUTOREGS
+#define _AUTOREGS
+
+enum {
+
+    VGT_VTX_VECT_EJECT_REG                                = 0x000088b0,
+	PRIM_COUNT_mask                                   = 0x3ff << 0,
+	PRIM_COUNT_shift                                  = 0,
+    VGT_LAST_COPY_STATE                                   = 0x000088c0,
+	SRC_STATE_ID_mask                                 = 0x07 << 0,
+	SRC_STATE_ID_shift                                = 0,
+	DST_STATE_ID_mask                                 = 0x07 << 16,
+	DST_STATE_ID_shift                                = 16,
+    VGT_CACHE_INVALIDATION                                = 0x000088c4,
+	CACHE_INVALIDATION_mask                           = 0x03 << 0,
+	CACHE_INVALIDATION_shift                          = 0,
+	    VC_ONLY                                       = 0x00,
+	    TC_ONLY                                       = 0x01,
+	    VC_AND_TC                                     = 0x02,
+	VS_NO_EXTRA_BUFFER_bit                            = 1 << 5,
+    VGT_GS_PER_ES                                         = 0x000088c8,
+    VGT_ES_PER_GS                                         = 0x000088cc,
+    VGT_GS_VERTEX_REUSE                                   = 0x000088d4,
+	VERT_REUSE_mask                                   = 0x1f << 0,
+	VERT_REUSE_shift                                  = 0,
+    VGT_MC_LAT_CNTL                                       = 0x000088d8,
+	MC_TIME_STAMP_RES_mask                            = 0x03 << 0,
+	MC_TIME_STAMP_RES_shift                           = 0,
+	    X_0_992_MAX_LATENCY                           = 0x00,
+	    X_0_496_MAX_LATENCY                           = 0x01,
+	    X_0_248_MAX_LATENCY                           = 0x02,
+	    X_0_124_MAX_LATENCY                           = 0x03,
+    VGT_GS_PER_VS                                         = 0x000088e8,
+	GS_PER_VS_mask                                    = 0x0f << 0,
+	GS_PER_VS_shift                                   = 0,
+    VGT_CNTL_STATUS                                       = 0x000088f0,
+	VGT_OUT_INDX_BUSY_bit                             = 1 << 0,
+	VGT_OUT_BUSY_bit                                  = 1 << 1,
+	VGT_PT_BUSY_bit                                   = 1 << 2,
+	VGT_TE_BUSY_bit                                   = 1 << 3,
+	VGT_VR_BUSY_bit                                   = 1 << 4,
+	VGT_GRP_BUSY_bit                                  = 1 << 5,
+	VGT_DMA_REQ_BUSY_bit                              = 1 << 6,
+	VGT_DMA_BUSY_bit                                  = 1 << 7,
+	VGT_GS_BUSY_bit                                   = 1 << 8,
+	VGT_BUSY_bit                                      = 1 << 9,
+    VGT_PRIMITIVE_TYPE                                    = 0x00008958,
+	VGT_PRIMITIVE_TYPE__PRIM_TYPE_mask                = 0x3f << 0,
+	VGT_PRIMITIVE_TYPE__PRIM_TYPE_shift               = 0,
+	    DI_PT_NONE                                    = 0x00,
+	    DI_PT_POINTLIST                               = 0x01,
+	    DI_PT_LINELIST                                = 0x02,
+	    DI_PT_LINESTRIP                               = 0x03,
+	    DI_PT_TRILIST                                 = 0x04,
+	    DI_PT_TRIFAN                                  = 0x05,
+	    DI_PT_TRISTRIP                                = 0x06,
+	    DI_PT_UNUSED_0                                = 0x07,
+	    DI_PT_UNUSED_1                                = 0x08,
+	    DI_PT_UNUSED_2                                = 0x09,
+	    DI_PT_LINELIST_ADJ                            = 0x0a,
+	    DI_PT_LINESTRIP_ADJ                           = 0x0b,
+	    DI_PT_TRILIST_ADJ                             = 0x0c,
+	    DI_PT_TRISTRIP_ADJ                            = 0x0d,
+	    DI_PT_UNUSED_3                                = 0x0e,
+	    DI_PT_UNUSED_4                                = 0x0f,
+	    DI_PT_TRI_WITH_WFLAGS                         = 0x10,
+	    DI_PT_RECTLIST                                = 0x11,
+	    DI_PT_LINELOOP                                = 0x12,
+	    DI_PT_QUADLIST                                = 0x13,
+	    DI_PT_QUADSTRIP                               = 0x14,
+	    DI_PT_POLYGON                                 = 0x15,
+	    DI_PT_2D_COPY_RECT_LIST_V0                    = 0x16,
+	    DI_PT_2D_COPY_RECT_LIST_V1                    = 0x17,
+	    DI_PT_2D_COPY_RECT_LIST_V2                    = 0x18,
+	    DI_PT_2D_COPY_RECT_LIST_V3                    = 0x19,
+	    DI_PT_2D_FILL_RECT_LIST                       = 0x1a,
+	    DI_PT_2D_LINE_STRIP                           = 0x1b,
+	    DI_PT_2D_TRI_STRIP                            = 0x1c,
+    VGT_INDEX_TYPE                                        = 0x0000895c,
+	INDEX_TYPE_mask                                   = 0x03 << 0,
+	INDEX_TYPE_shift                                  = 0,
+	    DI_INDEX_SIZE_16_BIT                          = 0x00,
+	    DI_INDEX_SIZE_32_BIT                          = 0x01,
+    VGT_STRMOUT_BUFFER_FILLED_SIZE_0                      = 0x00008960,
+    VGT_STRMOUT_BUFFER_FILLED_SIZE_1                      = 0x00008964,
+    VGT_STRMOUT_BUFFER_FILLED_SIZE_2                      = 0x00008968,
+    VGT_STRMOUT_BUFFER_FILLED_SIZE_3                      = 0x0000896c,
+    VGT_NUM_INDICES                                       = 0x00008970,
+    VGT_NUM_INSTANCES                                     = 0x00008974,
+    PA_CL_CNTL_STATUS                                     = 0x00008a10,
+	CL_BUSY_bit                                       = 1 << 31,
+    PA_CL_ENHANCE                                         = 0x00008a14,
+	CLIP_VTX_REORDER_ENA_bit                          = 1 << 0,
+	NUM_CLIP_SEQ_mask                                 = 0x03 << 1,
+	NUM_CLIP_SEQ_shift                                = 1,
+	CLIPPED_PRIM_SEQ_STALL_bit                        = 1 << 3,
+	VE_NAN_PROC_DISABLE_bit                           = 1 << 4,
+    PA_SU_CNTL_STATUS                                     = 0x00008a50,
+	SU_BUSY_bit                                       = 1 << 31,
+    PA_SC_LINE_STIPPLE_STATE                              = 0x00008b10,
+	CURRENT_PTR_mask                                  = 0x0f << 0,
+	CURRENT_PTR_shift                                 = 0,
+	CURRENT_COUNT_mask                                = 0xff << 8,
+	CURRENT_COUNT_shift                               = 8,
+    PA_SC_MULTI_CHIP_CNTL                                 = 0x00008b20,
+	LOG2_NUM_CHIPS_mask                               = 0x07 << 0,
+	LOG2_NUM_CHIPS_shift                              = 0,
+	MULTI_CHIP_TILE_SIZE_mask                         = 0x03 << 3,
+	MULTI_CHIP_TILE_SIZE_shift                        = 3,
+	    X_16_X_16_PIXEL_TILE_PER_CHIP                 = 0x00,
+	    X_32_X_32_PIXEL_TILE_PER_CHIP                 = 0x01,
+	    X_64_X_64_PIXEL_TILE_PER_CHIP                 = 0x02,
+	    X_128X128_PIXEL_TILE_PER_CHIP                 = 0x03,
+	CHIP_TILE_X_LOC_mask                              = 0x07 << 5,
+	CHIP_TILE_X_LOC_shift                             = 5,
+	CHIP_TILE_Y_LOC_mask                              = 0x07 << 8,
+	CHIP_TILE_Y_LOC_shift                             = 8,
+	CHIP_SUPER_TILE_B_bit                             = 1 << 11,
+    PA_SC_AA_SAMPLE_LOCS_2S                               = 0x00008b40,
+	S0_X_mask                                         = 0x0f << 0,
+	S0_X_shift                                        = 0,
+	S0_Y_mask                                         = 0x0f << 4,
+	S0_Y_shift                                        = 4,
+	S1_X_mask                                         = 0x0f << 8,
+	S1_X_shift                                        = 8,
+	S1_Y_mask                                         = 0x0f << 12,
+	S1_Y_shift                                        = 12,
+    PA_SC_AA_SAMPLE_LOCS_4S                               = 0x00008b44,
+/* 	S0_X_mask                                         = 0x0f << 0, */
+/* 	S0_X_shift                                        = 0, */
+/* 	S0_Y_mask                                         = 0x0f << 4, */
+/* 	S0_Y_shift                                        = 4, */
+/* 	S1_X_mask                                         = 0x0f << 8, */
+/* 	S1_X_shift                                        = 8, */
+/* 	S1_Y_mask                                         = 0x0f << 12, */
+/* 	S1_Y_shift                                        = 12, */
+	S2_X_mask                                         = 0x0f << 16,
+	S2_X_shift                                        = 16,
+	S2_Y_mask                                         = 0x0f << 20,
+	S2_Y_shift                                        = 20,
+	S3_X_mask                                         = 0x0f << 24,
+	S3_X_shift                                        = 24,
+	S3_Y_mask                                         = 0x0f << 28,
+	S3_Y_shift                                        = 28,
+    PA_SC_AA_SAMPLE_LOCS_8S_WD0                           = 0x00008b48,
+/* 	S0_X_mask                                         = 0x0f << 0, */
+/* 	S0_X_shift                                        = 0, */
+/* 	S0_Y_mask                                         = 0x0f << 4, */
+/* 	S0_Y_shift                                        = 4, */
+/* 	S1_X_mask                                         = 0x0f << 8, */
+/* 	S1_X_shift                                        = 8, */
+/* 	S1_Y_mask                                         = 0x0f << 12, */
+/* 	S1_Y_shift                                        = 12, */
+/* 	S2_X_mask                                         = 0x0f << 16, */
+/* 	S2_X_shift                                        = 16, */
+/* 	S2_Y_mask                                         = 0x0f << 20, */
+/* 	S2_Y_shift                                        = 20, */
+/* 	S3_X_mask                                         = 0x0f << 24, */
+/* 	S3_X_shift                                        = 24, */
+/* 	S3_Y_mask                                         = 0x0f << 28, */
+/* 	S3_Y_shift                                        = 28, */
+    PA_SC_AA_SAMPLE_LOCS_8S_WD1                           = 0x00008b4c,
+	S4_X_mask                                         = 0x0f << 0,
+	S4_X_shift                                        = 0,
+	S4_Y_mask                                         = 0x0f << 4,
+	S4_Y_shift                                        = 4,
+	S5_X_mask                                         = 0x0f << 8,
+	S5_X_shift                                        = 8,
+	S5_Y_mask                                         = 0x0f << 12,
+	S5_Y_shift                                        = 12,
+	S6_X_mask                                         = 0x0f << 16,
+	S6_X_shift                                        = 16,
+	S6_Y_mask                                         = 0x0f << 20,
+	S6_Y_shift                                        = 20,
+	S7_X_mask                                         = 0x0f << 24,
+	S7_X_shift                                        = 24,
+	S7_Y_mask                                         = 0x0f << 28,
+	S7_Y_shift                                        = 28,
+    PA_SC_CNTL_STATUS                                     = 0x00008be0,
+	MPASS_OVERFLOW_bit                                = 1 << 30,
+    PA_SC_ENHANCE                                         = 0x00008bf0,
+	FORCE_EOV_MAX_CLK_CNT_mask                        = 0xfff << 0,
+	FORCE_EOV_MAX_CLK_CNT_shift                       = 0,
+	FORCE_EOV_MAX_TILE_CNT_mask                       = 0xfff << 12,
+	FORCE_EOV_MAX_TILE_CNT_shift                      = 12,
+    SQ_CONFIG                                             = 0x00008c00,
+	VC_ENABLE_bit                                     = 1 << 0,
+	EXPORT_SRC_C_bit                                  = 1 << 1,
+	DX9_CONSTS_bit                                    = 1 << 2,
+	ALU_INST_PREFER_VECTOR_bit                        = 1 << 3,
+	SQ_CONFIG__DX10_CLAMP_bit                         = 1 << 4,
+	ALU_PREFER_ONE_WATERFALL_bit                      = 1 << 5,
+	ALU_MAX_ONE_WATERFALL_bit                         = 1 << 6,
+	CLAUSE_SEQ_PRIO_mask                              = 0x03 << 8,
+	CLAUSE_SEQ_PRIO_shift                             = 8,
+	    SQ_CL_PRIO_RND_ROBIN                          = 0x00,
+	    SQ_CL_PRIO_MACRO_SEQ                          = 0x01,
+	    SQ_CL_PRIO_NONE                               = 0x02,
+	PS_PRIO_mask                                      = 0x03 << 24,
+	PS_PRIO_shift                                     = 24,
+	VS_PRIO_mask                                      = 0x03 << 26,
+	VS_PRIO_shift                                     = 26,
+	GS_PRIO_mask                                      = 0x03 << 28,
+	GS_PRIO_shift                                     = 28,
+	ES_PRIO_mask                                      = 0x03 << 30,
+	ES_PRIO_shift                                     = 30,
+    SQ_GPR_RESOURCE_MGMT_1                                = 0x00008c04,
+	NUM_PS_GPRS_mask                                  = 0xff << 0,
+	NUM_PS_GPRS_shift                                 = 0,
+	NUM_VS_GPRS_mask                                  = 0xff << 16,
+	NUM_VS_GPRS_shift                                 = 16,
+	NUM_CLAUSE_TEMP_GPRS_mask                         = 0x0f << 28,
+	NUM_CLAUSE_TEMP_GPRS_shift                        = 28,
+    SQ_GPR_RESOURCE_MGMT_2                                = 0x00008c08,
+	NUM_GS_GPRS_mask                                  = 0xff << 0,
+	NUM_GS_GPRS_shift                                 = 0,
+	NUM_ES_GPRS_mask                                  = 0xff << 16,
+	NUM_ES_GPRS_shift                                 = 16,
+    SQ_THREAD_RESOURCE_MGMT                               = 0x00008c0c,
+	NUM_PS_THREADS_mask                               = 0xff << 0,
+	NUM_PS_THREADS_shift                              = 0,
+	NUM_VS_THREADS_mask                               = 0xff << 8,
+	NUM_VS_THREADS_shift                              = 8,
+	NUM_GS_THREADS_mask                               = 0xff << 16,
+	NUM_GS_THREADS_shift                              = 16,
+	NUM_ES_THREADS_mask                               = 0xff << 24,
+	NUM_ES_THREADS_shift                              = 24,
+    SQ_STACK_RESOURCE_MGMT_1                              = 0x00008c10,
+	NUM_PS_STACK_ENTRIES_mask                         = 0xfff << 0,
+	NUM_PS_STACK_ENTRIES_shift                        = 0,
+	NUM_VS_STACK_ENTRIES_mask                         = 0xfff << 16,
+	NUM_VS_STACK_ENTRIES_shift                        = 16,
+    SQ_STACK_RESOURCE_MGMT_2                              = 0x00008c14,
+	NUM_GS_STACK_ENTRIES_mask                         = 0xfff << 0,
+	NUM_GS_STACK_ENTRIES_shift                        = 0,
+	NUM_ES_STACK_ENTRIES_mask                         = 0xfff << 16,
+	NUM_ES_STACK_ENTRIES_shift                        = 16,
+    SQ_ESGS_RING_BASE                                     = 0x00008c40,
+    SQ_ESGS_RING_SIZE                                     = 0x00008c44,
+    SQ_GSVS_RING_BASE                                     = 0x00008c48,
+    SQ_GSVS_RING_SIZE                                     = 0x00008c4c,
+    SQ_ESTMP_RING_BASE                                    = 0x00008c50,
+    SQ_ESTMP_RING_SIZE                                    = 0x00008c54,
+    SQ_GSTMP_RING_BASE                                    = 0x00008c58,
+    SQ_GSTMP_RING_SIZE                                    = 0x00008c5c,
+    SQ_VSTMP_RING_BASE                                    = 0x00008c60,
+    SQ_VSTMP_RING_SIZE                                    = 0x00008c64,
+    SQ_PSTMP_RING_BASE                                    = 0x00008c68,
+    SQ_PSTMP_RING_SIZE                                    = 0x00008c6c,
+    SQ_FBUF_RING_BASE                                     = 0x00008c70,
+    SQ_FBUF_RING_SIZE                                     = 0x00008c74,
+    SQ_REDUC_RING_BASE                                    = 0x00008c78,
+    SQ_REDUC_RING_SIZE                                    = 0x00008c7c,
+    SQ_ALU_WORD1_OP3                                      = 0x00008dfc,
+	SRC2_SEL_mask                                     = 0x1ff << 0,
+	SRC2_SEL_shift                                    = 0,
+	    SQ_ALU_SRC_0                                  = 0xf8,
+	    SQ_ALU_SRC_1                                  = 0xf9,
+	    SQ_ALU_SRC_1_INT                              = 0xfa,
+	    SQ_ALU_SRC_M_1_INT                            = 0xfb,
+	    SQ_ALU_SRC_0_5                                = 0xfc,
+	    SQ_ALU_SRC_LITERAL                            = 0xfd,
+	    SQ_ALU_SRC_PV                                 = 0xfe,
+	    SQ_ALU_SRC_PS                                 = 0xff,
+	SRC2_REL_bit                                      = 1 << 9,
+	SRC2_CHAN_mask                                    = 0x03 << 10,
+	SRC2_CHAN_shift                                   = 10,
+	    SQ_CHAN_X                                     = 0x00,
+	    SQ_CHAN_Y                                     = 0x01,
+	    SQ_CHAN_Z                                     = 0x02,
+	    SQ_CHAN_W                                     = 0x03,
+	SRC2_NEG_bit                                      = 1 << 12,
+	SQ_ALU_WORD1_OP3__ALU_INST_mask                   = 0x1f << 13,
+	SQ_ALU_WORD1_OP3__ALU_INST_shift                  = 13,
+	    SQ_OP3_INST_MUL_LIT                           = 0x0c,
+	    SQ_OP3_INST_MUL_LIT_M2                        = 0x0d,
+	    SQ_OP3_INST_MUL_LIT_M4                        = 0x0e,
+	    SQ_OP3_INST_MUL_LIT_D2                        = 0x0f,
+	    SQ_OP3_INST_MULADD                            = 0x10,
+	    SQ_OP3_INST_MULADD_M2                         = 0x11,
+	    SQ_OP3_INST_MULADD_M4                         = 0x12,
+	    SQ_OP3_INST_MULADD_D2                         = 0x13,
+	    SQ_OP3_INST_MULADD_IEEE                       = 0x14,
+	    SQ_OP3_INST_MULADD_IEEE_M2                    = 0x15,
+	    SQ_OP3_INST_MULADD_IEEE_M4                    = 0x16,
+	    SQ_OP3_INST_MULADD_IEEE_D2                    = 0x17,
+	    SQ_OP3_INST_CNDE                              = 0x18,
+	    SQ_OP3_INST_CNDGT                             = 0x19,
+	    SQ_OP3_INST_CNDGE                             = 0x1a,
+	    SQ_OP3_INST_CNDE_INT                          = 0x1c,
+	    SQ_OP3_INST_CNDGT_INT                         = 0x1d,
+	    SQ_OP3_INST_CNDGE_INT                         = 0x1e,
+    SQ_TEX_WORD2                                          = 0x00008dfc,
+	OFFSET_X_mask                                     = 0x1f << 0,
+	OFFSET_X_shift                                    = 0,
+	OFFSET_Y_mask                                     = 0x1f << 5,
+	OFFSET_Y_shift                                    = 5,
+	OFFSET_Z_mask                                     = 0x1f << 10,
+	OFFSET_Z_shift                                    = 10,
+	SAMPLER_ID_mask                                   = 0x1f << 15,
+	SAMPLER_ID_shift                                  = 15,
+	SQ_TEX_WORD2__SRC_SEL_X_mask                      = 0x07 << 20,
+	SQ_TEX_WORD2__SRC_SEL_X_shift                     = 20,
+	    SQ_SEL_X                                      = 0x00,
+	    SQ_SEL_Y                                      = 0x01,
+	    SQ_SEL_Z                                      = 0x02,
+	    SQ_SEL_W                                      = 0x03,
+	    SQ_SEL_0                                      = 0x04,
+	    SQ_SEL_1                                      = 0x05,
+	SRC_SEL_Y_mask                                    = 0x07 << 23,
+	SRC_SEL_Y_shift                                   = 23,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	SRC_SEL_Z_mask                                    = 0x07 << 26,
+	SRC_SEL_Z_shift                                   = 26,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	SRC_SEL_W_mask                                    = 0x07 << 29,
+	SRC_SEL_W_shift                                   = 29,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+    SQ_CF_ALLOC_EXPORT_WORD1                              = 0x00008dfc,
+	BURST_COUNT_mask                                  = 0x0f << 17,
+	BURST_COUNT_shift                                 = 17,
+	END_OF_PROGRAM_bit                                = 1 << 21,
+	VALID_PIXEL_MODE_bit                              = 1 << 22,
+	SQ_CF_ALLOC_EXPORT_WORD1__CF_INST_mask            = 0x7f << 23,
+	SQ_CF_ALLOC_EXPORT_WORD1__CF_INST_shift           = 23,
+	    SQ_CF_INST_MEM_STREAM0                        = 0x20,
+	    SQ_CF_INST_MEM_STREAM1                        = 0x21,
+	    SQ_CF_INST_MEM_STREAM2                        = 0x22,
+	    SQ_CF_INST_MEM_STREAM3                        = 0x23,
+	    SQ_CF_INST_MEM_SCRATCH                        = 0x24,
+	    SQ_CF_INST_MEM_REDUCTION                      = 0x25,
+	    SQ_CF_INST_MEM_RING                           = 0x26,
+	    SQ_CF_INST_EXPORT                             = 0x27,
+	    SQ_CF_INST_EXPORT_DONE                        = 0x28,
+	WHOLE_QUAD_MODE_bit                               = 1 << 30,
+	BARRIER_bit                                       = 1 << 31,
+    SQ_CF_ALU_WORD1                                       = 0x00008dfc,
+	KCACHE_MODE1_mask                                 = 0x03 << 0,
+	KCACHE_MODE1_shift                                = 0,
+	    SQ_CF_KCACHE_NOP                              = 0x00,
+	    SQ_CF_KCACHE_LOCK_1                           = 0x01,
+	    SQ_CF_KCACHE_LOCK_2                           = 0x02,
+	    SQ_CF_KCACHE_LOCK_LOOP_INDEX                  = 0x03,
+	KCACHE_ADDR0_mask                                 = 0xff << 2,
+	KCACHE_ADDR0_shift                                = 2,
+	KCACHE_ADDR1_mask                                 = 0xff << 10,
+	KCACHE_ADDR1_shift                                = 10,
+	SQ_CF_ALU_WORD1__COUNT_mask                       = 0x7f << 18,
+	SQ_CF_ALU_WORD1__COUNT_shift                      = 18,
+	SQ_CF_ALU_WORD1__ALT_CONST_bit                    = 1 << 25,
+	SQ_CF_ALU_WORD1__CF_INST_mask                     = 0x0f << 26,
+	SQ_CF_ALU_WORD1__CF_INST_shift                    = 26,
+	    SQ_CF_INST_ALU                                = 0x08,
+	    SQ_CF_INST_ALU_PUSH_BEFORE                    = 0x09,
+	    SQ_CF_INST_ALU_POP_AFTER                      = 0x0a,
+	    SQ_CF_INST_ALU_POP2_AFTER                     = 0x0b,
+	    SQ_CF_INST_ALU_CONTINUE                       = 0x0d,
+	    SQ_CF_INST_ALU_BREAK                          = 0x0e,
+	    SQ_CF_INST_ALU_ELSE_AFTER                     = 0x0f,
+/* 	WHOLE_QUAD_MODE_bit                               = 1 << 30, */
+/* 	BARRIER_bit                                       = 1 << 31, */
+    SQ_TEX_WORD1                                          = 0x00008dfc,
+	SQ_TEX_WORD1__DST_GPR_mask                        = 0x7f << 0,
+	SQ_TEX_WORD1__DST_GPR_shift                       = 0,
+	SQ_TEX_WORD1__DST_REL_bit                         = 1 << 7,
+	SQ_TEX_WORD1__DST_SEL_X_mask                      = 0x07 << 9,
+	SQ_TEX_WORD1__DST_SEL_X_shift                     = 9,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	    SQ_SEL_MASK                                   = 0x07,
+	SQ_TEX_WORD1__DST_SEL_Y_mask                      = 0x07 << 12,
+	SQ_TEX_WORD1__DST_SEL_Y_shift                     = 12,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_TEX_WORD1__DST_SEL_Z_mask                      = 0x07 << 15,
+	SQ_TEX_WORD1__DST_SEL_Z_shift                     = 15,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_TEX_WORD1__DST_SEL_W_mask                      = 0x07 << 18,
+	SQ_TEX_WORD1__DST_SEL_W_shift                     = 18,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_TEX_WORD1__LOD_BIAS_mask                       = 0x7f << 21,
+	SQ_TEX_WORD1__LOD_BIAS_shift                      = 21,
+	COORD_TYPE_X_bit                                  = 1 << 28,
+	COORD_TYPE_Y_bit                                  = 1 << 29,
+	COORD_TYPE_Z_bit                                  = 1 << 30,
+	COORD_TYPE_W_bit                                  = 1 << 31,
+    SQ_VTX_WORD0                                          = 0x00008dfc,
+	VTX_INST_mask                                     = 0x1f << 0,
+	VTX_INST_shift                                    = 0,
+	    SQ_VTX_INST_FETCH                             = 0x00,
+	    SQ_VTX_INST_SEMANTIC                          = 0x01,
+	FETCH_TYPE_mask                                   = 0x03 << 5,
+	FETCH_TYPE_shift                                  = 5,
+	    SQ_VTX_FETCH_VERTEX_DATA                      = 0x00,
+	    SQ_VTX_FETCH_INSTANCE_DATA                    = 0x01,
+	    SQ_VTX_FETCH_NO_INDEX_OFFSET                  = 0x02,
+	FETCH_WHOLE_QUAD_bit                              = 1 << 7,
+	BUFFER_ID_mask                                    = 0xff << 8,
+	BUFFER_ID_shift                                   = 8,
+	SRC_GPR_mask                                      = 0x7f << 16,
+	SRC_GPR_shift                                     = 16,
+	SRC_REL_bit                                       = 1 << 23,
+	SQ_VTX_WORD0__SRC_SEL_X_mask                      = 0x03 << 24,
+	SQ_VTX_WORD0__SRC_SEL_X_shift                     = 24,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+	MEGA_FETCH_COUNT_mask                             = 0x3f << 26,
+	MEGA_FETCH_COUNT_shift                            = 26,
+    SQ_CF_ALLOC_EXPORT_WORD1_SWIZ                         = 0x00008dfc,
+	SEL_X_mask                                        = 0x07 << 0,
+	SEL_X_shift                                       = 0,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SEL_Y_mask                                        = 0x07 << 3,
+	SEL_Y_shift                                       = 3,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SEL_Z_mask                                        = 0x07 << 6,
+	SEL_Z_shift                                       = 6,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SEL_W_mask                                        = 0x07 << 9,
+	SEL_W_shift                                       = 9,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+    SQ_ALU_WORD1                                          = 0x00008dfc,
+	ENCODING_mask                                     = 0x07 << 15,
+	ENCODING_shift                                    = 15,
+	BANK_SWIZZLE_mask                                 = 0x07 << 18,
+	BANK_SWIZZLE_shift                                = 18,
+	    SQ_ALU_VEC_012                                = 0x00,
+	    SQ_ALU_VEC_021                                = 0x01,
+	    SQ_ALU_VEC_120                                = 0x02,
+	    SQ_ALU_VEC_102                                = 0x03,
+	    SQ_ALU_VEC_201                                = 0x04,
+	    SQ_ALU_VEC_210                                = 0x05,
+	SQ_ALU_WORD1__DST_GPR_mask                        = 0x7f << 21,
+	SQ_ALU_WORD1__DST_GPR_shift                       = 21,
+	SQ_ALU_WORD1__DST_REL_bit                         = 1 << 28,
+	DST_CHAN_mask                                     = 0x03 << 29,
+	DST_CHAN_shift                                    = 29,
+	    CHAN_X                                        = 0x00,
+	    CHAN_Y                                        = 0x01,
+	    CHAN_Z                                        = 0x02,
+	    CHAN_W                                        = 0x03,
+	SQ_ALU_WORD1__CLAMP_bit                           = 1 << 31,
+    SQ_CF_ALU_WORD0                                       = 0x00008dfc,
+	SQ_CF_ALU_WORD0__ADDR_mask                        = 0x3fffff << 0,
+	SQ_CF_ALU_WORD0__ADDR_shift                       = 0,
+	KCACHE_BANK0_mask                                 = 0x0f << 22,
+	KCACHE_BANK0_shift                                = 22,
+	KCACHE_BANK1_mask                                 = 0x0f << 26,
+	KCACHE_BANK1_shift                                = 26,
+	KCACHE_MODE0_mask                                 = 0x03 << 30,
+	KCACHE_MODE0_shift                                = 30,
+/* 	    SQ_CF_KCACHE_NOP                              = 0x00, */
+/* 	    SQ_CF_KCACHE_LOCK_1                           = 0x01, */
+/* 	    SQ_CF_KCACHE_LOCK_2                           = 0x02, */
+/* 	    SQ_CF_KCACHE_LOCK_LOOP_INDEX                  = 0x03, */
+    SQ_VTX_WORD2                                          = 0x00008dfc,
+	SQ_VTX_WORD2__OFFSET_mask                         = 0xffff << 0,
+	SQ_VTX_WORD2__OFFSET_shift                        = 0,
+	SQ_VTX_WORD2__ENDIAN_SWAP_mask                    = 0x03 << 16,
+	SQ_VTX_WORD2__ENDIAN_SWAP_shift                   = 16,
+	    SQ_ENDIAN_NONE                                = 0x00,
+	    SQ_ENDIAN_8IN16                               = 0x01,
+	    SQ_ENDIAN_8IN32                               = 0x02,
+	CONST_BUF_NO_STRIDE_bit                           = 1 << 18,
+	MEGA_FETCH_bit                                    = 1 << 19,
+	SQ_VTX_WORD2__ALT_CONST_bit                       = 1 << 20,
+    SQ_ALU_WORD1_OP2_V2                                   = 0x00008dfc,
+	SRC0_ABS_bit                                      = 1 << 0,
+	SRC1_ABS_bit                                      = 1 << 1,
+	UPDATE_EXECUTE_MASK_bit                           = 1 << 2,
+	UPDATE_PRED_bit                                   = 1 << 3,
+	WRITE_MASK_bit                                    = 1 << 4,
+	SQ_ALU_WORD1_OP2_V2__OMOD_mask                    = 0x03 << 5,
+	SQ_ALU_WORD1_OP2_V2__OMOD_shift                   = 5,
+	    SQ_ALU_OMOD_OFF                               = 0x00,
+	    SQ_ALU_OMOD_M2                                = 0x01,
+	    SQ_ALU_OMOD_M4                                = 0x02,
+	    SQ_ALU_OMOD_D2                                = 0x03,
+	SQ_ALU_WORD1_OP2_V2__ALU_INST_mask                = 0x7ff << 7,
+	SQ_ALU_WORD1_OP2_V2__ALU_INST_shift               = 7,
+	    SQ_OP2_INST_ADD                               = 0x00,
+	    SQ_OP2_INST_MUL                               = 0x01,
+	    SQ_OP2_INST_MUL_IEEE                          = 0x02,
+	    SQ_OP2_INST_MAX                               = 0x03,
+	    SQ_OP2_INST_MIN                               = 0x04,
+	    SQ_OP2_INST_MAX_DX10                          = 0x05,
+	    SQ_OP2_INST_MIN_DX10                          = 0x06,
+	    SQ_OP2_INST_SETE                              = 0x08,
+	    SQ_OP2_INST_SETGT                             = 0x09,
+	    SQ_OP2_INST_SETGE                             = 0x0a,
+	    SQ_OP2_INST_SETNE                             = 0x0b,
+	    SQ_OP2_INST_SETE_DX10                         = 0x0c,
+	    SQ_OP2_INST_SETGT_DX10                        = 0x0d,
+	    SQ_OP2_INST_SETGE_DX10                        = 0x0e,
+	    SQ_OP2_INST_SETNE_DX10                        = 0x0f,
+	    SQ_OP2_INST_FRACT                             = 0x10,
+	    SQ_OP2_INST_TRUNC                             = 0x11,
+	    SQ_OP2_INST_CEIL                              = 0x12,
+	    SQ_OP2_INST_RNDNE                             = 0x13,
+	    SQ_OP2_INST_FLOOR                             = 0x14,
+	    SQ_OP2_INST_MOVA                              = 0x15,
+	    SQ_OP2_INST_MOVA_FLOOR                        = 0x16,
+	    SQ_OP2_INST_MOVA_INT                          = 0x18,
+	    SQ_OP2_INST_MOV                               = 0x19,
+	    SQ_OP2_INST_NOP                               = 0x1a,
+	    SQ_OP2_INST_PRED_SETGT_UINT                   = 0x1e,
+	    SQ_OP2_INST_PRED_SETGE_UINT                   = 0x1f,
+	    SQ_OP2_INST_PRED_SETE                         = 0x20,
+	    SQ_OP2_INST_PRED_SETGT                        = 0x21,
+	    SQ_OP2_INST_PRED_SETGE                        = 0x22,
+	    SQ_OP2_INST_PRED_SETNE                        = 0x23,
+	    SQ_OP2_INST_PRED_SET_INV                      = 0x24,
+	    SQ_OP2_INST_PRED_SET_POP                      = 0x25,
+	    SQ_OP2_INST_PRED_SET_CLR                      = 0x26,
+	    SQ_OP2_INST_PRED_SET_RESTORE                  = 0x27,
+	    SQ_OP2_INST_PRED_SETE_PUSH                    = 0x28,
+	    SQ_OP2_INST_PRED_SETGT_PUSH                   = 0x29,
+	    SQ_OP2_INST_PRED_SETGE_PUSH                   = 0x2a,
+	    SQ_OP2_INST_PRED_SETNE_PUSH                   = 0x2b,
+	    SQ_OP2_INST_KILLE                             = 0x2c,
+	    SQ_OP2_INST_KILLGT                            = 0x2d,
+	    SQ_OP2_INST_KILLGE                            = 0x2e,
+	    SQ_OP2_INST_KILLNE                            = 0x2f,
+	    SQ_OP2_INST_AND_INT                           = 0x30,
+	    SQ_OP2_INST_OR_INT                            = 0x31,
+	    SQ_OP2_INST_XOR_INT                           = 0x32,
+	    SQ_OP2_INST_NOT_INT                           = 0x33,
+	    SQ_OP2_INST_ADD_INT                           = 0x34,
+	    SQ_OP2_INST_SUB_INT                           = 0x35,
+	    SQ_OP2_INST_MAX_INT                           = 0x36,
+	    SQ_OP2_INST_MIN_INT                           = 0x37,
+	    SQ_OP2_INST_MAX_UINT                          = 0x38,
+	    SQ_OP2_INST_MIN_UINT                          = 0x39,
+	    SQ_OP2_INST_SETE_INT                          = 0x3a,
+	    SQ_OP2_INST_SETGT_INT                         = 0x3b,
+	    SQ_OP2_INST_SETGE_INT                         = 0x3c,
+	    SQ_OP2_INST_SETNE_INT                         = 0x3d,
+	    SQ_OP2_INST_SETGT_UINT                        = 0x3e,
+	    SQ_OP2_INST_SETGE_UINT                        = 0x3f,
+	    SQ_OP2_INST_KILLGT_UINT                       = 0x40,
+	    SQ_OP2_INST_KILLGE_UINT                       = 0x41,
+	    SQ_OP2_INST_PRED_SETE_INT                     = 0x42,
+	    SQ_OP2_INST_PRED_SETGT_INT                    = 0x43,
+	    SQ_OP2_INST_PRED_SETGE_INT                    = 0x44,
+	    SQ_OP2_INST_PRED_SETNE_INT                    = 0x45,
+	    SQ_OP2_INST_KILLE_INT                         = 0x46,
+	    SQ_OP2_INST_KILLGT_INT                        = 0x47,
+	    SQ_OP2_INST_KILLGE_INT                        = 0x48,
+	    SQ_OP2_INST_KILLNE_INT                        = 0x49,
+	    SQ_OP2_INST_PRED_SETE_PUSH_INT                = 0x4a,
+	    SQ_OP2_INST_PRED_SETGT_PUSH_INT               = 0x4b,
+	    SQ_OP2_INST_PRED_SETGE_PUSH_INT               = 0x4c,
+	    SQ_OP2_INST_PRED_SETNE_PUSH_INT               = 0x4d,
+	    SQ_OP2_INST_PRED_SETLT_PUSH_INT               = 0x4e,
+	    SQ_OP2_INST_PRED_SETLE_PUSH_INT               = 0x4f,
+	    SQ_OP2_INST_DOT4                              = 0x50,
+	    SQ_OP2_INST_DOT4_IEEE                         = 0x51,
+	    SQ_OP2_INST_CUBE                              = 0x52,
+	    SQ_OP2_INST_MAX4                              = 0x53,
+	    SQ_OP2_INST_MOVA_GPR_INT                      = 0x60,
+	    SQ_OP2_INST_EXP_IEEE                          = 0x61,
+	    SQ_OP2_INST_LOG_CLAMPED                       = 0x62,
+	    SQ_OP2_INST_LOG_IEEE                          = 0x63,
+	    SQ_OP2_INST_RECIP_CLAMPED                     = 0x64,
+	    SQ_OP2_INST_RECIP_FF                          = 0x65,
+	    SQ_OP2_INST_RECIP_IEEE                        = 0x66,
+	    SQ_OP2_INST_RECIPSQRT_CLAMPED                 = 0x67,
+	    SQ_OP2_INST_RECIPSQRT_FF                      = 0x68,
+	    SQ_OP2_INST_RECIPSQRT_IEEE                    = 0x69,
+	    SQ_OP2_INST_SQRT_IEEE                         = 0x6a,
+	    SQ_OP2_INST_FLT_TO_INT                        = 0x6b,
+	    SQ_OP2_INST_INT_TO_FLT                        = 0x6c,
+	    SQ_OP2_INST_UINT_TO_FLT                       = 0x6d,
+	    SQ_OP2_INST_SIN                               = 0x6e,
+	    SQ_OP2_INST_COS                               = 0x6f,
+	    SQ_OP2_INST_ASHR_INT                          = 0x70,
+	    SQ_OP2_INST_LSHR_INT                          = 0x71,
+	    SQ_OP2_INST_LSHL_INT                          = 0x72,
+	    SQ_OP2_INST_MULLO_INT                         = 0x73,
+	    SQ_OP2_INST_MULHI_INT                         = 0x74,
+	    SQ_OP2_INST_MULLO_UINT                        = 0x75,
+	    SQ_OP2_INST_MULHI_UINT                        = 0x76,
+	    SQ_OP2_INST_RECIP_INT                         = 0x77,
+	    SQ_OP2_INST_RECIP_UINT                        = 0x78,
+	    SQ_OP2_INST_FLT_TO_UINT                       = 0x79,
+    SQ_CF_ALLOC_EXPORT_WORD1_BUF                          = 0x00008dfc,
+	ARRAY_SIZE_mask                                   = 0xfff << 0,
+	ARRAY_SIZE_shift                                  = 0,
+	COMP_MASK_mask                                    = 0x0f << 12,
+	COMP_MASK_shift                                   = 12,
+    SQ_CF_WORD0                                           = 0x00008dfc,
+    SQ_CF_ALLOC_EXPORT_WORD0                              = 0x00008dfc,
+	ARRAY_BASE_mask                                   = 0x1fff << 0,
+	ARRAY_BASE_shift                                  = 0,
+	SQ_CF_ALLOC_EXPORT_WORD0__TYPE_mask               = 0x03 << 13,
+	SQ_CF_ALLOC_EXPORT_WORD0__TYPE_shift              = 13,
+	    SQ_EXPORT_PIXEL                               = 0x00,
+	    SQ_EXPORT_POS                                 = 0x01,
+	    SQ_EXPORT_PARAM                               = 0x02,
+	    X_UNUSED_FOR_SX_EXPORTS                       = 0x03,
+	RW_GPR_mask                                       = 0x7f << 15,
+	RW_GPR_shift                                      = 15,
+	RW_REL_bit                                        = 1 << 22,
+	INDEX_GPR_mask                                    = 0x7f << 23,
+	INDEX_GPR_shift                                   = 23,
+	ELEM_SIZE_mask                                    = 0x03 << 30,
+	ELEM_SIZE_shift                                   = 30,
+    SQ_VTX_WORD1                                          = 0x00008dfc,
+	SQ_VTX_WORD1__DST_SEL_X_mask                      = 0x07 << 9,
+	SQ_VTX_WORD1__DST_SEL_X_shift                     = 9,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_VTX_WORD1__DST_SEL_Y_mask                      = 0x07 << 12,
+	SQ_VTX_WORD1__DST_SEL_Y_shift                     = 12,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_VTX_WORD1__DST_SEL_Z_mask                      = 0x07 << 15,
+	SQ_VTX_WORD1__DST_SEL_Z_shift                     = 15,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_VTX_WORD1__DST_SEL_W_mask                      = 0x07 << 18,
+	SQ_VTX_WORD1__DST_SEL_W_shift                     = 18,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	USE_CONST_FIELDS_bit                              = 1 << 21,
+	SQ_VTX_WORD1__DATA_FORMAT_mask                    = 0x3f << 22,
+	SQ_VTX_WORD1__DATA_FORMAT_shift                   = 22,
+	SQ_VTX_WORD1__NUM_FORMAT_ALL_mask                 = 0x03 << 28,
+	SQ_VTX_WORD1__NUM_FORMAT_ALL_shift                = 28,
+	    SQ_NUM_FORMAT_NORM                            = 0x00,
+	    SQ_NUM_FORMAT_INT                             = 0x01,
+	    SQ_NUM_FORMAT_SCALED                          = 0x02,
+	SQ_VTX_WORD1__FORMAT_COMP_ALL_bit                 = 1 << 30,
+	SQ_VTX_WORD1__SRF_MODE_ALL_bit                    = 1 << 31,
+    SQ_ALU_WORD1_OP2                                      = 0x00008dfc,
+/* 	SRC0_ABS_bit                                      = 1 << 0, */
+/* 	SRC1_ABS_bit                                      = 1 << 1, */
+/* 	UPDATE_EXECUTE_MASK_bit                           = 1 << 2, */
+/* 	UPDATE_PRED_bit                                   = 1 << 3, */
+/* 	WRITE_MASK_bit                                    = 1 << 4, */
+	FOG_MERGE_bit                                     = 1 << 5,
+	SQ_ALU_WORD1_OP2__OMOD_mask                       = 0x03 << 6,
+	SQ_ALU_WORD1_OP2__OMOD_shift                      = 6,
+/* 	    SQ_ALU_OMOD_OFF                               = 0x00, */
+/* 	    SQ_ALU_OMOD_M2                                = 0x01, */
+/* 	    SQ_ALU_OMOD_M4                                = 0x02, */
+/* 	    SQ_ALU_OMOD_D2                                = 0x03, */
+	SQ_ALU_WORD1_OP2__ALU_INST_mask                   = 0x3ff << 8,
+	SQ_ALU_WORD1_OP2__ALU_INST_shift                  = 8,
+/* 	    SQ_OP2_INST_ADD                               = 0x00, */
+/* 	    SQ_OP2_INST_MUL                               = 0x01, */
+/* 	    SQ_OP2_INST_MUL_IEEE                          = 0x02, */
+/* 	    SQ_OP2_INST_MAX                               = 0x03, */
+/* 	    SQ_OP2_INST_MIN                               = 0x04, */
+/* 	    SQ_OP2_INST_MAX_DX10                          = 0x05, */
+/* 	    SQ_OP2_INST_MIN_DX10                          = 0x06, */
+/* 	    SQ_OP2_INST_SETE                              = 0x08, */
+/* 	    SQ_OP2_INST_SETGT                             = 0x09, */
+/* 	    SQ_OP2_INST_SETGE                             = 0x0a, */
+/* 	    SQ_OP2_INST_SETNE                             = 0x0b, */
+/* 	    SQ_OP2_INST_SETE_DX10                         = 0x0c, */
+/* 	    SQ_OP2_INST_SETGT_DX10                        = 0x0d, */
+/* 	    SQ_OP2_INST_SETGE_DX10                        = 0x0e, */
+/* 	    SQ_OP2_INST_SETNE_DX10                        = 0x0f, */
+/* 	    SQ_OP2_INST_FRACT                             = 0x10, */
+/* 	    SQ_OP2_INST_TRUNC                             = 0x11, */
+/* 	    SQ_OP2_INST_CEIL                              = 0x12, */
+/* 	    SQ_OP2_INST_RNDNE                             = 0x13, */
+/* 	    SQ_OP2_INST_FLOOR                             = 0x14, */
+/* 	    SQ_OP2_INST_MOVA                              = 0x15, */
+/* 	    SQ_OP2_INST_MOVA_FLOOR                        = 0x16, */
+/* 	    SQ_OP2_INST_MOVA_INT                          = 0x18, */
+/* 	    SQ_OP2_INST_MOV                               = 0x19, */
+/* 	    SQ_OP2_INST_NOP                               = 0x1a, */
+/* 	    SQ_OP2_INST_PRED_SETGT_UINT                   = 0x1e, */
+/* 	    SQ_OP2_INST_PRED_SETGE_UINT                   = 0x1f, */
+/* 	    SQ_OP2_INST_PRED_SETE                         = 0x20, */
+/* 	    SQ_OP2_INST_PRED_SETGT                        = 0x21, */
+/* 	    SQ_OP2_INST_PRED_SETGE                        = 0x22, */
+/* 	    SQ_OP2_INST_PRED_SETNE                        = 0x23, */
+/* 	    SQ_OP2_INST_PRED_SET_INV                      = 0x24, */
+/* 	    SQ_OP2_INST_PRED_SET_POP                      = 0x25, */
+/* 	    SQ_OP2_INST_PRED_SET_CLR                      = 0x26, */
+/* 	    SQ_OP2_INST_PRED_SET_RESTORE                  = 0x27, */
+/* 	    SQ_OP2_INST_PRED_SETE_PUSH                    = 0x28, */
+/* 	    SQ_OP2_INST_PRED_SETGT_PUSH                   = 0x29, */
+/* 	    SQ_OP2_INST_PRED_SETGE_PUSH                   = 0x2a, */
+/* 	    SQ_OP2_INST_PRED_SETNE_PUSH                   = 0x2b, */
+/* 	    SQ_OP2_INST_KILLE                             = 0x2c, */
+/* 	    SQ_OP2_INST_KILLGT                            = 0x2d, */
+/* 	    SQ_OP2_INST_KILLGE                            = 0x2e, */
+/* 	    SQ_OP2_INST_KILLNE                            = 0x2f, */
+/* 	    SQ_OP2_INST_AND_INT                           = 0x30, */
+/* 	    SQ_OP2_INST_OR_INT                            = 0x31, */
+/* 	    SQ_OP2_INST_XOR_INT                           = 0x32, */
+/* 	    SQ_OP2_INST_NOT_INT                           = 0x33, */
+/* 	    SQ_OP2_INST_ADD_INT                           = 0x34, */
+/* 	    SQ_OP2_INST_SUB_INT                           = 0x35, */
+/* 	    SQ_OP2_INST_MAX_INT                           = 0x36, */
+/* 	    SQ_OP2_INST_MIN_INT                           = 0x37, */
+/* 	    SQ_OP2_INST_MAX_UINT                          = 0x38, */
+/* 	    SQ_OP2_INST_MIN_UINT                          = 0x39, */
+/* 	    SQ_OP2_INST_SETE_INT                          = 0x3a, */
+/* 	    SQ_OP2_INST_SETGT_INT                         = 0x3b, */
+/* 	    SQ_OP2_INST_SETGE_INT                         = 0x3c, */
+/* 	    SQ_OP2_INST_SETNE_INT                         = 0x3d, */
+/* 	    SQ_OP2_INST_SETGT_UINT                        = 0x3e, */
+/* 	    SQ_OP2_INST_SETGE_UINT                        = 0x3f, */
+/* 	    SQ_OP2_INST_KILLGT_UINT                       = 0x40, */
+/* 	    SQ_OP2_INST_KILLGE_UINT                       = 0x41, */
+/* 	    SQ_OP2_INST_PRED_SETE_INT                     = 0x42, */
+/* 	    SQ_OP2_INST_PRED_SETGT_INT                    = 0x43, */
+/* 	    SQ_OP2_INST_PRED_SETGE_INT                    = 0x44, */
+/* 	    SQ_OP2_INST_PRED_SETNE_INT                    = 0x45, */
+/* 	    SQ_OP2_INST_KILLE_INT                         = 0x46, */
+/* 	    SQ_OP2_INST_KILLGT_INT                        = 0x47, */
+/* 	    SQ_OP2_INST_KILLGE_INT                        = 0x48, */
+/* 	    SQ_OP2_INST_KILLNE_INT                        = 0x49, */
+/* 	    SQ_OP2_INST_PRED_SETE_PUSH_INT                = 0x4a, */
+/* 	    SQ_OP2_INST_PRED_SETGT_PUSH_INT               = 0x4b, */
+/* 	    SQ_OP2_INST_PRED_SETGE_PUSH_INT               = 0x4c, */
+/* 	    SQ_OP2_INST_PRED_SETNE_PUSH_INT               = 0x4d, */
+/* 	    SQ_OP2_INST_PRED_SETLT_PUSH_INT               = 0x4e, */
+/* 	    SQ_OP2_INST_PRED_SETLE_PUSH_INT               = 0x4f, */
+/* 	    SQ_OP2_INST_DOT4                              = 0x50, */
+/* 	    SQ_OP2_INST_DOT4_IEEE                         = 0x51, */
+/* 	    SQ_OP2_INST_CUBE                              = 0x52, */
+/* 	    SQ_OP2_INST_MAX4                              = 0x53, */
+/* 	    SQ_OP2_INST_MOVA_GPR_INT                      = 0x60, */
+/* 	    SQ_OP2_INST_EXP_IEEE                          = 0x61, */
+/* 	    SQ_OP2_INST_LOG_CLAMPED                       = 0x62, */
+/* 	    SQ_OP2_INST_LOG_IEEE                          = 0x63, */
+/* 	    SQ_OP2_INST_RECIP_CLAMPED                     = 0x64, */
+/* 	    SQ_OP2_INST_RECIP_FF                          = 0x65, */
+/* 	    SQ_OP2_INST_RECIP_IEEE                        = 0x66, */
+/* 	    SQ_OP2_INST_RECIPSQRT_CLAMPED                 = 0x67, */
+/* 	    SQ_OP2_INST_RECIPSQRT_FF                      = 0x68, */
+/* 	    SQ_OP2_INST_RECIPSQRT_IEEE                    = 0x69, */
+/* 	    SQ_OP2_INST_SQRT_IEEE                         = 0x6a, */
+/* 	    SQ_OP2_INST_FLT_TO_INT                        = 0x6b, */
+/* 	    SQ_OP2_INST_INT_TO_FLT                        = 0x6c, */
+/* 	    SQ_OP2_INST_UINT_TO_FLT                       = 0x6d, */
+/* 	    SQ_OP2_INST_SIN                               = 0x6e, */
+/* 	    SQ_OP2_INST_COS                               = 0x6f, */
+/* 	    SQ_OP2_INST_ASHR_INT                          = 0x70, */
+/* 	    SQ_OP2_INST_LSHR_INT                          = 0x71, */
+/* 	    SQ_OP2_INST_LSHL_INT                          = 0x72, */
+/* 	    SQ_OP2_INST_MULLO_INT                         = 0x73, */
+/* 	    SQ_OP2_INST_MULHI_INT                         = 0x74, */
+/* 	    SQ_OP2_INST_MULLO_UINT                        = 0x75, */
+/* 	    SQ_OP2_INST_MULHI_UINT                        = 0x76, */
+/* 	    SQ_OP2_INST_RECIP_INT                         = 0x77, */
+/* 	    SQ_OP2_INST_RECIP_UINT                        = 0x78, */
+/* 	    SQ_OP2_INST_FLT_TO_UINT                       = 0x79, */
+    SQ_CF_WORD1                                           = 0x00008dfc,
+	POP_COUNT_mask                                    = 0x07 << 0,
+	POP_COUNT_shift                                   = 0,
+	CF_CONST_mask                                     = 0x1f << 3,
+	CF_CONST_shift                                    = 3,
+	COND_mask                                         = 0x03 << 8,
+	COND_shift                                        = 8,
+	    SQ_CF_COND_ACTIVE                             = 0x00,
+	    SQ_CF_COND_FALSE                              = 0x01,
+	    SQ_CF_COND_BOOL                               = 0x02,
+	    SQ_CF_COND_NOT_BOOL                           = 0x03,
+	SQ_CF_WORD1__COUNT_mask                           = 0x07 << 10,
+	SQ_CF_WORD1__COUNT_shift                          = 10,
+	CALL_COUNT_mask                                   = 0x3f << 13,
+	CALL_COUNT_shift                                  = 13,
+	COUNT_3_bit                                       = 1 << 19,
+/* 	END_OF_PROGRAM_bit                                = 1 << 21, */
+/* 	VALID_PIXEL_MODE_bit                              = 1 << 22, */
+	SQ_CF_WORD1__CF_INST_mask                         = 0x7f << 23,
+	SQ_CF_WORD1__CF_INST_shift                        = 23,
+	    SQ_CF_INST_NOP                                = 0x00,
+	    SQ_CF_INST_TEX                                = 0x01,
+	    SQ_CF_INST_VTX                                = 0x02,
+	    SQ_CF_INST_VTX_TC                             = 0x03,
+	    SQ_CF_INST_LOOP_START                         = 0x04,
+	    SQ_CF_INST_LOOP_END                           = 0x05,
+	    SQ_CF_INST_LOOP_START_DX10                    = 0x06,
+	    SQ_CF_INST_LOOP_START_NO_AL                   = 0x07,
+	    SQ_CF_INST_LOOP_CONTINUE                      = 0x08,
+	    SQ_CF_INST_LOOP_BREAK                         = 0x09,
+	    SQ_CF_INST_JUMP                               = 0x0a,
+	    SQ_CF_INST_PUSH                               = 0x0b,
+	    SQ_CF_INST_PUSH_ELSE                          = 0x0c,
+	    SQ_CF_INST_ELSE                               = 0x0d,
+	    SQ_CF_INST_POP                                = 0x0e,
+	    SQ_CF_INST_POP_JUMP                           = 0x0f,
+	    SQ_CF_INST_POP_PUSH                           = 0x10,
+	    SQ_CF_INST_POP_PUSH_ELSE                      = 0x11,
+	    SQ_CF_INST_CALL                               = 0x12,
+	    SQ_CF_INST_CALL_FS                            = 0x13,
+	    SQ_CF_INST_RETURN                             = 0x14,
+	    SQ_CF_INST_EMIT_VERTEX                        = 0x15,
+	    SQ_CF_INST_EMIT_CUT_VERTEX                    = 0x16,
+	    SQ_CF_INST_CUT_VERTEX                         = 0x17,
+	    SQ_CF_INST_KILL                               = 0x18,
+/* 	WHOLE_QUAD_MODE_bit                               = 1 << 30, */
+/* 	BARRIER_bit                                       = 1 << 31, */
+    SQ_VTX_WORD1_SEM                                      = 0x00008dfc,
+	SEMANTIC_ID_mask                                  = 0xff << 0,
+	SEMANTIC_ID_shift                                 = 0,
+    SQ_TEX_WORD0                                          = 0x00008dfc,
+	TEX_INST_mask                                     = 0x1f << 0,
+	TEX_INST_shift                                    = 0,
+	    SQ_TEX_INST_VTX_FETCH                         = 0x00,
+	    SQ_TEX_INST_VTX_SEMANTIC                      = 0x01,
+	    SQ_TEX_INST_LD                                = 0x03,
+	    SQ_TEX_INST_GET_TEXTURE_RESINFO               = 0x04,
+	    SQ_TEX_INST_GET_NUMBER_OF_SAMPLES             = 0x05,
+	    SQ_TEX_INST_GET_LOD                           = 0x06,
+	    SQ_TEX_INST_GET_GRADIENTS_H                   = 0x07,
+	    SQ_TEX_INST_GET_GRADIENTS_V                   = 0x08,
+	    SQ_TEX_INST_GET_LERP                          = 0x09,
+	    SQ_TEX_INST_RESERVED_10                       = 0x0a,
+	    SQ_TEX_INST_SET_GRADIENTS_H                   = 0x0b,
+	    SQ_TEX_INST_SET_GRADIENTS_V                   = 0x0c,
+	    SQ_TEX_INST_PASS                              = 0x0d,
+	    X_Z_SET_INDEX_FOR_ARRAY_OF_CUBEMAPS           = 0x0e,
+	    SQ_TEX_INST_SAMPLE                            = 0x10,
+	    SQ_TEX_INST_SAMPLE_L                          = 0x11,
+	    SQ_TEX_INST_SAMPLE_LB                         = 0x12,
+	    SQ_TEX_INST_SAMPLE_LZ                         = 0x13,
+	    SQ_TEX_INST_SAMPLE_G                          = 0x14,
+	    SQ_TEX_INST_SAMPLE_G_L                        = 0x15,
+	    SQ_TEX_INST_SAMPLE_G_LB                       = 0x16,
+	    SQ_TEX_INST_SAMPLE_G_LZ                       = 0x17,
+	    SQ_TEX_INST_SAMPLE_C                          = 0x18,
+	    SQ_TEX_INST_SAMPLE_C_L                        = 0x19,
+	    SQ_TEX_INST_SAMPLE_C_LB                       = 0x1a,
+	    SQ_TEX_INST_SAMPLE_C_LZ                       = 0x1b,
+	    SQ_TEX_INST_SAMPLE_C_G                        = 0x1c,
+	    SQ_TEX_INST_SAMPLE_C_G_L                      = 0x1d,
+	    SQ_TEX_INST_SAMPLE_C_G_LB                     = 0x1e,
+	    SQ_TEX_INST_SAMPLE_C_G_LZ                     = 0x1f,
+	BC_FRAC_MODE_bit                                  = 1 << 5,
+/* 	FETCH_WHOLE_QUAD_bit                              = 1 << 7, */
+	RESOURCE_ID_mask                                  = 0xff << 8,
+	RESOURCE_ID_shift                                 = 8,
+/* 	SRC_GPR_mask                                      = 0x7f << 16, */
+/* 	SRC_GPR_shift                                     = 16, */
+/* 	SRC_REL_bit                                       = 1 << 23, */
+	SQ_TEX_WORD0__ALT_CONST_bit                       = 1 << 24,
+    SQ_VTX_WORD1_GPR                                      = 0x00008dfc,
+	SQ_VTX_WORD1_GPR__DST_GPR_mask                    = 0x7f << 0,
+	SQ_VTX_WORD1_GPR__DST_GPR_shift                   = 0,
+	SQ_VTX_WORD1_GPR__DST_REL_bit                     = 1 << 7,
+    SQ_ALU_WORD0                                          = 0x00008dfc,
+	SRC0_SEL_mask                                     = 0x1ff << 0,
+	SRC0_SEL_shift                                    = 0,
+/* 	    SQ_ALU_SRC_0                                  = 0xf8, */
+/* 	    SQ_ALU_SRC_1                                  = 0xf9, */
+/* 	    SQ_ALU_SRC_1_INT                              = 0xfa, */
+/* 	    SQ_ALU_SRC_M_1_INT                            = 0xfb, */
+/* 	    SQ_ALU_SRC_0_5                                = 0xfc, */
+/* 	    SQ_ALU_SRC_LITERAL                            = 0xfd, */
+/* 	    SQ_ALU_SRC_PV                                 = 0xfe, */
+/* 	    SQ_ALU_SRC_PS                                 = 0xff, */
+	SRC0_REL_bit                                      = 1 << 9,
+	SRC0_CHAN_mask                                    = 0x03 << 10,
+	SRC0_CHAN_shift                                   = 10,
+/* 	    SQ_CHAN_X                                     = 0x00, */
+/* 	    SQ_CHAN_Y                                     = 0x01, */
+/* 	    SQ_CHAN_Z                                     = 0x02, */
+/* 	    SQ_CHAN_W                                     = 0x03, */
+	SRC0_NEG_bit                                      = 1 << 12,
+	SRC1_SEL_mask                                     = 0x1ff << 13,
+	SRC1_SEL_shift                                    = 13,
+/* 	    SQ_ALU_SRC_0                                  = 0xf8, */
+/* 	    SQ_ALU_SRC_1                                  = 0xf9, */
+/* 	    SQ_ALU_SRC_1_INT                              = 0xfa, */
+/* 	    SQ_ALU_SRC_M_1_INT                            = 0xfb, */
+/* 	    SQ_ALU_SRC_0_5                                = 0xfc, */
+/* 	    SQ_ALU_SRC_LITERAL                            = 0xfd, */
+/* 	    SQ_ALU_SRC_PV                                 = 0xfe, */
+/* 	    SQ_ALU_SRC_PS                                 = 0xff, */
+	SRC1_REL_bit                                      = 1 << 22,
+	SRC1_CHAN_mask                                    = 0x03 << 23,
+	SRC1_CHAN_shift                                   = 23,
+/* 	    SQ_CHAN_X                                     = 0x00, */
+/* 	    SQ_CHAN_Y                                     = 0x01, */
+/* 	    SQ_CHAN_Z                                     = 0x02, */
+/* 	    SQ_CHAN_W                                     = 0x03, */
+	SRC1_NEG_bit                                      = 1 << 25,
+	INDEX_MODE_mask                                   = 0x07 << 26,
+	INDEX_MODE_shift                                  = 26,
+	    SQ_INDEX_AR_X                                 = 0x00,
+	    SQ_INDEX_AR_Y                                 = 0x01,
+	    SQ_INDEX_AR_Z                                 = 0x02,
+	    SQ_INDEX_AR_W                                 = 0x03,
+	    SQ_INDEX_LOOP                                 = 0x04,
+	PRED_SEL_mask                                     = 0x03 << 29,
+	PRED_SEL_shift                                    = 29,
+	    SQ_PRED_SEL_OFF                               = 0x00,
+	    SQ_PRED_SEL_ZERO                              = 0x02,
+	    SQ_PRED_SEL_ONE                               = 0x03,
+	LAST_bit                                          = 1 << 31,
+    SX_EXPORT_BUFFER_SIZES                                = 0x0000900c,
+	COLOR_BUFFER_SIZE_mask                            = 0xff << 0,
+	COLOR_BUFFER_SIZE_shift                           = 0,
+	POSITION_BUFFER_SIZE_mask                         = 0xff << 8,
+	POSITION_BUFFER_SIZE_shift                        = 8,
+	SMX_BUFFER_SIZE_mask                              = 0xff << 16,
+	SMX_BUFFER_SIZE_shift                             = 16,
+    SX_MEMORY_EXPORT_BASE                                 = 0x00009010,
+    SX_MEMORY_EXPORT_SIZE                                 = 0x00009014,
+    SPI_CONFIG_CNTL                                       = 0x00009100,
+	GPR_WRITE_PRIORITY_mask                           = 0x1f << 0,
+	GPR_WRITE_PRIORITY_shift                          = 0,
+	    X_PRIORITY_ORDER                              = 0x00,
+	    X_PRIORITY_ORDER_VS                           = 0x01,
+	DISABLE_INTERP_1_bit                              = 1 << 5,
+	DEBUG_THREAD_TYPE_SEL_mask                        = 0x03 << 6,
+	DEBUG_THREAD_TYPE_SEL_shift                       = 6,
+	DEBUG_GROUP_SEL_mask                              = 0x1f << 8,
+	DEBUG_GROUP_SEL_shift                             = 8,
+	DEBUG_GRBM_OVERRIDE_bit                           = 1 << 13,
+    SPI_CONFIG_CNTL_1                                     = 0x0000913c,
+	VTX_DONE_DELAY_mask                               = 0x0f << 0,
+	VTX_DONE_DELAY_shift                              = 0,
+	    X_DELAY_10_CLKS                               = 0x00,
+	    X_DELAY_11_CLKS                               = 0x01,
+	    X_DELAY_12_CLKS                               = 0x02,
+	    X_DELAY_13_CLKS                               = 0x03,
+	    X_DELAY_14_CLKS                               = 0x04,
+	    X_DELAY_15_CLKS                               = 0x05,
+	    X_DELAY_16_CLKS                               = 0x06,
+	    X_DELAY_17_CLKS                               = 0x07,
+	    X_DELAY_2_CLKS                                = 0x08,
+	    X_DELAY_3_CLKS                                = 0x09,
+	    X_DELAY_4_CLKS                                = 0x0a,
+	    X_DELAY_5_CLKS                                = 0x0b,
+	    X_DELAY_6_CLKS                                = 0x0c,
+	    X_DELAY_7_CLKS                                = 0x0d,
+	    X_DELAY_8_CLKS                                = 0x0e,
+	    X_DELAY_9_CLKS                                = 0x0f,
+	INTERP_ONE_PRIM_PER_ROW_bit                       = 1 << 4,
+    TD_FILTER4                                            = 0x00009400,
+	WEIGHT_1_mask                                     = 0x7ff << 0,
+	WEIGHT_1_shift                                    = 0,
+	WEIGHT_0_mask                                     = 0x7ff << 11,
+	WEIGHT_0_shift                                    = 11,
+	WEIGHT_PAIR_bit                                   = 1 << 22,
+	PHASE_mask                                        = 0x0f << 23,
+	PHASE_shift                                       = 23,
+	DIRECTION_bit                                     = 1 << 27,
+    TD_FILTER4_1                                          = 0x00009404,
+	TD_FILTER4_1_num                                  = 35,
+/* 	WEIGHT_1_mask                                     = 0x7ff << 0, */
+/* 	WEIGHT_1_shift                                    = 0, */
+/* 	WEIGHT_0_mask                                     = 0x7ff << 11, */
+/* 	WEIGHT_0_shift                                    = 11, */
+    TD_CNTL                                               = 0x00009490,
+	SYNC_PHASE_SH_mask                                = 0x03 << 0,
+	SYNC_PHASE_SH_shift                               = 0,
+	SYNC_PHASE_VC_SMX_mask                            = 0x03 << 4,
+	SYNC_PHASE_VC_SMX_shift                           = 4,
+    TD0_CNTL                                              = 0x00009494,
+	TD0_CNTL_num                                      = 4,
+	ID_OVERRIDE_mask                                  = 0x03 << 28,
+	ID_OVERRIDE_shift                                 = 28,
+    TD0_STATUS                                            = 0x000094a4,
+	TD0_STATUS_num                                    = 4,
+	BUSY_bit                                          = 1 << 31,
+    TA_CNTL                                               = 0x00009504,
+	GRADIENT_CREDIT_mask                              = 0x1f << 0,
+	GRADIENT_CREDIT_shift                             = 0,
+	WALKER_CREDIT_mask                                = 0x1f << 8,
+	WALKER_CREDIT_shift                               = 8,
+	ALIGNER_CREDIT_mask                               = 0x1f << 16,
+	ALIGNER_CREDIT_shift                              = 16,
+	TD_FIFO_CREDIT_mask                               = 0x3ff << 22,
+	TD_FIFO_CREDIT_shift                              = 22,
+    TA_CNTL_AUX                                           = 0x00009508,
+	DISABLE_CUBE_WRAP_bit                             = 1 << 0,
+	SYNC_GRADIENT_bit                                 = 1 << 24,
+	SYNC_WALKER_bit                                   = 1 << 25,
+	SYNC_ALIGNER_bit                                  = 1 << 26,
+	BILINEAR_PRECISION_bit                            = 1 << 31,
+    TA0_CNTL                                              = 0x00009510,
+/* 	ID_OVERRIDE_mask                                  = 0x03 << 28, */
+/* 	ID_OVERRIDE_shift                                 = 28, */
+    TA1_CNTL                                              = 0x00009514,
+/* 	ID_OVERRIDE_mask                                  = 0x03 << 28, */
+/* 	ID_OVERRIDE_shift                                 = 28, */
+    TA2_CNTL                                              = 0x00009518,
+/* 	ID_OVERRIDE_mask                                  = 0x03 << 28, */
+/* 	ID_OVERRIDE_shift                                 = 28, */
+    TA3_CNTL                                              = 0x0000951c,
+/* 	ID_OVERRIDE_mask                                  = 0x03 << 28, */
+/* 	ID_OVERRIDE_shift                                 = 28, */
+    TA0_STATUS                                            = 0x00009520,
+	FG_PFIFO_EMPTYB_bit                               = 1 << 12,
+	FG_LFIFO_EMPTYB_bit                               = 1 << 13,
+	FG_SFIFO_EMPTYB_bit                               = 1 << 14,
+	FL_PFIFO_EMPTYB_bit                               = 1 << 16,
+	FL_LFIFO_EMPTYB_bit                               = 1 << 17,
+	FL_SFIFO_EMPTYB_bit                               = 1 << 18,
+	FA_PFIFO_EMPTYB_bit                               = 1 << 20,
+	FA_LFIFO_EMPTYB_bit                               = 1 << 21,
+	FA_SFIFO_EMPTYB_bit                               = 1 << 22,
+	IN_BUSY_bit                                       = 1 << 24,
+	FG_BUSY_bit                                       = 1 << 25,
+	FL_BUSY_bit                                       = 1 << 27,
+	TA_BUSY_bit                                       = 1 << 28,
+	FA_BUSY_bit                                       = 1 << 29,
+	AL_BUSY_bit                                       = 1 << 30,
+/* 	BUSY_bit                                          = 1 << 31, */
+    TA1_STATUS                                            = 0x00009524,
+/* 	FG_PFIFO_EMPTYB_bit                               = 1 << 12, */
+/* 	FG_LFIFO_EMPTYB_bit                               = 1 << 13, */
+/* 	FG_SFIFO_EMPTYB_bit                               = 1 << 14, */
+/* 	FL_PFIFO_EMPTYB_bit                               = 1 << 16, */
+/* 	FL_LFIFO_EMPTYB_bit                               = 1 << 17, */
+/* 	FL_SFIFO_EMPTYB_bit                               = 1 << 18, */
+/* 	FA_PFIFO_EMPTYB_bit                               = 1 << 20, */
+/* 	FA_LFIFO_EMPTYB_bit                               = 1 << 21, */
+/* 	FA_SFIFO_EMPTYB_bit                               = 1 << 22, */
+/* 	IN_BUSY_bit                                       = 1 << 24, */
+/* 	FG_BUSY_bit                                       = 1 << 25, */
+/* 	FL_BUSY_bit                                       = 1 << 27, */
+/* 	TA_BUSY_bit                                       = 1 << 28, */
+/* 	FA_BUSY_bit                                       = 1 << 29, */
+/* 	AL_BUSY_bit                                       = 1 << 30, */
+/* 	BUSY_bit                                          = 1 << 31, */
+    TA2_STATUS                                            = 0x00009528,
+/* 	FG_PFIFO_EMPTYB_bit                               = 1 << 12, */
+/* 	FG_LFIFO_EMPTYB_bit                               = 1 << 13, */
+/* 	FG_SFIFO_EMPTYB_bit                               = 1 << 14, */
+/* 	FL_PFIFO_EMPTYB_bit                               = 1 << 16, */
+/* 	FL_LFIFO_EMPTYB_bit                               = 1 << 17, */
+/* 	FL_SFIFO_EMPTYB_bit                               = 1 << 18, */
+/* 	FA_PFIFO_EMPTYB_bit                               = 1 << 20, */
+/* 	FA_LFIFO_EMPTYB_bit                               = 1 << 21, */
+/* 	FA_SFIFO_EMPTYB_bit                               = 1 << 22, */
+/* 	IN_BUSY_bit                                       = 1 << 24, */
+/* 	FG_BUSY_bit                                       = 1 << 25, */
+/* 	FL_BUSY_bit                                       = 1 << 27, */
+/* 	TA_BUSY_bit                                       = 1 << 28, */
+/* 	FA_BUSY_bit                                       = 1 << 29, */
+/* 	AL_BUSY_bit                                       = 1 << 30, */
+/* 	BUSY_bit                                          = 1 << 31, */
+    TA3_STATUS                                            = 0x0000952c,
+/* 	FG_PFIFO_EMPTYB_bit                               = 1 << 12, */
+/* 	FG_LFIFO_EMPTYB_bit                               = 1 << 13, */
+/* 	FG_SFIFO_EMPTYB_bit                               = 1 << 14, */
+/* 	FL_PFIFO_EMPTYB_bit                               = 1 << 16, */
+/* 	FL_LFIFO_EMPTYB_bit                               = 1 << 17, */
+/* 	FL_SFIFO_EMPTYB_bit                               = 1 << 18, */
+/* 	FA_PFIFO_EMPTYB_bit                               = 1 << 20, */
+/* 	FA_LFIFO_EMPTYB_bit                               = 1 << 21, */
+/* 	FA_SFIFO_EMPTYB_bit                               = 1 << 22, */
+/* 	IN_BUSY_bit                                       = 1 << 24, */
+/* 	FG_BUSY_bit                                       = 1 << 25, */
+/* 	FL_BUSY_bit                                       = 1 << 27, */
+/* 	TA_BUSY_bit                                       = 1 << 28, */
+/* 	FA_BUSY_bit                                       = 1 << 29, */
+/* 	AL_BUSY_bit                                       = 1 << 30, */
+/* 	BUSY_bit                                          = 1 << 31, */
+    TC_STATUS                                             = 0x00009600,
+	TC_BUSY_bit                                       = 1 << 0,
+    TC_INVALIDATE                                         = 0x00009604,
+	START_bit                                         = 1 << 0,
+    TC_CNTL                                               = 0x00009608,
+	FORCE_HIT_bit                                     = 1 << 0,
+	FORCE_MISS_bit                                    = 1 << 1,
+	L2_SIZE_mask                                      = 0x0f << 5,
+	L2_SIZE_shift                                     = 5,
+	    _256K                                         = 0x00,
+	    _224K                                         = 0x01,
+	    _192K                                         = 0x02,
+	    _160K                                         = 0x03,
+	    _128K                                         = 0x04,
+	    _96K                                          = 0x05,
+	    _64K                                          = 0x06,
+	    _32K                                          = 0x07,
+	L2_DISABLE_LATE_HIT_bit                           = 1 << 9,
+	DISABLE_VERT_PERF_bit                             = 1 << 10,
+	DISABLE_INVAL_BUSY_bit                            = 1 << 11,
+	DISABLE_INVAL_SAME_SURFACE_bit                    = 1 << 12,
+	PARTITION_MODE_mask                               = 0x03 << 13,
+	PARTITION_MODE_shift                              = 13,
+	    X_VERTEX                                      = 0x00,
+	MISS_ARB_MODE_bit                                 = 1 << 15,
+	HIT_ARB_MODE_bit                                  = 1 << 16,
+	DISABLE_WRITE_DELAY_bit                           = 1 << 17,
+	HIT_FIFO_DEPTH_bit                                = 1 << 18,
+    VC_CNTL                                               = 0x00009700,
+	L2_INVALIDATE_bit                                 = 1 << 0,
+	RESERVED_bit                                      = 1 << 1,
+	CC_FORCE_MISS_bit                                 = 1 << 2,
+	MI_CHAN_SEL_mask                                  = 0x03 << 3,
+	MI_CHAN_SEL_shift                                 = 3,
+	    X_MC0_USES_CH_0_1                             = 0x00,
+	    X_MC0_USES_CH_0_3                             = 0x01,
+	    X_VC_MC0_IS_ACTIVE                            = 0x02,
+	    X_VC_MC1_IS_DISABLED                          = 0x03,
+	MI_STEER_DISABLE_bit                              = 1 << 5,
+	MI_CREDIT_CTR_mask                                = 0x0f << 6,
+	MI_CREDIT_CTR_shift                               = 6,
+	MI_CREDIT_WE_bit                                  = 1 << 10,
+	MI_REQ_STALL_THLD_mask                            = 0x07 << 11,
+	MI_REQ_STALL_THLD_shift                           = 11,
+	    X_LATENCY_EXCEEDS_399_CLOCKS                  = 0x00,
+	    X_LATENCY_EXCEEDS_415_CLOCKS                  = 0x01,
+	    X_LATENCY_EXCEEDS_431_CLOCKS                  = 0x02,
+	    X_LATENCY_EXCEEDS_447_CLOCKS                  = 0x03,
+	    X_LATENCY_EXCEEDS_463_CLOCKS                  = 0x04,
+	    X_LATENCY_EXCEEDS_479_CLOCKS                  = 0x05,
+	    X_LATENCY_EXCEEDS_495_CLOCKS                  = 0x06,
+	    X_LATENCY_EXCEEDS_511_CLOCKS                  = 0x07,
+	VC_CNTL__MI_TIMESTAMP_RES_mask                    = 0x1f << 14,
+	VC_CNTL__MI_TIMESTAMP_RES_shift                   = 14,
+	    X_1X_SYSTEM_CLOCK                             = 0x00,
+	    X_2X_SYSTEM_CLOCK                             = 0x01,
+	    X_4X_SYSTEM_CLOCK                             = 0x02,
+	    X_8X_SYSTEM_CLOCK                             = 0x03,
+	    X_16X_SYSTEM_CLOCK                            = 0x04,
+	    X_32X_SYSTEM_CLOCK                            = 0x05,
+	    X_64X_SYSTEM_CLOCK                            = 0x06,
+	    X_128X_SYSTEM_CLOCK                           = 0x07,
+	    X_256X_SYSTEM_CLOCK                           = 0x08,
+	    X_512X_SYSTEM_CLOCK                           = 0x09,
+	    X_1024X_SYSTEM_CLOCK                          = 0x0a,
+	    X_2048X_SYSTEM_CLOCK                          = 0x0b,
+	    X_4092X_SYSTEM_CLOCK                          = 0x0c,
+	    X_8192X_SYSTEM_CLOCK                          = 0x0d,
+	    X_16384X_SYSTEM_CLOCK                         = 0x0e,
+	    X_32768X_SYSTEM_CLOCK                         = 0x0f,
+    VC_CNTL_STATUS                                        = 0x00009704,
+	RP_BUSY_bit                                       = 1 << 0,
+	RG_BUSY_bit                                       = 1 << 1,
+	VC_BUSY_bit                                       = 1 << 2,
+	CLAMP_DETECT_bit                                  = 1 << 3,
+    VC_CONFIG                                             = 0x00009718,
+	WRITE_DIS_bit                                     = 1 << 0,
+	GPR_DATA_PHASE_ADJ_mask                           = 0x07 << 1,
+	GPR_DATA_PHASE_ADJ_shift                          = 1,
+	    X_LATENCY_BASE_0_CYCLES                       = 0x00,
+	    X_LATENCY_BASE_1_CYCLES                       = 0x01,
+	    X_LATENCY_BASE_2_CYCLES                       = 0x02,
+	    X_LATENCY_BASE_3_CYCLES                       = 0x03,
+	TD_SIMD_SYNC_ADJ_mask                             = 0x07 << 4,
+	TD_SIMD_SYNC_ADJ_shift                            = 4,
+	    X_0_CYCLES_DELAY                              = 0x00,
+	    X_1_CYCLES_DELAY                              = 0x01,
+	    X_2_CYCLES_DELAY                              = 0x02,
+	    X_3_CYCLES_DELAY                              = 0x03,
+	    X_4_CYCLES_DELAY                              = 0x04,
+	    X_5_CYCLES_DELAY                              = 0x05,
+	    X_6_CYCLES_DELAY                              = 0x06,
+	    X_7_CYCLES_DELAY                              = 0x07,
+    SMX_DC_CTL0                                           = 0x0000a020,
+	WR_GATHER_STREAM0_bit                             = 1 << 0,
+	WR_GATHER_STREAM1_bit                             = 1 << 1,
+	WR_GATHER_STREAM2_bit                             = 1 << 2,
+	WR_GATHER_STREAM3_bit                             = 1 << 3,
+	WR_GATHER_SCRATCH_bit                             = 1 << 4,
+	WR_GATHER_REDUC_BUF_bit                           = 1 << 5,
+	WR_GATHER_RING_BUF_bit                            = 1 << 6,
+	WR_GATHER_F_BUF_bit                               = 1 << 7,
+	DISABLE_CACHES_bit                                = 1 << 8,
+	AUTO_FLUSH_INVAL_EN_bit                           = 1 << 10,
+	AUTO_FLUSH_EN_bit                                 = 1 << 11,
+	AUTO_FLUSH_CNT_mask                               = 0xffff << 12,
+	AUTO_FLUSH_CNT_shift                              = 12,
+	MC_RD_STALL_FACTOR_mask                           = 0x03 << 28,
+	MC_RD_STALL_FACTOR_shift                          = 28,
+	MC_WR_STALL_FACTOR_mask                           = 0x03 << 30,
+	MC_WR_STALL_FACTOR_shift                          = 30,
+    SMX_DC_CTL1                                           = 0x0000a024,
+	OP_FIFO_SKID_mask                                 = 0x7f << 0,
+	OP_FIFO_SKID_shift                                = 0,
+	CACHE_LINE_SIZE_bit                               = 1 << 8,
+	MULTI_FLUSH_MODE_bit                              = 1 << 9,
+	MULTI_FLUSH_REQ_ABORT_IDX_FIFO_SKID_mask          = 0x0f << 10,
+	MULTI_FLUSH_REQ_ABORT_IDX_FIFO_SKID_shift         = 10,
+	DISABLE_WR_GATHER_RD_HIT_FORCE_EVICT_bit          = 1 << 16,
+	DISABLE_WR_GATHER_RD_HIT_COMP_VLDS_CHECK_bit      = 1 << 17,
+	DISABLE_FLUSH_ES_ALSO_INVALS_bit                  = 1 << 18,
+	DISABLE_FLUSH_GS_ALSO_INVALS_bit                  = 1 << 19,
+    SMX_DC_CTL2                                           = 0x0000a028,
+	INVALIDATE_CACHES_bit                             = 1 << 0,
+	CACHES_INVALID_bit                                = 1 << 1,
+	CACHES_DIRTY_bit                                  = 1 << 2,
+	FLUSH_ALL_bit                                     = 1 << 4,
+	FLUSH_GS_THREADS_bit                              = 1 << 8,
+	FLUSH_ES_THREADS_bit                              = 1 << 9,
+    SMX_DC_MC_INTF_CTL                                    = 0x0000a02c,
+	MC_RD_REQ_CRED_mask                               = 0xff << 0,
+	MC_RD_REQ_CRED_shift                              = 0,
+	MC_WR_REQ_CRED_mask                               = 0xff << 16,
+	MC_WR_REQ_CRED_shift                              = 16,
+    TD_PS_SAMPLER0_BORDER_RED                             = 0x0000a400,
+	TD_PS_SAMPLER0_BORDER_RED_num                     = 18,
+	TD_PS_SAMPLER0_BORDER_RED_offset                  = 16,
+    TD_PS_SAMPLER0_BORDER_GREEN                           = 0x0000a404,
+	TD_PS_SAMPLER0_BORDER_GREEN_num                   = 18,
+	TD_PS_SAMPLER0_BORDER_GREEN_offset                = 16,
+    TD_PS_SAMPLER0_BORDER_BLUE                            = 0x0000a408,
+	TD_PS_SAMPLER0_BORDER_BLUE_num                    = 18,
+	TD_PS_SAMPLER0_BORDER_BLUE_offset                 = 16,
+    TD_PS_SAMPLER0_BORDER_ALPHA                           = 0x0000a40c,
+	TD_PS_SAMPLER0_BORDER_ALPHA_num                   = 18,
+	TD_PS_SAMPLER0_BORDER_ALPHA_offset                = 16,
+    TD_VS_SAMPLER0_BORDER_RED                             = 0x0000a600,
+	TD_VS_SAMPLER0_BORDER_RED_num                     = 18,
+	TD_VS_SAMPLER0_BORDER_RED_offset                  = 16,
+    TD_VS_SAMPLER0_BORDER_GREEN                           = 0x0000a604,
+	TD_VS_SAMPLER0_BORDER_GREEN_num                   = 18,
+	TD_VS_SAMPLER0_BORDER_GREEN_offset                = 16,
+    TD_VS_SAMPLER0_BORDER_BLUE                            = 0x0000a608,
+	TD_VS_SAMPLER0_BORDER_BLUE_num                    = 18,
+	TD_VS_SAMPLER0_BORDER_BLUE_offset                 = 16,
+    TD_VS_SAMPLER0_BORDER_ALPHA                           = 0x0000a60c,
+	TD_VS_SAMPLER0_BORDER_ALPHA_num                   = 18,
+	TD_VS_SAMPLER0_BORDER_ALPHA_offset                = 16,
+    TD_GS_SAMPLER0_BORDER_RED                             = 0x0000a800,
+	TD_GS_SAMPLER0_BORDER_RED_num                     = 18,
+	TD_GS_SAMPLER0_BORDER_RED_offset                  = 16,
+    TD_GS_SAMPLER0_BORDER_GREEN                           = 0x0000a804,
+	TD_GS_SAMPLER0_BORDER_GREEN_num                   = 18,
+	TD_GS_SAMPLER0_BORDER_GREEN_offset                = 16,
+    TD_GS_SAMPLER0_BORDER_BLUE                            = 0x0000a808,
+	TD_GS_SAMPLER0_BORDER_BLUE_num                    = 18,
+	TD_GS_SAMPLER0_BORDER_BLUE_offset                 = 16,
+    TD_GS_SAMPLER0_BORDER_ALPHA                           = 0x0000a80c,
+	TD_GS_SAMPLER0_BORDER_ALPHA_num                   = 18,
+	TD_GS_SAMPLER0_BORDER_ALPHA_offset                = 16,
+    TD_PS_SAMPLER0_CLEARTYPE_KERNEL                       = 0x0000aa00,
+	TD_PS_SAMPLER0_CLEARTYPE_KERNEL_num               = 18,
+	TD_PS_SAMPLER0_CLEARTYPE_KERNEL__WIDTH_mask       = 0x07 << 0,
+	TD_PS_SAMPLER0_CLEARTYPE_KERNEL__WIDTH_shift      = 0,
+	TD_PS_SAMPLER0_CLEARTYPE_KERNEL__HEIGHT_mask      = 0x07 << 3,
+	TD_PS_SAMPLER0_CLEARTYPE_KERNEL__HEIGHT_shift     = 3,
+    DB_DEPTH_SIZE                                         = 0x00028000,
+	PITCH_TILE_MAX_mask                               = 0x3ff << 0,
+	PITCH_TILE_MAX_shift                              = 0,
+	SLICE_TILE_MAX_mask                               = 0xfffff << 10,
+	SLICE_TILE_MAX_shift                              = 10,
+    DB_DEPTH_VIEW                                         = 0x00028004,
+	SLICE_START_mask                                  = 0x7ff << 0,
+	SLICE_START_shift                                 = 0,
+	SLICE_MAX_mask                                    = 0x7ff << 13,
+	SLICE_MAX_shift                                   = 13,
+    DB_DEPTH_BASE                                         = 0x0002800c,
+    DB_DEPTH_INFO                                         = 0x00028010,
+	DB_DEPTH_INFO__FORMAT_mask                        = 0x07 << 0,
+	DB_DEPTH_INFO__FORMAT_shift                       = 0,
+	    DEPTH_INVALID                                 = 0x00,
+	    DEPTH_16                                      = 0x01,
+	    DEPTH_X8_24                                   = 0x02,
+	    DEPTH_8_24                                    = 0x03,
+	    DEPTH_X8_24_FLOAT                             = 0x04,
+	    DEPTH_8_24_FLOAT                              = 0x05,
+	    DEPTH_32_FLOAT                                = 0x06,
+	    DEPTH_X24_8_32_FLOAT                          = 0x07,
+	DB_DEPTH_INFO__READ_SIZE_bit                      = 1 << 3,
+	DB_DEPTH_INFO__ARRAY_MODE_mask                    = 0x0f << 15,
+	DB_DEPTH_INFO__ARRAY_MODE_shift                   = 15,
+	    ARRAY_2D_TILED_THIN1                          = 0x04,
+	TILE_SURFACE_ENABLE_bit                           = 1 << 25,
+	TILE_COMPACT_bit                                  = 1 << 26,
+	ZRANGE_PRECISION_bit                              = 1 << 31,
+    DB_HTILE_DATA_BASE                                    = 0x00028014,
+    DB_STENCIL_CLEAR                                      = 0x00028028,
+	DB_STENCIL_CLEAR__CLEAR_mask                      = 0xff << 0,
+	DB_STENCIL_CLEAR__CLEAR_shift                     = 0,
+	MIN_mask                                          = 0xff << 16,
+	MIN_shift                                         = 16,
+    DB_DEPTH_CLEAR                                        = 0x0002802c,
+    PA_SC_SCREEN_SCISSOR_TL                               = 0x00028030,
+	PA_SC_SCREEN_SCISSOR_TL__TL_X_mask                = 0x7fff << 0,
+	PA_SC_SCREEN_SCISSOR_TL__TL_X_shift               = 0,
+	PA_SC_SCREEN_SCISSOR_TL__TL_Y_mask                = 0x7fff << 16,
+	PA_SC_SCREEN_SCISSOR_TL__TL_Y_shift               = 16,
+    PA_SC_SCREEN_SCISSOR_BR                               = 0x00028034,
+	PA_SC_SCREEN_SCISSOR_BR__BR_X_mask                = 0x7fff << 0,
+	PA_SC_SCREEN_SCISSOR_BR__BR_X_shift               = 0,
+	PA_SC_SCREEN_SCISSOR_BR__BR_Y_mask                = 0x7fff << 16,
+	PA_SC_SCREEN_SCISSOR_BR__BR_Y_shift               = 16,
+    CB_COLOR0_BASE                                        = 0x00028040,
+	CB_COLOR0_BASE_num                                = 8,
+    CB_COLOR0_SIZE                                        = 0x00028060,
+	CB_COLOR0_SIZE_num                                = 8,
+/* 	PITCH_TILE_MAX_mask                               = 0x3ff << 0, */
+/* 	PITCH_TILE_MAX_shift                              = 0, */
+/* 	SLICE_TILE_MAX_mask                               = 0xfffff << 10, */
+/* 	SLICE_TILE_MAX_shift                              = 10, */
+    CB_COLOR0_VIEW                                        = 0x00028080,
+	CB_COLOR0_VIEW_num                                = 8,
+/* 	SLICE_START_mask                                  = 0x7ff << 0, */
+/* 	SLICE_START_shift                                 = 0, */
+/* 	SLICE_MAX_mask                                    = 0x7ff << 13, */
+/* 	SLICE_MAX_shift                                   = 13, */
+    CB_COLOR0_INFO                                        = 0x000280a0,
+	CB_COLOR0_INFO_num                                = 8,
+	ENDIAN_mask                                       = 0x03 << 0,
+	ENDIAN_shift                                      = 0,
+	    ENDIAN_NONE                                   = 0x00,
+	    ENDIAN_8IN16                                  = 0x01,
+	    ENDIAN_8IN32                                  = 0x02,
+	    ENDIAN_8IN64                                  = 0x03,
+	CB_COLOR0_INFO__FORMAT_mask                       = 0x3f << 2,
+	CB_COLOR0_INFO__FORMAT_shift                      = 2,
+	    COLOR_INVALID                                 = 0x00,
+	    COLOR_8                                       = 0x01,
+	    COLOR_4_4                                     = 0x02,
+	    COLOR_3_3_2                                   = 0x03,
+	    COLOR_16                                      = 0x05,
+	    COLOR_16_FLOAT                                = 0x06,
+	    COLOR_8_8                                     = 0x07,
+	    COLOR_5_6_5                                   = 0x08,
+	    COLOR_6_5_5                                   = 0x09,
+	    COLOR_1_5_5_5                                 = 0x0a,
+	    COLOR_4_4_4_4                                 = 0x0b,
+	    COLOR_5_5_5_1                                 = 0x0c,
+	    COLOR_32                                      = 0x0d,
+	    COLOR_32_FLOAT                                = 0x0e,
+	    COLOR_16_16                                   = 0x0f,
+	    COLOR_16_16_FLOAT                             = 0x10,
+	    COLOR_8_24                                    = 0x11,
+	    COLOR_8_24_FLOAT                              = 0x12,
+	    COLOR_24_8                                    = 0x13,
+	    COLOR_24_8_FLOAT                              = 0x14,
+	    COLOR_10_11_11                                = 0x15,
+	    COLOR_10_11_11_FLOAT                          = 0x16,
+	    COLOR_11_11_10                                = 0x17,
+	    COLOR_11_11_10_FLOAT                          = 0x18,
+	    COLOR_2_10_10_10                              = 0x19,
+	    COLOR_8_8_8_8                                 = 0x1a,
+	    COLOR_10_10_10_2                              = 0x1b,
+	    COLOR_X24_8_32_FLOAT                          = 0x1c,
+	    COLOR_32_32                                   = 0x1d,
+	    COLOR_32_32_FLOAT                             = 0x1e,
+	    COLOR_16_16_16_16                             = 0x1f,
+	    COLOR_16_16_16_16_FLOAT                       = 0x20,
+	    COLOR_32_32_32_32                             = 0x22,
+	    COLOR_32_32_32_32_FLOAT                       = 0x23,
+	CB_COLOR0_INFO__ARRAY_MODE_mask                   = 0x0f << 8,
+	CB_COLOR0_INFO__ARRAY_MODE_shift                  = 8,
+	    ARRAY_LINEAR_GENERAL                          = 0x00,
+	    ARRAY_LINEAR_ALIGNED                          = 0x01,
+/* 	    ARRAY_2D_TILED_THIN1                          = 0x04, */
+	NUMBER_TYPE_mask                                  = 0x07 << 12,
+	NUMBER_TYPE_shift                                 = 12,
+	    NUMBER_UNORM                                  = 0x00,
+	    NUMBER_SNORM                                  = 0x01,
+	    NUMBER_USCALED                                = 0x02,
+	    NUMBER_SSCALED                                = 0x03,
+	    NUMBER_UINT                                   = 0x04,
+	    NUMBER_SINT                                   = 0x05,
+	    NUMBER_SRGB                                   = 0x06,
+	    NUMBER_FLOAT                                  = 0x07,
+	CB_COLOR0_INFO__READ_SIZE_bit                     = 1 << 15,
+	COMP_SWAP_mask                                    = 0x03 << 16,
+	COMP_SWAP_shift                                   = 16,
+	    SWAP_STD                                      = 0x00,
+	    SWAP_ALT                                      = 0x01,
+	    SWAP_STD_REV                                  = 0x02,
+	    SWAP_ALT_REV                                  = 0x03,
+	CB_COLOR0_INFO__TILE_MODE_mask                    = 0x03 << 18,
+	CB_COLOR0_INFO__TILE_MODE_shift                   = 18,
+	    TILE_DISABLE                                  = 0x00,
+	    TILE_CLEAR_ENABLE                             = 0x01,
+	    TILE_FRAG_ENABLE                              = 0x02,
+	BLEND_CLAMP_bit                                   = 1 << 20,
+	CLEAR_COLOR_bit                                   = 1 << 21,
+	BLEND_BYPASS_bit                                  = 1 << 22,
+	BLEND_FLOAT32_bit                                 = 1 << 23,
+	SIMPLE_FLOAT_bit                                  = 1 << 24,
+	CB_COLOR0_INFO__ROUND_MODE_bit                    = 1 << 25,
+/* 	TILE_COMPACT_bit                                  = 1 << 26, */
+	SOURCE_FORMAT_bit                                 = 1 << 27,
+    CB_COLOR0_TILE                                        = 0x000280c0,
+	CB_COLOR0_TILE_num                                = 8,
+    CB_COLOR0_FRAG                                        = 0x000280e0,
+	CB_COLOR0_FRAG_num                                = 8,
+    CB_COLOR0_MASK                                        = 0x00028100,
+	CB_COLOR0_MASK_num                                = 8,
+	CMASK_BLOCK_MAX_mask                              = 0xfff << 0,
+	CMASK_BLOCK_MAX_shift                             = 0,
+	FMASK_TILE_MAX_mask                               = 0xfffff << 12,
+	FMASK_TILE_MAX_shift                              = 12,
+    CB_CLEAR_RED                                          = 0x00028120,
+    CB_CLEAR_GREEN                                        = 0x00028124,
+    CB_CLEAR_BLUE                                         = 0x00028128,
+    CB_CLEAR_ALPHA                                        = 0x0002812c,
+    SQ_ALU_CONST_BUFFER_SIZE_PS_0                         = 0x00028140,
+	SQ_ALU_CONST_BUFFER_SIZE_PS_0_num                 = 16,
+	SQ_ALU_CONST_BUFFER_SIZE_PS_0__DATA_mask          = 0x1ff << 0,
+	SQ_ALU_CONST_BUFFER_SIZE_PS_0__DATA_shift         = 0,
+    SQ_ALU_CONST_BUFFER_SIZE_VS_0                         = 0x00028180,
+	SQ_ALU_CONST_BUFFER_SIZE_VS_0_num                 = 16,
+	SQ_ALU_CONST_BUFFER_SIZE_VS_0__DATA_mask          = 0x1ff << 0,
+	SQ_ALU_CONST_BUFFER_SIZE_VS_0__DATA_shift         = 0,
+    SQ_ALU_CONST_BUFFER_SIZE_GS_0                         = 0x000281c0,
+	SQ_ALU_CONST_BUFFER_SIZE_GS_0_num                 = 16,
+	SQ_ALU_CONST_BUFFER_SIZE_GS_0__DATA_mask          = 0x1ff << 0,
+	SQ_ALU_CONST_BUFFER_SIZE_GS_0__DATA_shift         = 0,
+    PA_SC_WINDOW_OFFSET                                   = 0x00028200,
+	WINDOW_X_OFFSET_mask                              = 0x7fff << 0,
+	WINDOW_X_OFFSET_shift                             = 0,
+	WINDOW_Y_OFFSET_mask                              = 0x7fff << 16,
+	WINDOW_Y_OFFSET_shift                             = 16,
+    PA_SC_WINDOW_SCISSOR_TL                               = 0x00028204,
+	PA_SC_WINDOW_SCISSOR_TL__TL_X_mask                = 0x3fff << 0,
+	PA_SC_WINDOW_SCISSOR_TL__TL_X_shift               = 0,
+	PA_SC_WINDOW_SCISSOR_TL__TL_Y_mask                = 0x3fff << 16,
+	PA_SC_WINDOW_SCISSOR_TL__TL_Y_shift               = 16,
+	WINDOW_OFFSET_DISABLE_bit                         = 1 << 31,
+    PA_SC_WINDOW_SCISSOR_BR                               = 0x00028208,
+	PA_SC_WINDOW_SCISSOR_BR__BR_X_mask                = 0x3fff << 0,
+	PA_SC_WINDOW_SCISSOR_BR__BR_X_shift               = 0,
+	PA_SC_WINDOW_SCISSOR_BR__BR_Y_mask                = 0x3fff << 16,
+	PA_SC_WINDOW_SCISSOR_BR__BR_Y_shift               = 16,
+    PA_SC_CLIPRECT_RULE                                   = 0x0002820c,
+	CLIP_RULE_mask                                    = 0xffff << 0,
+	CLIP_RULE_shift                                   = 0,
+    PA_SC_CLIPRECT_0_TL                                   = 0x00028210,
+	PA_SC_CLIPRECT_0_TL_num                           = 4,
+	PA_SC_CLIPRECT_0_TL_offset                        = 8,
+	PA_SC_CLIPRECT_0_TL__TL_X_mask                    = 0x3fff << 0,
+	PA_SC_CLIPRECT_0_TL__TL_X_shift                   = 0,
+	PA_SC_CLIPRECT_0_TL__TL_Y_mask                    = 0x3fff << 16,
+	PA_SC_CLIPRECT_0_TL__TL_Y_shift                   = 16,
+    PA_SC_CLIPRECT_0_BR                                   = 0x00028214,
+	PA_SC_CLIPRECT_0_BR_num                           = 4,
+	PA_SC_CLIPRECT_0_BR_offset                        = 8,
+	PA_SC_CLIPRECT_0_BR__BR_X_mask                    = 0x3fff << 0,
+	PA_SC_CLIPRECT_0_BR__BR_X_shift                   = 0,
+	PA_SC_CLIPRECT_0_BR__BR_Y_mask                    = 0x3fff << 16,
+	PA_SC_CLIPRECT_0_BR__BR_Y_shift                   = 16,
+    CB_TARGET_MASK                                        = 0x00028238,
+	TARGET0_ENABLE_mask                               = 0x0f << 0,
+	TARGET0_ENABLE_shift                              = 0,
+	TARGET1_ENABLE_mask                               = 0x0f << 4,
+	TARGET1_ENABLE_shift                              = 4,
+	TARGET2_ENABLE_mask                               = 0x0f << 8,
+	TARGET2_ENABLE_shift                              = 8,
+	TARGET3_ENABLE_mask                               = 0x0f << 12,
+	TARGET3_ENABLE_shift                              = 12,
+	TARGET4_ENABLE_mask                               = 0x0f << 16,
+	TARGET4_ENABLE_shift                              = 16,
+	TARGET5_ENABLE_mask                               = 0x0f << 20,
+	TARGET5_ENABLE_shift                              = 20,
+	TARGET6_ENABLE_mask                               = 0x0f << 24,
+	TARGET6_ENABLE_shift                              = 24,
+	TARGET7_ENABLE_mask                               = 0x0f << 28,
+	TARGET7_ENABLE_shift                              = 28,
+    CB_SHADER_MASK                                        = 0x0002823c,
+	OUTPUT0_ENABLE_mask                               = 0x0f << 0,
+	OUTPUT0_ENABLE_shift                              = 0,
+	OUTPUT1_ENABLE_mask                               = 0x0f << 4,
+	OUTPUT1_ENABLE_shift                              = 4,
+	OUTPUT2_ENABLE_mask                               = 0x0f << 8,
+	OUTPUT2_ENABLE_shift                              = 8,
+	OUTPUT3_ENABLE_mask                               = 0x0f << 12,
+	OUTPUT3_ENABLE_shift                              = 12,
+	OUTPUT4_ENABLE_mask                               = 0x0f << 16,
+	OUTPUT4_ENABLE_shift                              = 16,
+	OUTPUT5_ENABLE_mask                               = 0x0f << 20,
+	OUTPUT5_ENABLE_shift                              = 20,
+	OUTPUT6_ENABLE_mask                               = 0x0f << 24,
+	OUTPUT6_ENABLE_shift                              = 24,
+	OUTPUT7_ENABLE_mask                               = 0x0f << 28,
+	OUTPUT7_ENABLE_shift                              = 28,
+    PA_SC_GENERIC_SCISSOR_TL                              = 0x00028240,
+	PA_SC_GENERIC_SCISSOR_TL__TL_X_mask               = 0x3fff << 0,
+	PA_SC_GENERIC_SCISSOR_TL__TL_X_shift              = 0,
+	PA_SC_GENERIC_SCISSOR_TL__TL_Y_mask               = 0x3fff << 16,
+	PA_SC_GENERIC_SCISSOR_TL__TL_Y_shift              = 16,
+/* 	WINDOW_OFFSET_DISABLE_bit                         = 1 << 31, */
+    PA_SC_GENERIC_SCISSOR_BR                              = 0x00028244,
+	PA_SC_GENERIC_SCISSOR_BR__BR_X_mask               = 0x3fff << 0,
+	PA_SC_GENERIC_SCISSOR_BR__BR_X_shift              = 0,
+	PA_SC_GENERIC_SCISSOR_BR__BR_Y_mask               = 0x3fff << 16,
+	PA_SC_GENERIC_SCISSOR_BR__BR_Y_shift              = 16,
+    PA_SC_VPORT_SCISSOR_0_TL                              = 0x00028250,
+	PA_SC_VPORT_SCISSOR_0_TL_num                      = 16,
+	PA_SC_VPORT_SCISSOR_0_TL_offset                   = 8,
+	PA_SC_VPORT_SCISSOR_0_TL__TL_X_mask               = 0x3fff << 0,
+	PA_SC_VPORT_SCISSOR_0_TL__TL_X_shift              = 0,
+	PA_SC_VPORT_SCISSOR_0_TL__TL_Y_mask               = 0x3fff << 16,
+	PA_SC_VPORT_SCISSOR_0_TL__TL_Y_shift              = 16,
+/* 	WINDOW_OFFSET_DISABLE_bit                         = 1 << 31, */
+    PA_SC_VPORT_SCISSOR_0_BR                              = 0x00028254,
+	PA_SC_VPORT_SCISSOR_0_BR_num                      = 16,
+	PA_SC_VPORT_SCISSOR_0_BR_offset                   = 8,
+	PA_SC_VPORT_SCISSOR_0_BR__BR_X_mask               = 0x3fff << 0,
+	PA_SC_VPORT_SCISSOR_0_BR__BR_X_shift              = 0,
+	PA_SC_VPORT_SCISSOR_0_BR__BR_Y_mask               = 0x3fff << 16,
+	PA_SC_VPORT_SCISSOR_0_BR__BR_Y_shift              = 16,
+    PA_SC_VPORT_ZMIN_0                                    = 0x000282d0,
+	PA_SC_VPORT_ZMIN_0_num                            = 16,
+	PA_SC_VPORT_ZMIN_0_offset                         = 8,
+    PA_SC_VPORT_ZMAX_0                                    = 0x000282d4,
+	PA_SC_VPORT_ZMAX_0_num                            = 16,
+	PA_SC_VPORT_ZMAX_0_offset                         = 8,
+    SX_MISC                                               = 0x00028350,
+	MULTIPASS_bit                                     = 1 << 0,
+    SQ_VTX_SEMANTIC_0                                     = 0x00028380,
+	SQ_VTX_SEMANTIC_0_num                             = 32,
+/* 	SEMANTIC_ID_mask                                  = 0xff << 0, */
+/* 	SEMANTIC_ID_shift                                 = 0, */
+    VGT_MAX_VTX_INDX                                      = 0x00028400,
+    VGT_MIN_VTX_INDX                                      = 0x00028404,
+    VGT_INDX_OFFSET                                       = 0x00028408,
+    VGT_MULTI_PRIM_IB_RESET_INDX                          = 0x0002840c,
+    SX_ALPHA_TEST_CONTROL                                 = 0x00028410,
+	ALPHA_FUNC_mask                                   = 0x07 << 0,
+	ALPHA_FUNC_shift                                  = 0,
+	    REF_NEVER                                     = 0x00,
+	    REF_LESS                                      = 0x01,
+	    REF_EQUAL                                     = 0x02,
+	    REF_LEQUAL                                    = 0x03,
+	    REF_GREATER                                   = 0x04,
+	    REF_NOTEQUAL                                  = 0x05,
+	    REF_GEQUAL                                    = 0x06,
+	    REF_ALWAYS                                    = 0x07,
+	ALPHA_TEST_ENABLE_bit                             = 1 << 3,
+	ALPHA_TEST_BYPASS_bit                             = 1 << 8,
+    CB_BLEND_RED                                          = 0x00028414,
+    CB_BLEND_GREEN                                        = 0x00028418,
+    CB_BLEND_BLUE                                         = 0x0002841c,
+    CB_BLEND_ALPHA                                        = 0x00028420,
+    CB_FOG_RED                                            = 0x00028424,
+    CB_FOG_GREEN                                          = 0x00028428,
+    CB_FOG_BLUE                                           = 0x0002842c,
+    DB_STENCILREFMASK                                     = 0x00028430,
+	STENCILREF_mask                                   = 0xff << 0,
+	STENCILREF_shift                                  = 0,
+	STENCILMASK_mask                                  = 0xff << 8,
+	STENCILMASK_shift                                 = 8,
+	STENCILWRITEMASK_mask                             = 0xff << 16,
+	STENCILWRITEMASK_shift                            = 16,
+    DB_STENCILREFMASK_BF                                  = 0x00028434,
+	STENCILREF_BF_mask                                = 0xff << 0,
+	STENCILREF_BF_shift                               = 0,
+	STENCILMASK_BF_mask                               = 0xff << 8,
+	STENCILMASK_BF_shift                              = 8,
+	STENCILWRITEMASK_BF_mask                          = 0xff << 16,
+	STENCILWRITEMASK_BF_shift                         = 16,
+    SX_ALPHA_REF                                          = 0x00028438,
+    PA_CL_VPORT_XSCALE_0                                  = 0x0002843c,
+	PA_CL_VPORT_XSCALE_0_num                          = 16,
+	PA_CL_VPORT_XSCALE_0_offset                       = 24,
+    PA_CL_VPORT_XOFFSET_0                                 = 0x00028440,
+	PA_CL_VPORT_XOFFSET_0_num                         = 16,
+	PA_CL_VPORT_XOFFSET_0_offset                      = 24,
+    PA_CL_VPORT_YSCALE_0                                  = 0x00028444,
+	PA_CL_VPORT_YSCALE_0_num                          = 16,
+	PA_CL_VPORT_YSCALE_0_offset                       = 24,
+    PA_CL_VPORT_YOFFSET_0                                 = 0x00028448,
+	PA_CL_VPORT_YOFFSET_0_num                         = 16,
+	PA_CL_VPORT_YOFFSET_0_offset                      = 24,
+    PA_CL_VPORT_ZSCALE_0                                  = 0x0002844c,
+	PA_CL_VPORT_ZSCALE_0_num                          = 16,
+	PA_CL_VPORT_ZSCALE_0_offset                       = 24,
+    PA_CL_VPORT_ZOFFSET_0                                 = 0x00028450,
+	PA_CL_VPORT_ZOFFSET_0_num                         = 16,
+	PA_CL_VPORT_ZOFFSET_0_offset                      = 24,
+    SPI_VS_OUT_ID_0                                       = 0x00028614,
+	SPI_VS_OUT_ID_0_num                               = 10,
+	SEMANTIC_0_mask                                   = 0xff << 0,
+	SEMANTIC_0_shift                                  = 0,
+	SEMANTIC_1_mask                                   = 0xff << 8,
+	SEMANTIC_1_shift                                  = 8,
+	SEMANTIC_2_mask                                   = 0xff << 16,
+	SEMANTIC_2_shift                                  = 16,
+	SEMANTIC_3_mask                                   = 0xff << 24,
+	SEMANTIC_3_shift                                  = 24,
+    SPI_PS_INPUT_CNTL_0                                   = 0x00028644,
+	SPI_PS_INPUT_CNTL_0_num                           = 32,
+	SEMANTIC_mask                                     = 0xff << 0,
+	SEMANTIC_shift                                    = 0,
+	DEFAULT_VAL_mask                                  = 0x03 << 8,
+	DEFAULT_VAL_shift                                 = 8,
+	    X_0_0F                                        = 0x00,
+	FLAT_SHADE_bit                                    = 1 << 10,
+	SEL_CENTROID_bit                                  = 1 << 11,
+	SEL_LINEAR_bit                                    = 1 << 12,
+	CYL_WRAP_mask                                     = 0x0f << 13,
+	CYL_WRAP_shift                                    = 13,
+	PT_SPRITE_TEX_bit                                 = 1 << 17,
+	SEL_SAMPLE_bit                                    = 1 << 18,
+    SPI_VS_OUT_CONFIG                                     = 0x000286c4,
+	VS_PER_COMPONENT_bit                              = 1 << 0,
+	VS_EXPORT_COUNT_mask                              = 0x1f << 1,
+	VS_EXPORT_COUNT_shift                             = 1,
+	VS_EXPORTS_FOG_bit                                = 1 << 8,
+	VS_OUT_FOG_VEC_ADDR_mask                          = 0x1f << 9,
+	VS_OUT_FOG_VEC_ADDR_shift                         = 9,
+    SPI_PS_IN_CONTROL_0                                   = 0x000286cc,
+	NUM_INTERP_mask                                   = 0x3f << 0,
+	NUM_INTERP_shift                                  = 0,
+	POSITION_ENA_bit                                  = 1 << 8,
+	POSITION_CENTROID_bit                             = 1 << 9,
+	POSITION_ADDR_mask                                = 0x1f << 10,
+	POSITION_ADDR_shift                               = 10,
+	PARAM_GEN_mask                                    = 0x0f << 15,
+	PARAM_GEN_shift                                   = 15,
+	PARAM_GEN_ADDR_mask                               = 0x7f << 19,
+	PARAM_GEN_ADDR_shift                              = 19,
+	BARYC_SAMPLE_CNTL_mask                            = 0x03 << 26,
+	BARYC_SAMPLE_CNTL_shift                           = 26,
+	    CENTROIDS_ONLY                                = 0x00,
+	    CENTERS_ONLY                                  = 0x01,
+	    CENTROIDS_AND_CENTERS                         = 0x02,
+	    UNDEF                                         = 0x03,
+	PERSP_GRADIENT_ENA_bit                            = 1 << 28,
+	LINEAR_GRADIENT_ENA_bit                           = 1 << 29,
+	POSITION_SAMPLE_bit                               = 1 << 30,
+	BARYC_AT_SAMPLE_ENA_bit                           = 1 << 31,
+    SPI_PS_IN_CONTROL_1                                   = 0x000286d0,
+	GEN_INDEX_PIX_bit                                 = 1 << 0,
+	GEN_INDEX_PIX_ADDR_mask                           = 0x7f << 1,
+	GEN_INDEX_PIX_ADDR_shift                          = 1,
+	FRONT_FACE_ENA_bit                                = 1 << 8,
+	FRONT_FACE_CHAN_mask                              = 0x03 << 9,
+	FRONT_FACE_CHAN_shift                             = 9,
+	FRONT_FACE_ALL_BITS_bit                           = 1 << 11,
+	FRONT_FACE_ADDR_mask                              = 0x1f << 12,
+	FRONT_FACE_ADDR_shift                             = 12,
+	FOG_ADDR_mask                                     = 0x7f << 17,
+	FOG_ADDR_shift                                    = 17,
+	FIXED_PT_POSITION_ENA_bit                         = 1 << 24,
+	FIXED_PT_POSITION_ADDR_mask                       = 0x1f << 25,
+	FIXED_PT_POSITION_ADDR_shift                      = 25,
+    SPI_INTERP_CONTROL_0                                  = 0x000286d4,
+	FLAT_SHADE_ENA_bit                                = 1 << 0,
+	PNT_SPRITE_ENA_bit                                = 1 << 1,
+	PNT_SPRITE_OVRD_X_mask                            = 0x07 << 2,
+	PNT_SPRITE_OVRD_X_shift                           = 2,
+	    SPI_PNT_SPRITE_SEL_0                          = 0x00,
+	    SPI_PNT_SPRITE_SEL_1                          = 0x01,
+	    SPI_PNT_SPRITE_SEL_S                          = 0x02,
+	    SPI_PNT_SPRITE_SEL_T                          = 0x03,
+	    SPI_PNT_SPRITE_SEL_NONE                       = 0x04,
+	PNT_SPRITE_OVRD_Y_mask                            = 0x07 << 5,
+	PNT_SPRITE_OVRD_Y_shift                           = 5,
+/* 	    SPI_PNT_SPRITE_SEL_0                          = 0x00, */
+/* 	    SPI_PNT_SPRITE_SEL_1                          = 0x01, */
+/* 	    SPI_PNT_SPRITE_SEL_S                          = 0x02, */
+/* 	    SPI_PNT_SPRITE_SEL_T                          = 0x03, */
+/* 	    SPI_PNT_SPRITE_SEL_NONE                       = 0x04, */
+	PNT_SPRITE_OVRD_Z_mask                            = 0x07 << 8,
+	PNT_SPRITE_OVRD_Z_shift                           = 8,
+/* 	    SPI_PNT_SPRITE_SEL_0                          = 0x00, */
+/* 	    SPI_PNT_SPRITE_SEL_1                          = 0x01, */
+/* 	    SPI_PNT_SPRITE_SEL_S                          = 0x02, */
+/* 	    SPI_PNT_SPRITE_SEL_T                          = 0x03, */
+/* 	    SPI_PNT_SPRITE_SEL_NONE                       = 0x04, */
+	PNT_SPRITE_OVRD_W_mask                            = 0x07 << 11,
+	PNT_SPRITE_OVRD_W_shift                           = 11,
+/* 	    SPI_PNT_SPRITE_SEL_0                          = 0x00, */
+/* 	    SPI_PNT_SPRITE_SEL_1                          = 0x01, */
+/* 	    SPI_PNT_SPRITE_SEL_S                          = 0x02, */
+/* 	    SPI_PNT_SPRITE_SEL_T                          = 0x03, */
+/* 	    SPI_PNT_SPRITE_SEL_NONE                       = 0x04, */
+	PNT_SPRITE_TOP_1_bit                              = 1 << 14,
+    SPI_INPUT_Z                                           = 0x000286d8,
+	PROVIDE_Z_TO_SPI_bit                              = 1 << 0,
+    SPI_FOG_CNTL                                          = 0x000286dc,
+	PASS_FOG_THROUGH_PS_bit                           = 1 << 0,
+	PIXEL_FOG_FUNC_mask                               = 0x03 << 1,
+	PIXEL_FOG_FUNC_shift                              = 1,
+	    SPI_FOG_NONE                                  = 0x00,
+	    SPI_FOG_EXP                                   = 0x01,
+	    SPI_FOG_EXP2                                  = 0x02,
+	    SPI_FOG_LINEAR                                = 0x03,
+	PIXEL_FOG_SRC_SEL_bit                             = 1 << 3,
+	VS_FOG_CLAMP_DISABLE_bit                          = 1 << 4,
+    SPI_FOG_FUNC_SCALE                                    = 0x000286e0,
+    SPI_FOG_FUNC_BIAS                                     = 0x000286e4,
+    CB_BLEND0_CONTROL                                     = 0x00028780,
+	CB_BLEND0_CONTROL_num                             = 8,
+	COLOR_SRCBLEND_mask                               = 0x1f << 0,
+	COLOR_SRCBLEND_shift                              = 0,
+	COLOR_COMB_FCN_mask                               = 0x07 << 5,
+	COLOR_COMB_FCN_shift                              = 5,
+	COLOR_DESTBLEND_mask                              = 0x1f << 8,
+	COLOR_DESTBLEND_shift                             = 8,
+	OPACITY_WEIGHT_bit                                = 1 << 13,
+	ALPHA_SRCBLEND_mask                               = 0x1f << 16,
+	ALPHA_SRCBLEND_shift                              = 16,
+	ALPHA_COMB_FCN_mask                               = 0x07 << 21,
+	ALPHA_COMB_FCN_shift                              = 21,
+	ALPHA_DESTBLEND_mask                              = 0x1f << 24,
+	ALPHA_DESTBLEND_shift                             = 24,
+	SEPARATE_ALPHA_BLEND_bit                          = 1 << 29,
+    VGT_DMA_BASE_HI                                       = 0x000287e4,
+	VGT_DMA_BASE_HI__BASE_ADDR_mask                   = 0xff << 0,
+	VGT_DMA_BASE_HI__BASE_ADDR_shift                  = 0,
+    VGT_DMA_BASE                                          = 0x000287e8,
+    VGT_DRAW_INITIATOR                                    = 0x000287f0,
+	SOURCE_SELECT_mask                                = 0x03 << 0,
+	SOURCE_SELECT_shift                               = 0,
+	    DI_SRC_SEL_DMA                                = 0x00,
+	    DI_SRC_SEL_IMMEDIATE                          = 0x01,
+	    DI_SRC_SEL_AUTO_INDEX                         = 0x02,
+	    DI_SRC_SEL_RESERVED                           = 0x03,
+	MAJOR_MODE_mask                                   = 0x03 << 2,
+	MAJOR_MODE_shift                                  = 2,
+	    DI_MAJOR_MODE_0                               = 0x00,
+	    DI_MAJOR_MODE_1                               = 0x01,
+	SPRITE_EN_bit                                     = 1 << 4,
+	NOT_EOP_bit                                       = 1 << 5,
+	USE_OPAQUE_bit                                    = 1 << 6,
+    VGT_IMMED_DATA                                        = 0x000287f4,
+    VGT_EVENT_ADDRESS_REG                                 = 0x000287f8,
+	ADDRESS_LOW_mask                                  = 0xfffffff << 0,
+	ADDRESS_LOW_shift                                 = 0,
+    DB_DEPTH_CONTROL                                      = 0x00028800,
+	STENCIL_ENABLE_bit                                = 1 << 0,
+	Z_ENABLE_bit                                      = 1 << 1,
+	Z_WRITE_ENABLE_bit                                = 1 << 2,
+	ZFUNC_mask                                        = 0x07 << 4,
+	ZFUNC_shift                                       = 4,
+	    FRAG_NEVER                                    = 0x00,
+	    FRAG_LESS                                     = 0x01,
+	    FRAG_EQUAL                                    = 0x02,
+	    FRAG_LEQUAL                                   = 0x03,
+	    FRAG_GREATER                                  = 0x04,
+	    FRAG_NOTEQUAL                                 = 0x05,
+	    FRAG_GEQUAL                                   = 0x06,
+	    FRAG_ALWAYS                                   = 0x07,
+	BACKFACE_ENABLE_bit                               = 1 << 7,
+	STENCILFUNC_mask                                  = 0x07 << 8,
+	STENCILFUNC_shift                                 = 8,
+/* 	    REF_NEVER                                     = 0x00, */
+/* 	    REF_LESS                                      = 0x01, */
+/* 	    REF_EQUAL                                     = 0x02, */
+/* 	    REF_LEQUAL                                    = 0x03, */
+/* 	    REF_GREATER                                   = 0x04, */
+/* 	    REF_NOTEQUAL                                  = 0x05, */
+/* 	    REF_GEQUAL                                    = 0x06, */
+/* 	    REF_ALWAYS                                    = 0x07, */
+	STENCILFAIL_mask                                  = 0x07 << 11,
+	STENCILFAIL_shift                                 = 11,
+	    STENCIL_KEEP                                  = 0x00,
+	    STENCIL_ZERO                                  = 0x01,
+	    STENCIL_REPLACE                               = 0x02,
+	    STENCIL_INCR_CLAMP                            = 0x03,
+	    STENCIL_DECR_CLAMP                            = 0x04,
+	    STENCIL_INVERT                                = 0x05,
+	    STENCIL_INCR_WRAP                             = 0x06,
+	    STENCIL_DECR_WRAP                             = 0x07,
+	STENCILZPASS_mask                                 = 0x07 << 14,
+	STENCILZPASS_shift                                = 14,
+/* 	    STENCIL_KEEP                                  = 0x00, */
+/* 	    STENCIL_ZERO                                  = 0x01, */
+/* 	    STENCIL_REPLACE                               = 0x02, */
+/* 	    STENCIL_INCR_CLAMP                            = 0x03, */
+/* 	    STENCIL_DECR_CLAMP                            = 0x04, */
+/* 	    STENCIL_INVERT                                = 0x05, */
+/* 	    STENCIL_INCR_WRAP                             = 0x06, */
+/* 	    STENCIL_DECR_WRAP                             = 0x07, */
+	STENCILZFAIL_mask                                 = 0x07 << 17,
+	STENCILZFAIL_shift                                = 17,
+/* 	    STENCIL_KEEP                                  = 0x00, */
+/* 	    STENCIL_ZERO                                  = 0x01, */
+/* 	    STENCIL_REPLACE                               = 0x02, */
+/* 	    STENCIL_INCR_CLAMP                            = 0x03, */
+/* 	    STENCIL_DECR_CLAMP                            = 0x04, */
+/* 	    STENCIL_INVERT                                = 0x05, */
+/* 	    STENCIL_INCR_WRAP                             = 0x06, */
+/* 	    STENCIL_DECR_WRAP                             = 0x07, */
+	STENCILFUNC_BF_mask                               = 0x07 << 20,
+	STENCILFUNC_BF_shift                              = 20,
+/* 	    REF_NEVER                                     = 0x00, */
+/* 	    REF_LESS                                      = 0x01, */
+/* 	    REF_EQUAL                                     = 0x02, */
+/* 	    REF_LEQUAL                                    = 0x03, */
+/* 	    REF_GREATER                                   = 0x04, */
+/* 	    REF_NOTEQUAL                                  = 0x05, */
+/* 	    REF_GEQUAL                                    = 0x06, */
+/* 	    REF_ALWAYS                                    = 0x07, */
+	STENCILFAIL_BF_mask                               = 0x07 << 23,
+	STENCILFAIL_BF_shift                              = 23,
+/* 	    STENCIL_KEEP                                  = 0x00, */
+/* 	    STENCIL_ZERO                                  = 0x01, */
+/* 	    STENCIL_REPLACE                               = 0x02, */
+/* 	    STENCIL_INCR_CLAMP                            = 0x03, */
+/* 	    STENCIL_DECR_CLAMP                            = 0x04, */
+/* 	    STENCIL_INVERT                                = 0x05, */
+/* 	    STENCIL_INCR_WRAP                             = 0x06, */
+/* 	    STENCIL_DECR_WRAP                             = 0x07, */
+	STENCILZPASS_BF_mask                              = 0x07 << 26,
+	STENCILZPASS_BF_shift                             = 26,
+/* 	    STENCIL_KEEP                                  = 0x00, */
+/* 	    STENCIL_ZERO                                  = 0x01, */
+/* 	    STENCIL_REPLACE                               = 0x02, */
+/* 	    STENCIL_INCR_CLAMP                            = 0x03, */
+/* 	    STENCIL_DECR_CLAMP                            = 0x04, */
+/* 	    STENCIL_INVERT                                = 0x05, */
+/* 	    STENCIL_INCR_WRAP                             = 0x06, */
+/* 	    STENCIL_DECR_WRAP                             = 0x07, */
+	STENCILZFAIL_BF_mask                              = 0x07 << 29,
+	STENCILZFAIL_BF_shift                             = 29,
+/* 	    STENCIL_KEEP                                  = 0x00, */
+/* 	    STENCIL_ZERO                                  = 0x01, */
+/* 	    STENCIL_REPLACE                               = 0x02, */
+/* 	    STENCIL_INCR_CLAMP                            = 0x03, */
+/* 	    STENCIL_DECR_CLAMP                            = 0x04, */
+/* 	    STENCIL_INVERT                                = 0x05, */
+/* 	    STENCIL_INCR_WRAP                             = 0x06, */
+/* 	    STENCIL_DECR_WRAP                             = 0x07, */
+    CB_BLEND_CONTROL                                      = 0x00028804,
+/* 	COLOR_SRCBLEND_mask                               = 0x1f << 0, */
+/* 	COLOR_SRCBLEND_shift                              = 0, */
+	    BLEND_ZERO                                    = 0x00,
+	    BLEND_ONE                                     = 0x01,
+	    BLEND_SRC_COLOR                               = 0x02,
+	    BLEND_ONE_MINUS_SRC_COLOR                     = 0x03,
+	    BLEND_SRC_ALPHA                               = 0x04,
+	    BLEND_ONE_MINUS_SRC_ALPHA                     = 0x05,
+	    BLEND_DST_ALPHA                               = 0x06,
+	    BLEND_ONE_MINUS_DST_ALPHA                     = 0x07,
+	    BLEND_DST_COLOR                               = 0x08,
+	    BLEND_ONE_MINUS_DST_COLOR                     = 0x09,
+	    BLEND_SRC_ALPHA_SATURATE                      = 0x0a,
+	    BLEND_BOTH_SRC_ALPHA                          = 0x0b,
+	    BLEND_BOTH_INV_SRC_ALPHA                      = 0x0c,
+	    BLEND_CONSTANT_COLOR                          = 0x0d,
+	    BLEND_ONE_MINUS_CONSTANT_COLOR                = 0x0e,
+	    BLEND_SRC1_COLOR                              = 0x0f,
+	    BLEND_INV_SRC1_COLOR                          = 0x10,
+	    BLEND_SRC1_ALPHA                              = 0x11,
+	    BLEND_INV_SRC1_ALPHA                          = 0x12,
+	    BLEND_CONSTANT_ALPHA                          = 0x13,
+	    BLEND_ONE_MINUS_CONSTANT_ALPHA                = 0x14,
+/* 	COLOR_COMB_FCN_mask                               = 0x07 << 5, */
+/* 	COLOR_COMB_FCN_shift                              = 5, */
+	    COMB_DST_PLUS_SRC                             = 0x00,
+	    COMB_SRC_MINUS_DST                            = 0x01,
+	    COMB_MIN_DST_SRC                              = 0x02,
+	    COMB_MAX_DST_SRC                              = 0x03,
+	    COMB_DST_MINUS_SRC                            = 0x04,
+/* 	COLOR_DESTBLEND_mask                              = 0x1f << 8, */
+/* 	COLOR_DESTBLEND_shift                             = 8, */
+/* 	    BLEND_ZERO                                    = 0x00, */
+/* 	    BLEND_ONE                                     = 0x01, */
+/* 	    BLEND_SRC_COLOR                               = 0x02, */
+/* 	    BLEND_ONE_MINUS_SRC_COLOR                     = 0x03, */
+/* 	    BLEND_SRC_ALPHA                               = 0x04, */
+/* 	    BLEND_ONE_MINUS_SRC_ALPHA                     = 0x05, */
+/* 	    BLEND_DST_ALPHA                               = 0x06, */
+/* 	    BLEND_ONE_MINUS_DST_ALPHA                     = 0x07, */
+/* 	    BLEND_DST_COLOR                               = 0x08, */
+/* 	    BLEND_ONE_MINUS_DST_COLOR                     = 0x09, */
+/* 	    BLEND_SRC_ALPHA_SATURATE                      = 0x0a, */
+/* 	    BLEND_BOTH_SRC_ALPHA                          = 0x0b, */
+/* 	    BLEND_BOTH_INV_SRC_ALPHA                      = 0x0c, */
+/* 	    BLEND_CONSTANT_COLOR                          = 0x0d, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_COLOR                = 0x0e, */
+/* 	    BLEND_SRC1_COLOR                              = 0x0f, */
+/* 	    BLEND_INV_SRC1_COLOR                          = 0x10, */
+/* 	    BLEND_SRC1_ALPHA                              = 0x11, */
+/* 	    BLEND_INV_SRC1_ALPHA                          = 0x12, */
+/* 	    BLEND_CONSTANT_ALPHA                          = 0x13, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_ALPHA                = 0x14, */
+/* 	OPACITY_WEIGHT_bit                                = 1 << 13, */
+/* 	ALPHA_SRCBLEND_mask                               = 0x1f << 16, */
+/* 	ALPHA_SRCBLEND_shift                              = 16, */
+/* 	    BLEND_ZERO                                    = 0x00, */
+/* 	    BLEND_ONE                                     = 0x01, */
+/* 	    BLEND_SRC_COLOR                               = 0x02, */
+/* 	    BLEND_ONE_MINUS_SRC_COLOR                     = 0x03, */
+/* 	    BLEND_SRC_ALPHA                               = 0x04, */
+/* 	    BLEND_ONE_MINUS_SRC_ALPHA                     = 0x05, */
+/* 	    BLEND_DST_ALPHA                               = 0x06, */
+/* 	    BLEND_ONE_MINUS_DST_ALPHA                     = 0x07, */
+/* 	    BLEND_DST_COLOR                               = 0x08, */
+/* 	    BLEND_ONE_MINUS_DST_COLOR                     = 0x09, */
+/* 	    BLEND_SRC_ALPHA_SATURATE                      = 0x0a, */
+/* 	    BLEND_BOTH_SRC_ALPHA                          = 0x0b, */
+/* 	    BLEND_BOTH_INV_SRC_ALPHA                      = 0x0c, */
+/* 	    BLEND_CONSTANT_COLOR                          = 0x0d, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_COLOR                = 0x0e, */
+/* 	    BLEND_SRC1_COLOR                              = 0x0f, */
+/* 	    BLEND_INV_SRC1_COLOR                          = 0x10, */
+/* 	    BLEND_SRC1_ALPHA                              = 0x11, */
+/* 	    BLEND_INV_SRC1_ALPHA                          = 0x12, */
+/* 	    BLEND_CONSTANT_ALPHA                          = 0x13, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_ALPHA                = 0x14, */
+/* 	ALPHA_COMB_FCN_mask                               = 0x07 << 21, */
+/* 	ALPHA_COMB_FCN_shift                              = 21, */
+/* 	    COMB_DST_PLUS_SRC                             = 0x00, */
+/* 	    COMB_SRC_MINUS_DST                            = 0x01, */
+/* 	    COMB_MIN_DST_SRC                              = 0x02, */
+/* 	    COMB_MAX_DST_SRC                              = 0x03, */
+/* 	    COMB_DST_MINUS_SRC                            = 0x04, */
+/* 	ALPHA_DESTBLEND_mask                              = 0x1f << 24, */
+/* 	ALPHA_DESTBLEND_shift                             = 24, */
+/* 	    BLEND_ZERO                                    = 0x00, */
+/* 	    BLEND_ONE                                     = 0x01, */
+/* 	    BLEND_SRC_COLOR                               = 0x02, */
+/* 	    BLEND_ONE_MINUS_SRC_COLOR                     = 0x03, */
+/* 	    BLEND_SRC_ALPHA                               = 0x04, */
+/* 	    BLEND_ONE_MINUS_SRC_ALPHA                     = 0x05, */
+/* 	    BLEND_DST_ALPHA                               = 0x06, */
+/* 	    BLEND_ONE_MINUS_DST_ALPHA                     = 0x07, */
+/* 	    BLEND_DST_COLOR                               = 0x08, */
+/* 	    BLEND_ONE_MINUS_DST_COLOR                     = 0x09, */
+/* 	    BLEND_SRC_ALPHA_SATURATE                      = 0x0a, */
+/* 	    BLEND_BOTH_SRC_ALPHA                          = 0x0b, */
+/* 	    BLEND_BOTH_INV_SRC_ALPHA                      = 0x0c, */
+/* 	    BLEND_CONSTANT_COLOR                          = 0x0d, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_COLOR                = 0x0e, */
+/* 	    BLEND_SRC1_COLOR                              = 0x0f, */
+/* 	    BLEND_INV_SRC1_COLOR                          = 0x10, */
+/* 	    BLEND_SRC1_ALPHA                              = 0x11, */
+/* 	    BLEND_INV_SRC1_ALPHA                          = 0x12, */
+/* 	    BLEND_CONSTANT_ALPHA                          = 0x13, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_ALPHA                = 0x14, */
+/* 	SEPARATE_ALPHA_BLEND_bit                          = 1 << 29, */
+    CB_COLOR_CONTROL                                      = 0x00028808,
+	FOG_ENABLE_bit                                    = 1 << 0,
+	MULTIWRITE_ENABLE_bit                             = 1 << 1,
+	DITHER_ENABLE_bit                                 = 1 << 2,
+	DEGAMMA_ENABLE_bit                                = 1 << 3,
+	SPECIAL_OP_mask                                   = 0x07 << 4,
+	SPECIAL_OP_shift                                  = 4,
+	    SPECIAL_NORMAL                                = 0x00,
+	    SPECIAL_DISABLE                               = 0x01,
+	    SPECIAL_FAST_CLEAR                            = 0x02,
+	    SPECIAL_FORCE_CLEAR                           = 0x03,
+	    SPECIAL_EXPAND_COLOR                          = 0x04,
+	    SPECIAL_EXPAND_TEXTURE                        = 0x05,
+	    SPECIAL_EXPAND_SAMPLES                        = 0x06,
+	    SPECIAL_RESOLVE_BOX                           = 0x07,
+	PER_MRT_BLEND_bit                                 = 1 << 7,
+	TARGET_BLEND_ENABLE_mask                          = 0xff << 8,
+	TARGET_BLEND_ENABLE_shift                         = 8,
+	ROP3_mask                                         = 0xff << 16,
+	ROP3_shift                                        = 16,
+    DB_SHADER_CONTROL                                     = 0x0002880c,
+	Z_EXPORT_ENABLE_bit                               = 1 << 0,
+	STENCIL_REF_EXPORT_ENABLE_bit                     = 1 << 1,
+	Z_ORDER_mask                                      = 0x03 << 4,
+	Z_ORDER_shift                                     = 4,
+	    LATE_Z                                        = 0x00,
+	    EARLY_Z_THEN_LATE_Z                           = 0x01,
+	    RE_Z                                          = 0x02,
+	    EARLY_Z_THEN_RE_Z                             = 0x03,
+	KILL_ENABLE_bit                                   = 1 << 6,
+	COVERAGE_TO_MASK_ENABLE_bit                       = 1 << 7,
+	MASK_EXPORT_ENABLE_bit                            = 1 << 8,
+	DUAL_EXPORT_ENABLE_bit                            = 1 << 9,
+	EXEC_ON_HIER_FAIL_bit                             = 1 << 10,
+	EXEC_ON_NOOP_bit                                  = 1 << 11,
+    PA_CL_CLIP_CNTL                                       = 0x00028810,
+	UCP_ENA_0_bit                                     = 1 << 0,
+	UCP_ENA_1_bit                                     = 1 << 1,
+	UCP_ENA_2_bit                                     = 1 << 2,
+	UCP_ENA_3_bit                                     = 1 << 3,
+	UCP_ENA_4_bit                                     = 1 << 4,
+	UCP_ENA_5_bit                                     = 1 << 5,
+	PS_UCP_Y_SCALE_NEG_bit                            = 1 << 13,
+	PS_UCP_MODE_mask                                  = 0x03 << 14,
+	PS_UCP_MODE_shift                                 = 14,
+	CLIP_DISABLE_bit                                  = 1 << 16,
+	UCP_CULL_ONLY_ENA_bit                             = 1 << 17,
+	BOUNDARY_EDGE_FLAG_ENA_bit                        = 1 << 18,
+	DX_CLIP_SPACE_DEF_bit                             = 1 << 19,
+	DIS_CLIP_ERR_DETECT_bit                           = 1 << 20,
+	VTX_KILL_OR_bit                                   = 1 << 21,
+	DX_LINEAR_ATTR_CLIP_ENA_bit                       = 1 << 24,
+	VTE_VPORT_PROVOKE_DISABLE_bit                     = 1 << 25,
+	ZCLIP_NEAR_DISABLE_bit                            = 1 << 26,
+	ZCLIP_FAR_DISABLE_bit                             = 1 << 27,
+    PA_SU_SC_MODE_CNTL                                    = 0x00028814,
+	CULL_FRONT_bit                                    = 1 << 0,
+	CULL_BACK_bit                                     = 1 << 1,
+	FACE_bit                                          = 1 << 2,
+	POLY_MODE_mask                                    = 0x03 << 3,
+	POLY_MODE_shift                                   = 3,
+	    X_DISABLE_POLY_MODE                           = 0x00,
+	    X_DUAL_MODE                                   = 0x01,
+	POLYMODE_FRONT_PTYPE_mask                         = 0x07 << 5,
+	POLYMODE_FRONT_PTYPE_shift                        = 5,
+	    X_DRAW_POINTS                                 = 0x00,
+	    X_DRAW_LINES                                  = 0x01,
+	    X_DRAW_TRIANGLES                              = 0x02,
+	POLYMODE_BACK_PTYPE_mask                          = 0x07 << 8,
+	POLYMODE_BACK_PTYPE_shift                         = 8,
+/* 	    X_DRAW_POINTS                                 = 0x00, */
+/* 	    X_DRAW_LINES                                  = 0x01, */
+/* 	    X_DRAW_TRIANGLES                              = 0x02, */
+	POLY_OFFSET_FRONT_ENABLE_bit                      = 1 << 11,
+	POLY_OFFSET_BACK_ENABLE_bit                       = 1 << 12,
+	POLY_OFFSET_PARA_ENABLE_bit                       = 1 << 13,
+	VTX_WINDOW_OFFSET_ENABLE_bit                      = 1 << 16,
+	PROVOKING_VTX_LAST_bit                            = 1 << 19,
+	PERSP_CORR_DIS_bit                                = 1 << 20,
+	MULTI_PRIM_IB_ENA_bit                             = 1 << 21,
+    PA_CL_VTE_CNTL                                        = 0x00028818,
+	VPORT_X_SCALE_ENA_bit                             = 1 << 0,
+	VPORT_X_OFFSET_ENA_bit                            = 1 << 1,
+	VPORT_Y_SCALE_ENA_bit                             = 1 << 2,
+	VPORT_Y_OFFSET_ENA_bit                            = 1 << 3,
+	VPORT_Z_SCALE_ENA_bit                             = 1 << 4,
+	VPORT_Z_OFFSET_ENA_bit                            = 1 << 5,
+	VTX_XY_FMT_bit                                    = 1 << 8,
+	VTX_Z_FMT_bit                                     = 1 << 9,
+	VTX_W0_FMT_bit                                    = 1 << 10,
+	PERFCOUNTER_REF_bit                               = 1 << 11,
+    PA_CL_VS_OUT_CNTL                                     = 0x0002881c,
+	CLIP_DIST_ENA_0_bit                               = 1 << 0,
+	CLIP_DIST_ENA_1_bit                               = 1 << 1,
+	CLIP_DIST_ENA_2_bit                               = 1 << 2,
+	CLIP_DIST_ENA_3_bit                               = 1 << 3,
+	CLIP_DIST_ENA_4_bit                               = 1 << 4,
+	CLIP_DIST_ENA_5_bit                               = 1 << 5,
+	CLIP_DIST_ENA_6_bit                               = 1 << 6,
+	CLIP_DIST_ENA_7_bit                               = 1 << 7,
+	CULL_DIST_ENA_0_bit                               = 1 << 8,
+	CULL_DIST_ENA_1_bit                               = 1 << 9,
+	CULL_DIST_ENA_2_bit                               = 1 << 10,
+	CULL_DIST_ENA_3_bit                               = 1 << 11,
+	CULL_DIST_ENA_4_bit                               = 1 << 12,
+	CULL_DIST_ENA_5_bit                               = 1 << 13,
+	CULL_DIST_ENA_6_bit                               = 1 << 14,
+	CULL_DIST_ENA_7_bit                               = 1 << 15,
+	USE_VTX_POINT_SIZE_bit                            = 1 << 16,
+	USE_VTX_EDGE_FLAG_bit                             = 1 << 17,
+	USE_VTX_RENDER_TARGET_INDX_bit                    = 1 << 18,
+	USE_VTX_VIEWPORT_INDX_bit                         = 1 << 19,
+	USE_VTX_KILL_FLAG_bit                             = 1 << 20,
+	VS_OUT_MISC_VEC_ENA_bit                           = 1 << 21,
+	VS_OUT_CCDIST0_VEC_ENA_bit                        = 1 << 22,
+	VS_OUT_CCDIST1_VEC_ENA_bit                        = 1 << 23,
+    PA_CL_NANINF_CNTL                                     = 0x00028820,
+	VTE_XY_INF_DISCARD_bit                            = 1 << 0,
+	VTE_Z_INF_DISCARD_bit                             = 1 << 1,
+	VTE_W_INF_DISCARD_bit                             = 1 << 2,
+	VTE_0XNANINF_IS_0_bit                             = 1 << 3,
+	VTE_XY_NAN_RETAIN_bit                             = 1 << 4,
+	VTE_Z_NAN_RETAIN_bit                              = 1 << 5,
+	VTE_W_NAN_RETAIN_bit                              = 1 << 6,
+	VTE_W_RECIP_NAN_IS_0_bit                          = 1 << 7,
+	VS_XY_NAN_TO_INF_bit                              = 1 << 8,
+	VS_XY_INF_RETAIN_bit                              = 1 << 9,
+	VS_Z_NAN_TO_INF_bit                               = 1 << 10,
+	VS_Z_INF_RETAIN_bit                               = 1 << 11,
+	VS_W_NAN_TO_INF_bit                               = 1 << 12,
+	VS_W_INF_RETAIN_bit                               = 1 << 13,
+	VS_CLIP_DIST_INF_DISCARD_bit                      = 1 << 14,
+	VTE_NO_OUTPUT_NEG_0_bit                           = 1 << 20,
+    SQ_PGM_START_PS                                       = 0x00028840,
+    SQ_PGM_RESOURCES_PS                                   = 0x00028850,
+	NUM_GPRS_mask                                     = 0xff << 0,
+	NUM_GPRS_shift                                    = 0,
+	STACK_SIZE_mask                                   = 0xff << 8,
+	STACK_SIZE_shift                                  = 8,
+	SQ_PGM_RESOURCES_PS__DX10_CLAMP_bit               = 1 << 21,
+	FETCH_CACHE_LINES_mask                            = 0x07 << 24,
+	FETCH_CACHE_LINES_shift                           = 24,
+	UNCACHED_FIRST_INST_bit                           = 1 << 28,
+	CLAMP_CONSTS_bit                                  = 1 << 31,
+    SQ_PGM_EXPORTS_PS                                     = 0x00028854,
+	EXPORT_MODE_mask                                  = 0x1f << 0,
+	EXPORT_MODE_shift                                 = 0,
+    SQ_PGM_START_VS                                       = 0x00028858,
+    SQ_PGM_RESOURCES_VS                                   = 0x00028868,
+/* 	NUM_GPRS_mask                                     = 0xff << 0, */
+/* 	NUM_GPRS_shift                                    = 0, */
+/* 	STACK_SIZE_mask                                   = 0xff << 8, */
+/* 	STACK_SIZE_shift                                  = 8, */
+	SQ_PGM_RESOURCES_VS__DX10_CLAMP_bit               = 1 << 21,
+/* 	FETCH_CACHE_LINES_mask                            = 0x07 << 24, */
+/* 	FETCH_CACHE_LINES_shift                           = 24, */
+/* 	UNCACHED_FIRST_INST_bit                           = 1 << 28, */
+    SQ_PGM_START_GS                                       = 0x0002886c,
+    SQ_PGM_RESOURCES_GS                                   = 0x0002887c,
+/* 	NUM_GPRS_mask                                     = 0xff << 0, */
+/* 	NUM_GPRS_shift                                    = 0, */
+/* 	STACK_SIZE_mask                                   = 0xff << 8, */
+/* 	STACK_SIZE_shift                                  = 8, */
+	SQ_PGM_RESOURCES_GS__DX10_CLAMP_bit               = 1 << 21,
+/* 	FETCH_CACHE_LINES_mask                            = 0x07 << 24, */
+/* 	FETCH_CACHE_LINES_shift                           = 24, */
+/* 	UNCACHED_FIRST_INST_bit                           = 1 << 28, */
+    SQ_PGM_START_ES                                       = 0x00028880,
+    SQ_PGM_RESOURCES_ES                                   = 0x00028890,
+/* 	NUM_GPRS_mask                                     = 0xff << 0, */
+/* 	NUM_GPRS_shift                                    = 0, */
+/* 	STACK_SIZE_mask                                   = 0xff << 8, */
+/* 	STACK_SIZE_shift                                  = 8, */
+	SQ_PGM_RESOURCES_ES__DX10_CLAMP_bit               = 1 << 21,
+/* 	FETCH_CACHE_LINES_mask                            = 0x07 << 24, */
+/* 	FETCH_CACHE_LINES_shift                           = 24, */
+/* 	UNCACHED_FIRST_INST_bit                           = 1 << 28, */
+    SQ_PGM_START_FS                                       = 0x00028894,
+    SQ_PGM_RESOURCES_FS                                   = 0x000288a4,
+/* 	NUM_GPRS_mask                                     = 0xff << 0, */
+/* 	NUM_GPRS_shift                                    = 0, */
+/* 	STACK_SIZE_mask                                   = 0xff << 8, */
+/* 	STACK_SIZE_shift                                  = 8, */
+	SQ_PGM_RESOURCES_FS__DX10_CLAMP_bit               = 1 << 21,
+    SQ_ESGS_RING_ITEMSIZE                                 = 0x000288a8,
+	ITEMSIZE_mask                                     = 0x7fff << 0,
+	ITEMSIZE_shift                                    = 0,
+    SQ_GSVS_RING_ITEMSIZE                                 = 0x000288ac,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_ESTMP_RING_ITEMSIZE                                = 0x000288b0,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_GSTMP_RING_ITEMSIZE                                = 0x000288b4,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_VSTMP_RING_ITEMSIZE                                = 0x000288b8,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_PSTMP_RING_ITEMSIZE                                = 0x000288bc,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_FBUF_RING_ITEMSIZE                                 = 0x000288c0,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_REDUC_RING_ITEMSIZE                                = 0x000288c4,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_GS_VERT_ITEMSIZE                                   = 0x000288c8,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_PGM_CF_OFFSET_PS                                   = 0x000288cc,
+	PGM_CF_OFFSET_mask                                = 0xfffff << 0,
+	PGM_CF_OFFSET_shift                               = 0,
+    SQ_PGM_CF_OFFSET_VS                                   = 0x000288d0,
+/* 	PGM_CF_OFFSET_mask                                = 0xfffff << 0, */
+/* 	PGM_CF_OFFSET_shift                               = 0, */
+    SQ_PGM_CF_OFFSET_GS                                   = 0x000288d4,
+/* 	PGM_CF_OFFSET_mask                                = 0xfffff << 0, */
+/* 	PGM_CF_OFFSET_shift                               = 0, */
+    SQ_PGM_CF_OFFSET_ES                                   = 0x000288d8,
+/* 	PGM_CF_OFFSET_mask                                = 0xfffff << 0, */
+/* 	PGM_CF_OFFSET_shift                               = 0, */
+    SQ_PGM_CF_OFFSET_FS                                   = 0x000288dc,
+/* 	PGM_CF_OFFSET_mask                                = 0xfffff << 0, */
+/* 	PGM_CF_OFFSET_shift                               = 0, */
+    SQ_VTX_SEMANTIC_CLEAR                                 = 0x000288e0,
+    SQ_ALU_CONST_CACHE_PS_0                               = 0x00028940,
+	SQ_ALU_CONST_CACHE_PS_0_num                       = 16,
+    SQ_ALU_CONST_CACHE_VS_0                               = 0x00028980,
+	SQ_ALU_CONST_CACHE_VS_0_num                       = 16,
+    SQ_ALU_CONST_CACHE_GS_0                               = 0x000289c0,
+	SQ_ALU_CONST_CACHE_GS_0_num                       = 16,
+    PA_SU_POINT_SIZE                                      = 0x00028a00,
+	PA_SU_POINT_SIZE__HEIGHT_mask                     = 0xffff << 0,
+	PA_SU_POINT_SIZE__HEIGHT_shift                    = 0,
+	PA_SU_POINT_SIZE__WIDTH_mask                      = 0xffff << 16,
+	PA_SU_POINT_SIZE__WIDTH_shift                     = 16,
+    PA_SU_POINT_MINMAX                                    = 0x00028a04,
+	MIN_SIZE_mask                                     = 0xffff << 0,
+	MIN_SIZE_shift                                    = 0,
+	MAX_SIZE_mask                                     = 0xffff << 16,
+	MAX_SIZE_shift                                    = 16,
+    PA_SU_LINE_CNTL                                       = 0x00028a08,
+	PA_SU_LINE_CNTL__WIDTH_mask                       = 0xffff << 0,
+	PA_SU_LINE_CNTL__WIDTH_shift                      = 0,
+    PA_SC_LINE_STIPPLE                                    = 0x00028a0c,
+	LINE_PATTERN_mask                                 = 0xffff << 0,
+	LINE_PATTERN_shift                                = 0,
+	REPEAT_COUNT_mask                                 = 0xff << 16,
+	REPEAT_COUNT_shift                                = 16,
+	PATTERN_BIT_ORDER_bit                             = 1 << 28,
+	AUTO_RESET_CNTL_mask                              = 0x03 << 29,
+	AUTO_RESET_CNTL_shift                             = 29,
+    VGT_OUTPUT_PATH_CNTL                                  = 0x00028a10,
+	PATH_SELECT_mask                                  = 0x03 << 0,
+	PATH_SELECT_shift                                 = 0,
+	    VGT_OUTPATH_VTX_REUSE                         = 0x00,
+	    VGT_OUTPATH_TESS_EN                           = 0x01,
+	    VGT_OUTPATH_PASSTHRU                          = 0x02,
+	    VGT_OUTPATH_GS_BLOCK                          = 0x03,
+    VGT_HOS_CNTL                                          = 0x00028a14,
+	TESS_MODE_mask                                    = 0x03 << 0,
+	TESS_MODE_shift                                   = 0,
+    VGT_HOS_MAX_TESS_LEVEL                                = 0x00028a18,
+    VGT_HOS_MIN_TESS_LEVEL                                = 0x00028a1c,
+    VGT_HOS_REUSE_DEPTH                                   = 0x00028a20,
+	REUSE_DEPTH_mask                                  = 0xff << 0,
+	REUSE_DEPTH_shift                                 = 0,
+    VGT_GROUP_PRIM_TYPE                                   = 0x00028a24,
+	VGT_GROUP_PRIM_TYPE__PRIM_TYPE_mask               = 0x1f << 0,
+	VGT_GROUP_PRIM_TYPE__PRIM_TYPE_shift              = 0,
+	    VGT_GRP_3D_POINT                              = 0x00,
+	    VGT_GRP_3D_LINE                               = 0x01,
+	    VGT_GRP_3D_TRI                                = 0x02,
+	    VGT_GRP_3D_RECT                               = 0x03,
+	    VGT_GRP_3D_QUAD                               = 0x04,
+	    VGT_GRP_2D_COPY_RECT_V0                       = 0x05,
+	    VGT_GRP_2D_COPY_RECT_V1                       = 0x06,
+	    VGT_GRP_2D_COPY_RECT_V2                       = 0x07,
+	    VGT_GRP_2D_COPY_RECT_V3                       = 0x08,
+	    VGT_GRP_2D_FILL_RECT                          = 0x09,
+	    VGT_GRP_2D_LINE                               = 0x0a,
+	    VGT_GRP_2D_TRI                                = 0x0b,
+	    VGT_GRP_PRIM_INDEX_LINE                       = 0x0c,
+	    VGT_GRP_PRIM_INDEX_TRI                        = 0x0d,
+	    VGT_GRP_PRIM_INDEX_QUAD                       = 0x0e,
+	    VGT_GRP_3D_LINE_ADJ                           = 0x0f,
+	    VGT_GRP_3D_TRI_ADJ                            = 0x10,
+	RETAIN_ORDER_bit                                  = 1 << 14,
+	RETAIN_QUADS_bit                                  = 1 << 15,
+	PRIM_ORDER_mask                                   = 0x07 << 16,
+	PRIM_ORDER_shift                                  = 16,
+	    VGT_GRP_LIST                                  = 0x00,
+	    VGT_GRP_STRIP                                 = 0x01,
+	    VGT_GRP_FAN                                   = 0x02,
+	    VGT_GRP_LOOP                                  = 0x03,
+	    VGT_GRP_POLYGON                               = 0x04,
+    VGT_GROUP_FIRST_DECR                                  = 0x00028a28,
+	FIRST_DECR_mask                                   = 0x0f << 0,
+	FIRST_DECR_shift                                  = 0,
+    VGT_GROUP_DECR                                        = 0x00028a2c,
+	DECR_mask                                         = 0x0f << 0,
+	DECR_shift                                        = 0,
+    VGT_GROUP_VECT_0_CNTL                                 = 0x00028a30,
+	COMP_X_EN_bit                                     = 1 << 0,
+	COMP_Y_EN_bit                                     = 1 << 1,
+	COMP_Z_EN_bit                                     = 1 << 2,
+	COMP_W_EN_bit                                     = 1 << 3,
+	VGT_GROUP_VECT_0_CNTL__STRIDE_mask                = 0xff << 8,
+	VGT_GROUP_VECT_0_CNTL__STRIDE_shift               = 8,
+	SHIFT_mask                                        = 0xff << 16,
+	SHIFT_shift                                       = 16,
+    VGT_GROUP_VECT_1_CNTL                                 = 0x00028a34,
+/* 	COMP_X_EN_bit                                     = 1 << 0, */
+/* 	COMP_Y_EN_bit                                     = 1 << 1, */
+/* 	COMP_Z_EN_bit                                     = 1 << 2, */
+/* 	COMP_W_EN_bit                                     = 1 << 3, */
+	VGT_GROUP_VECT_1_CNTL__STRIDE_mask                = 0xff << 8,
+	VGT_GROUP_VECT_1_CNTL__STRIDE_shift               = 8,
+/* 	SHIFT_mask                                        = 0xff << 16, */
+/* 	SHIFT_shift                                       = 16, */
+    VGT_GROUP_VECT_0_FMT_CNTL                             = 0x00028a38,
+	X_CONV_mask                                       = 0x0f << 0,
+	X_CONV_shift                                      = 0,
+	    VGT_GRP_INDEX_16                              = 0x00,
+	    VGT_GRP_INDEX_32                              = 0x01,
+	    VGT_GRP_UINT_16                               = 0x02,
+	    VGT_GRP_UINT_32                               = 0x03,
+	    VGT_GRP_SINT_16                               = 0x04,
+	    VGT_GRP_SINT_32                               = 0x05,
+	    VGT_GRP_FLOAT_32                              = 0x06,
+	    VGT_GRP_AUTO_PRIM                             = 0x07,
+	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08,
+	X_OFFSET_mask                                     = 0x0f << 4,
+	X_OFFSET_shift                                    = 4,
+	Y_CONV_mask                                       = 0x0f << 8,
+	Y_CONV_shift                                      = 8,
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+	Y_OFFSET_mask                                     = 0x0f << 12,
+	Y_OFFSET_shift                                    = 12,
+	Z_CONV_mask                                       = 0x0f << 16,
+	Z_CONV_shift                                      = 16,
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+	Z_OFFSET_mask                                     = 0x0f << 20,
+	Z_OFFSET_shift                                    = 20,
+	W_CONV_mask                                       = 0x0f << 24,
+	W_CONV_shift                                      = 24,
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+	W_OFFSET_mask                                     = 0x0f << 28,
+	W_OFFSET_shift                                    = 28,
+    VGT_GROUP_VECT_1_FMT_CNTL                             = 0x00028a3c,
+/* 	X_CONV_mask                                       = 0x0f << 0, */
+/* 	X_CONV_shift                                      = 0, */
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+/* 	X_OFFSET_mask                                     = 0x0f << 4, */
+/* 	X_OFFSET_shift                                    = 4, */
+/* 	Y_CONV_mask                                       = 0x0f << 8, */
+/* 	Y_CONV_shift                                      = 8, */
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+/* 	Y_OFFSET_mask                                     = 0x0f << 12, */
+/* 	Y_OFFSET_shift                                    = 12, */
+/* 	Z_CONV_mask                                       = 0x0f << 16, */
+/* 	Z_CONV_shift                                      = 16, */
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+/* 	Z_OFFSET_mask                                     = 0x0f << 20, */
+/* 	Z_OFFSET_shift                                    = 20, */
+/* 	W_CONV_mask                                       = 0x0f << 24, */
+/* 	W_CONV_shift                                      = 24, */
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+/* 	W_OFFSET_mask                                     = 0x0f << 28, */
+/* 	W_OFFSET_shift                                    = 28, */
+    VGT_GS_MODE                                           = 0x00028a40,
+	MODE_mask                                         = 0x03 << 0,
+	MODE_shift                                        = 0,
+	    GS_OFF                                        = 0x00,
+	    GS_SCENARIO_A                                 = 0x01,
+	    GS_SCENARIO_B                                 = 0x02,
+	    GS_SCENARIO_G                                 = 0x03,
+	ES_PASSTHRU_bit                                   = 1 << 2,
+	CUT_MODE_mask                                     = 0x03 << 3,
+	CUT_MODE_shift                                    = 3,
+	    GS_CUT_1024                                   = 0x00,
+	    GS_CUT_512                                    = 0x01,
+	    GS_CUT_256                                    = 0x02,
+	    GS_CUT_128                                    = 0x03,
+    PA_SC_MPASS_PS_CNTL                                   = 0x00028a48,
+	MPASS_PIX_VEC_PER_PASS_mask                       = 0xfffff << 0,
+	MPASS_PIX_VEC_PER_PASS_shift                      = 0,
+	MPASS_PS_ENA_bit                                  = 1 << 31,
+    PA_SC_MODE_CNTL                                       = 0x00028a4c,
+	MSAA_ENABLE_bit                                   = 1 << 0,
+	CLIPRECT_ENABLE_bit                               = 1 << 1,
+	LINE_STIPPLE_ENABLE_bit                           = 1 << 2,
+	MULTI_CHIP_PRIM_DISCARD_ENAB_bit                  = 1 << 3,
+	WALK_ORDER_ENABLE_bit                             = 1 << 4,
+	HALVE_DETAIL_SAMPLE_PERF_bit                      = 1 << 5,
+	WALK_SIZE_bit                                     = 1 << 6,
+	WALK_ALIGNMENT_bit                                = 1 << 7,
+	WALK_ALIGN8_PRIM_FITS_ST_bit                      = 1 << 8,
+	TILE_COVER_NO_SCISSOR_bit                         = 1 << 9,
+	KILL_PIX_POST_HI_Z_bit                            = 1 << 10,
+	KILL_PIX_POST_DETAIL_MASK_bit                     = 1 << 11,
+	MULTI_CHIP_SUPERTILE_ENABLE_bit                   = 1 << 12,
+	TILE_COVER_DISABLE_bit                            = 1 << 13,
+	FORCE_EOV_CNTDWN_ENABLE_bit                       = 1 << 14,
+	FORCE_EOV_TILE_ENABLE_bit                         = 1 << 15,
+	FORCE_EOV_REZ_ENABLE_bit                          = 1 << 16,
+	PS_ITER_SAMPLE_bit                                = 1 << 17,
+    VGT_ENHANCE                                           = 0x00028a50,
+	VGT_ENHANCE__MI_TIMESTAMP_RES_mask                = 0x03 << 0,
+	VGT_ENHANCE__MI_TIMESTAMP_RES_shift               = 0,
+	    X_0_992_CLOCKS_LATENCY_RANGE_IN_STEPS_OF_32   = 0x00,
+	    X_0_496_CLOCKS_LATENCY_RANGE_IN_STEPS_OF_16   = 0x01,
+	    X_0_248_CLOCKS_LATENCY_RANGE_IN_STEPS_OF_8    = 0x02,
+	    X_0_124_CLOCKS_LATENCY_RANGE_IN_STEPS_OF_4    = 0x03,
+	MISC_mask                                         = 0x3fffffff << 2,
+	MISC_shift                                        = 2,
+    VGT_GS_OUT_PRIM_TYPE                                  = 0x00028a6c,
+	OUTPRIM_TYPE_mask                                 = 0x3f << 0,
+	OUTPRIM_TYPE_shift                                = 0,
+	    POINTLIST                                     = 0x00,
+	    LINESTRIP                                     = 0x01,
+	    TRISTRIP                                      = 0x02,
+    VGT_DMA_SIZE                                          = 0x00028a74,
+    VGT_DMA_INDEX_TYPE                                    = 0x00028a7c,
+/* 	INDEX_TYPE_mask                                   = 0x03 << 0, */
+/* 	INDEX_TYPE_shift                                  = 0, */
+	    VGT_INDEX_16                                  = 0x00,
+	    VGT_INDEX_32                                  = 0x01,
+	SWAP_MODE_mask                                    = 0x03 << 2,
+	SWAP_MODE_shift                                   = 2,
+	    VGT_DMA_SWAP_NONE                             = 0x00,
+	    VGT_DMA_SWAP_16_BIT                           = 0x01,
+	    VGT_DMA_SWAP_32_BIT                           = 0x02,
+	    VGT_DMA_SWAP_WORD                             = 0x03,
+    VGT_PRIMITIVEID_EN                                    = 0x00028a84,
+	PRIMITIVEID_EN_bit                                = 1 << 0,
+    VGT_DMA_NUM_INSTANCES                                 = 0x00028a88,
+    VGT_EVENT_INITIATOR                                   = 0x00028a90,
+	EVENT_TYPE_mask                                   = 0x3f << 0,
+	EVENT_TYPE_shift                                  = 0,
+	    CACHE_FLUSH_TS                                = 0x04,
+	    CONTEXT_DONE                                  = 0x05,
+	    CACHE_FLUSH                                   = 0x06,
+	    VIZQUERY_START                                = 0x07,
+	    VIZQUERY_END                                  = 0x08,
+	    SC_WAIT_WC                                    = 0x09,
+	    MPASS_PS_CP_REFETCH                           = 0x0a,
+	    MPASS_PS_RST_START                            = 0x0b,
+	    MPASS_PS_INCR_START                           = 0x0c,
+	    RST_PIX_CNT                                   = 0x0d,
+	    RST_VTX_CNT                                   = 0x0e,
+	    VS_PARTIAL_FLUSH                              = 0x0f,
+	    PS_PARTIAL_FLUSH                              = 0x10,
+	    CACHE_FLUSH_AND_INV_TS_EVENT                  = 0x14,
+	    ZPASS_DONE                                    = 0x15,
+	    CACHE_FLUSH_AND_INV_EVENT                     = 0x16,
+	    PERFCOUNTER_START                             = 0x17,
+	    PERFCOUNTER_STOP                              = 0x18,
+	    PIPELINESTAT_START                            = 0x19,
+	    PIPELINESTAT_STOP                             = 0x1a,
+	    PERFCOUNTER_SAMPLE                            = 0x1b,
+	    FLUSH_ES_OUTPUT                               = 0x1c,
+	    FLUSH_GS_OUTPUT                               = 0x1d,
+	    SAMPLE_PIPELINESTAT                           = 0x1e,
+	    SO_VGTSTREAMOUT_FLUSH                         = 0x1f,
+	    SAMPLE_STREAMOUTSTATS                         = 0x20,
+	    RESET_VTX_CNT                                 = 0x21,
+	    BLOCK_CONTEXT_DONE                            = 0x22,
+	    CR_CONTEXT_DONE                               = 0x23,
+	    VGT_FLUSH                                     = 0x24,
+	    CR_DONE_TS                                    = 0x25,
+	    SQ_NON_EVENT                                  = 0x26,
+	    SC_SEND_DB_VPZ                                = 0x27,
+	    BOTTOM_OF_PIPE_TS                             = 0x28,
+	    DB_CACHE_FLUSH_AND_INV                        = 0x2a,
+	ADDRESS_HI_mask                                   = 0xff << 19,
+	ADDRESS_HI_shift                                  = 19,
+	EXTENDED_EVENT_bit                                = 1 << 27,
+    VGT_MULTI_PRIM_IB_RESET_EN                            = 0x00028a94,
+	RESET_EN_bit                                      = 1 << 0,
+    VGT_INSTANCE_STEP_RATE_0                              = 0x00028aa0,
+    VGT_INSTANCE_STEP_RATE_1                              = 0x00028aa4,
+    VGT_STRMOUT_EN                                        = 0x00028ab0,
+	STREAMOUT_bit                                     = 1 << 0,
+    VGT_REUSE_OFF                                         = 0x00028ab4,
+	REUSE_OFF_bit                                     = 1 << 0,
+    VGT_VTX_CNT_EN                                        = 0x00028ab8,
+	VTX_CNT_EN_bit                                    = 1 << 0,
+    VGT_STRMOUT_BUFFER_SIZE_0                             = 0x00028ad0,
+    VGT_STRMOUT_VTX_STRIDE_0                              = 0x00028ad4,
+	VGT_STRMOUT_VTX_STRIDE_0__STRIDE_mask             = 0x3ff << 0,
+	VGT_STRMOUT_VTX_STRIDE_0__STRIDE_shift            = 0,
+    VGT_STRMOUT_BUFFER_BASE_0                             = 0x00028ad8,
+    VGT_STRMOUT_BUFFER_OFFSET_0                           = 0x00028adc,
+    VGT_STRMOUT_BUFFER_SIZE_1                             = 0x00028ae0,
+    VGT_STRMOUT_VTX_STRIDE_1                              = 0x00028ae4,
+	VGT_STRMOUT_VTX_STRIDE_1__STRIDE_mask             = 0x3ff << 0,
+	VGT_STRMOUT_VTX_STRIDE_1__STRIDE_shift            = 0,
+    VGT_STRMOUT_BUFFER_BASE_1                             = 0x00028ae8,
+    VGT_STRMOUT_BUFFER_OFFSET_1                           = 0x00028aec,
+    VGT_STRMOUT_BUFFER_SIZE_2                             = 0x00028af0,
+    VGT_STRMOUT_VTX_STRIDE_2                              = 0x00028af4,
+	VGT_STRMOUT_VTX_STRIDE_2__STRIDE_mask             = 0x3ff << 0,
+	VGT_STRMOUT_VTX_STRIDE_2__STRIDE_shift            = 0,
+    VGT_STRMOUT_BUFFER_BASE_2                             = 0x00028af8,
+    VGT_STRMOUT_BUFFER_OFFSET_2                           = 0x00028afc,
+    VGT_STRMOUT_BUFFER_SIZE_3                             = 0x00028b00,
+    VGT_STRMOUT_VTX_STRIDE_3                              = 0x00028b04,
+	VGT_STRMOUT_VTX_STRIDE_3__STRIDE_mask             = 0x3ff << 0,
+	VGT_STRMOUT_VTX_STRIDE_3__STRIDE_shift            = 0,
+    VGT_STRMOUT_BUFFER_BASE_3                             = 0x00028b08,
+    VGT_STRMOUT_BUFFER_OFFSET_3                           = 0x00028b0c,
+    VGT_STRMOUT_BASE_OFFSET_0                             = 0x00028b10,
+    VGT_STRMOUT_BASE_OFFSET_1                             = 0x00028b14,
+    VGT_STRMOUT_BASE_OFFSET_2                             = 0x00028b18,
+    VGT_STRMOUT_BASE_OFFSET_3                             = 0x00028b1c,
+    VGT_STRMOUT_BUFFER_EN                                 = 0x00028b20,
+	BUFFER_0_EN_bit                                   = 1 << 0,
+	BUFFER_1_EN_bit                                   = 1 << 1,
+	BUFFER_2_EN_bit                                   = 1 << 2,
+	BUFFER_3_EN_bit                                   = 1 << 3,
+    VGT_STRMOUT_DRAW_OPAQUE_OFFSET                        = 0x00028b28,
+    VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE            = 0x00028b2c,
+    VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE                 = 0x00028b30,
+    VGT_STRMOUT_BASE_OFFSET_HI_0                          = 0x00028b44,
+	VGT_STRMOUT_BASE_OFFSET_HI_0__BASE_OFFSET_mask    = 0x3f << 0,
+	VGT_STRMOUT_BASE_OFFSET_HI_0__BASE_OFFSET_shift   = 0,
+    VGT_STRMOUT_BASE_OFFSET_HI_1                          = 0x00028b48,
+	VGT_STRMOUT_BASE_OFFSET_HI_1__BASE_OFFSET_mask    = 0x3f << 0,
+	VGT_STRMOUT_BASE_OFFSET_HI_1__BASE_OFFSET_shift   = 0,
+    VGT_STRMOUT_BASE_OFFSET_HI_2                          = 0x00028b4c,
+	VGT_STRMOUT_BASE_OFFSET_HI_2__BASE_OFFSET_mask    = 0x3f << 0,
+	VGT_STRMOUT_BASE_OFFSET_HI_2__BASE_OFFSET_shift   = 0,
+    VGT_STRMOUT_BASE_OFFSET_HI_3                          = 0x00028b50,
+	VGT_STRMOUT_BASE_OFFSET_HI_3__BASE_OFFSET_mask    = 0x3f << 0,
+	VGT_STRMOUT_BASE_OFFSET_HI_3__BASE_OFFSET_shift   = 0,
+    PA_SC_LINE_CNTL                                       = 0x00028c00,
+	BRES_CNTL_mask                                    = 0xff << 0,
+	BRES_CNTL_shift                                   = 0,
+	USE_BRES_CNTL_bit                                 = 1 << 8,
+	EXPAND_LINE_WIDTH_bit                             = 1 << 9,
+	LAST_PIXEL_bit                                    = 1 << 10,
+    PA_SC_AA_CONFIG                                       = 0x00028c04,
+	MSAA_NUM_SAMPLES_mask                             = 0x03 << 0,
+	MSAA_NUM_SAMPLES_shift                            = 0,
+	AA_MASK_CENTROID_DTMN_bit                         = 1 << 4,
+	MAX_SAMPLE_DIST_mask                              = 0x0f << 13,
+	MAX_SAMPLE_DIST_shift                             = 13,
+    PA_SU_VTX_CNTL                                        = 0x00028c08,
+	PIX_CENTER_bit                                    = 1 << 0,
+	PA_SU_VTX_CNTL__ROUND_MODE_mask                   = 0x03 << 1,
+	PA_SU_VTX_CNTL__ROUND_MODE_shift                  = 1,
+	    X_TRUNCATE                                    = 0x00,
+	    X_ROUND                                       = 0x01,
+	    X_ROUND_TO_EVEN                               = 0x02,
+	    X_ROUND_TO_ODD                                = 0x03,
+	QUANT_MODE_mask                                   = 0x07 << 3,
+	QUANT_MODE_shift                                  = 3,
+	    X_1_16TH                                      = 0x00,
+	    X_1_8TH                                       = 0x01,
+	    X_1_4TH                                       = 0x02,
+	    X_1_2                                         = 0x03,
+	    X_1                                           = 0x04,
+	    X_1_256TH                                     = 0x05,
+    PA_CL_GB_VERT_CLIP_ADJ                                = 0x00028c0c,
+    PA_CL_GB_VERT_DISC_ADJ                                = 0x00028c10,
+    PA_CL_GB_HORZ_CLIP_ADJ                                = 0x00028c14,
+    PA_CL_GB_HORZ_DISC_ADJ                                = 0x00028c18,
+    PA_SC_AA_SAMPLE_LOCS_MCTX                             = 0x00028c1c,
+/* 	S0_X_mask                                         = 0x0f << 0, */
+/* 	S0_X_shift                                        = 0, */
+/* 	S0_Y_mask                                         = 0x0f << 4, */
+/* 	S0_Y_shift                                        = 4, */
+/* 	S1_X_mask                                         = 0x0f << 8, */
+/* 	S1_X_shift                                        = 8, */
+/* 	S1_Y_mask                                         = 0x0f << 12, */
+/* 	S1_Y_shift                                        = 12, */
+/* 	S2_X_mask                                         = 0x0f << 16, */
+/* 	S2_X_shift                                        = 16, */
+/* 	S2_Y_mask                                         = 0x0f << 20, */
+/* 	S2_Y_shift                                        = 20, */
+/* 	S3_X_mask                                         = 0x0f << 24, */
+/* 	S3_X_shift                                        = 24, */
+/* 	S3_Y_mask                                         = 0x0f << 28, */
+/* 	S3_Y_shift                                        = 28, */
+    PA_SC_AA_SAMPLE_LOCS_8S_WD1_MCTX                      = 0x00028c20,
+/* 	S4_X_mask                                         = 0x0f << 0, */
+/* 	S4_X_shift                                        = 0, */
+/* 	S4_Y_mask                                         = 0x0f << 4, */
+/* 	S4_Y_shift                                        = 4, */
+/* 	S5_X_mask                                         = 0x0f << 8, */
+/* 	S5_X_shift                                        = 8, */
+/* 	S5_Y_mask                                         = 0x0f << 12, */
+/* 	S5_Y_shift                                        = 12, */
+/* 	S6_X_mask                                         = 0x0f << 16, */
+/* 	S6_X_shift                                        = 16, */
+/* 	S6_Y_mask                                         = 0x0f << 20, */
+/* 	S6_Y_shift                                        = 20, */
+/* 	S7_X_mask                                         = 0x0f << 24, */
+/* 	S7_X_shift                                        = 24, */
+/* 	S7_Y_mask                                         = 0x0f << 28, */
+/* 	S7_Y_shift                                        = 28, */
+    CB_CLRCMP_CONTROL                                     = 0x00028c30,
+	CLRCMP_FCN_SRC_mask                               = 0x07 << 0,
+	CLRCMP_FCN_SRC_shift                              = 0,
+	    CLRCMP_DRAW_ALWAYS                            = 0x00,
+	    CLRCMP_DRAW_NEVER                             = 0x01,
+	    CLRCMP_DRAW_ON_NEQ                            = 0x04,
+	    CLRCMP_DRAW_ON_EQ                             = 0x05,
+	CLRCMP_FCN_DST_mask                               = 0x07 << 8,
+	CLRCMP_FCN_DST_shift                              = 8,
+/* 	    CLRCMP_DRAW_ALWAYS                            = 0x00, */
+/* 	    CLRCMP_DRAW_NEVER                             = 0x01, */
+/* 	    CLRCMP_DRAW_ON_NEQ                            = 0x04, */
+/* 	    CLRCMP_DRAW_ON_EQ                             = 0x05, */
+	CLRCMP_FCN_SEL_mask                               = 0x03 << 24,
+	CLRCMP_FCN_SEL_shift                              = 24,
+	    CLRCMP_SEL_DST                                = 0x00,
+	    CLRCMP_SEL_SRC                                = 0x01,
+	    CLRCMP_SEL_AND                                = 0x02,
+    CB_CLRCMP_SRC                                         = 0x00028c34,
+    CB_CLRCMP_DST                                         = 0x00028c38,
+    CB_CLRCMP_MSK                                         = 0x00028c3c,
+    PA_SC_AA_MASK                                         = 0x00028c48,
+    VGT_VERTEX_REUSE_BLOCK_CNTL                           = 0x00028c58,
+	VTX_REUSE_DEPTH_mask                              = 0xff << 0,
+	VTX_REUSE_DEPTH_shift                             = 0,
+    VGT_OUT_DEALLOC_CNTL                                  = 0x00028c5c,
+	DEALLOC_DIST_mask                                 = 0x7f << 0,
+	DEALLOC_DIST_shift                                = 0,
+    DB_RENDER_CONTROL                                     = 0x00028d0c,
+	DEPTH_CLEAR_ENABLE_bit                            = 1 << 0,
+	STENCIL_CLEAR_ENABLE_bit                          = 1 << 1,
+	DEPTH_COPY_bit                                    = 1 << 2,
+	STENCIL_COPY_bit                                  = 1 << 3,
+	RESUMMARIZE_ENABLE_bit                            = 1 << 4,
+	STENCIL_COMPRESS_DISABLE_bit                      = 1 << 5,
+	DEPTH_COMPRESS_DISABLE_bit                        = 1 << 6,
+	COPY_CENTROID_bit                                 = 1 << 7,
+	COPY_SAMPLE_mask                                  = 0x07 << 8,
+	COPY_SAMPLE_shift                                 = 8,
+	ZPASS_INCREMENT_DISABLE_bit                       = 1 << 11,
+    DB_RENDER_OVERRIDE                                    = 0x00028d10,
+	FORCE_HIZ_ENABLE_mask                             = 0x03 << 0,
+	FORCE_HIZ_ENABLE_shift                            = 0,
+	    FORCE_OFF                                     = 0x00,
+	    FORCE_ENABLE                                  = 0x01,
+	    FORCE_DISABLE                                 = 0x02,
+	    FORCE_RESERVED                                = 0x03,
+	FORCE_HIS_ENABLE0_mask                            = 0x03 << 2,
+	FORCE_HIS_ENABLE0_shift                           = 2,
+/* 	    FORCE_OFF                                     = 0x00, */
+/* 	    FORCE_ENABLE                                  = 0x01, */
+/* 	    FORCE_DISABLE                                 = 0x02, */
+/* 	    FORCE_RESERVED                                = 0x03, */
+	FORCE_HIS_ENABLE1_mask                            = 0x03 << 4,
+	FORCE_HIS_ENABLE1_shift                           = 4,
+/* 	    FORCE_OFF                                     = 0x00, */
+/* 	    FORCE_ENABLE                                  = 0x01, */
+/* 	    FORCE_DISABLE                                 = 0x02, */
+/* 	    FORCE_RESERVED                                = 0x03, */
+	FORCE_SHADER_Z_ORDER_bit                          = 1 << 6,
+	FAST_Z_DISABLE_bit                                = 1 << 7,
+	FAST_STENCIL_DISABLE_bit                          = 1 << 8,
+	NOOP_CULL_DISABLE_bit                             = 1 << 9,
+	FORCE_COLOR_KILL_bit                              = 1 << 10,
+	FORCE_Z_READ_bit                                  = 1 << 11,
+	FORCE_STENCIL_READ_bit                            = 1 << 12,
+	FORCE_FULL_Z_RANGE_mask                           = 0x03 << 13,
+	FORCE_FULL_Z_RANGE_shift                          = 13,
+/* 	    FORCE_OFF                                     = 0x00, */
+/* 	    FORCE_ENABLE                                  = 0x01, */
+/* 	    FORCE_DISABLE                                 = 0x02, */
+/* 	    FORCE_RESERVED                                = 0x03, */
+	FORCE_QC_SMASK_CONFLICT_bit                       = 1 << 15,
+	DISABLE_VIEWPORT_CLAMP_bit                        = 1 << 16,
+	IGNORE_SC_ZRANGE_bit                              = 1 << 17,
+    DB_HTILE_SURFACE                                      = 0x00028d24,
+	HTILE_WIDTH_bit                                   = 1 << 0,
+	HTILE_HEIGHT_bit                                  = 1 << 1,
+	LINEAR_bit                                        = 1 << 2,
+	FULL_CACHE_bit                                    = 1 << 3,
+	HTILE_USES_PRELOAD_WIN_bit                        = 1 << 4,
+	PRELOAD_bit                                       = 1 << 5,
+	PREFETCH_WIDTH_mask                               = 0x3f << 6,
+	PREFETCH_WIDTH_shift                              = 6,
+	PREFETCH_HEIGHT_mask                              = 0x3f << 12,
+	PREFETCH_HEIGHT_shift                             = 12,
+    DB_SRESULTS_COMPARE_STATE1                            = 0x00028d2c,
+	COMPAREFUNC1_mask                                 = 0x07 << 0,
+	COMPAREFUNC1_shift                                = 0,
+/* 	    REF_NEVER                                     = 0x00, */
+/* 	    REF_LESS                                      = 0x01, */
+/* 	    REF_EQUAL                                     = 0x02, */
+/* 	    REF_LEQUAL                                    = 0x03, */
+/* 	    REF_GREATER                                   = 0x04, */
+/* 	    REF_NOTEQUAL                                  = 0x05, */
+/* 	    REF_GEQUAL                                    = 0x06, */
+/* 	    REF_ALWAYS                                    = 0x07, */
+	COMPAREVALUE1_mask                                = 0xff << 4,
+	COMPAREVALUE1_shift                               = 4,
+	COMPAREMASK1_mask                                 = 0xff << 12,
+	COMPAREMASK1_shift                                = 12,
+	ENABLE1_bit                                       = 1 << 24,
+    DB_PRELOAD_CONTROL                                    = 0x00028d30,
+	START_X_mask                                      = 0xff << 0,
+	START_X_shift                                     = 0,
+	START_Y_mask                                      = 0xff << 8,
+	START_Y_shift                                     = 8,
+	MAX_X_mask                                        = 0xff << 16,
+	MAX_X_shift                                       = 16,
+	MAX_Y_mask                                        = 0xff << 24,
+	MAX_Y_shift                                       = 24,
+    DB_PREFETCH_LIMIT                                     = 0x00028d34,
+	DEPTH_HEIGHT_TILE_MAX_mask                        = 0x3ff << 0,
+	DEPTH_HEIGHT_TILE_MAX_shift                       = 0,
+    PA_SU_POLY_OFFSET_DB_FMT_CNTL                         = 0x00028df8,
+	POLY_OFFSET_NEG_NUM_DB_BITS_mask                  = 0xff << 0,
+	POLY_OFFSET_NEG_NUM_DB_BITS_shift                 = 0,
+	POLY_OFFSET_DB_IS_FLOAT_FMT_bit                   = 1 << 8,
+    PA_SU_POLY_OFFSET_CLAMP                               = 0x00028dfc,
+    PA_SU_POLY_OFFSET_FRONT_SCALE                         = 0x00028e00,
+    PA_SU_POLY_OFFSET_FRONT_OFFSET                        = 0x00028e04,
+    PA_SU_POLY_OFFSET_BACK_SCALE                          = 0x00028e08,
+    PA_SU_POLY_OFFSET_BACK_OFFSET                         = 0x00028e0c,
+    PA_CL_POINT_X_RAD                                     = 0x00028e10,
+    PA_CL_POINT_Y_RAD                                     = 0x00028e14,
+    PA_CL_POINT_SIZE                                      = 0x00028e18,
+    PA_CL_POINT_CULL_RAD                                  = 0x00028e1c,
+    PA_CL_UCP_0_X                                         = 0x00028e20,
+	PA_CL_UCP_0_X_num                                 = 6,
+	PA_CL_UCP_0_X_offset                              = 16,
+    PA_CL_UCP_0_Y                                         = 0x00028e24,
+	PA_CL_UCP_0_Y_num                                 = 6,
+	PA_CL_UCP_0_Y_offset                              = 16,
+    PA_CL_UCP_0_Z                                         = 0x00028e28,
+	PA_CL_UCP_0_Z_num                                 = 6,
+	PA_CL_UCP_0_Z_offset                              = 16,
+    SQ_ALU_CONSTANT0_0                                    = 0x00030000,
+    SQ_ALU_CONSTANT1_0                                    = 0x00030004,
+    SQ_ALU_CONSTANT2_0                                    = 0x00030008,
+    SQ_ALU_CONSTANT3_0                                    = 0x0003000c,
+    SQ_VTX_CONSTANT_WORD0_0                               = 0x00038000,
+    SQ_TEX_RESOURCE_WORD0_0                               = 0x00038000,
+	DIM_mask                                          = 0x07 << 0,
+	DIM_shift                                         = 0,
+	    SQ_TEX_DIM_1D                                 = 0x00,
+	    SQ_TEX_DIM_2D                                 = 0x01,
+	    SQ_TEX_DIM_3D                                 = 0x02,
+	    SQ_TEX_DIM_CUBEMAP                            = 0x03,
+	    SQ_TEX_DIM_1D_ARRAY                           = 0x04,
+	    SQ_TEX_DIM_2D_ARRAY                           = 0x05,
+	    SQ_TEX_DIM_2D_MSAA                            = 0x06,
+	    SQ_TEX_DIM_2D_ARRAY_MSAA                      = 0x07,
+	SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_mask           = 0x0f << 3,
+	SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_shift          = 3,
+	TILE_TYPE_bit                                     = 1 << 7,
+	PITCH_mask                                        = 0x7ff << 8,
+	PITCH_shift                                       = 8,
+	TEX_WIDTH_mask                                    = 0x1fff << 19,
+	TEX_WIDTH_shift                                   = 19,
+    SQ_VTX_CONSTANT_WORD1_0                               = 0x00038004,
+    SQ_TEX_RESOURCE_WORD1_0                               = 0x00038004,
+	TEX_HEIGHT_mask                                   = 0x1fff << 0,
+	TEX_HEIGHT_shift                                  = 0,
+	TEX_DEPTH_mask                                    = 0x1fff << 13,
+	TEX_DEPTH_shift                                   = 13,
+	SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask         = 0x3f << 26,
+	SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift        = 26,
+    SQ_VTX_CONSTANT_WORD2_0                               = 0x00038008,
+	BASE_ADDRESS_HI_mask                              = 0xff << 0,
+	BASE_ADDRESS_HI_shift                             = 0,
+	SQ_VTX_CONSTANT_WORD2_0__STRIDE_mask              = 0x7ff << 8,
+	SQ_VTX_CONSTANT_WORD2_0__STRIDE_shift             = 8,
+	SQ_VTX_CONSTANT_WORD2_0__CLAMP_X_bit              = 1 << 19,
+	SQ_VTX_CONSTANT_WORD2_0__DATA_FORMAT_mask         = 0x3f << 20,
+	SQ_VTX_CONSTANT_WORD2_0__DATA_FORMAT_shift        = 20,
+	SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_mask      = 0x03 << 26,
+	SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_shift     = 26,
+/* 	    SQ_NUM_FORMAT_NORM                            = 0x00, */
+/* 	    SQ_NUM_FORMAT_INT                             = 0x01, */
+/* 	    SQ_NUM_FORMAT_SCALED                          = 0x02, */
+	SQ_VTX_CONSTANT_WORD2_0__FORMAT_COMP_ALL_bit      = 1 << 28,
+	SQ_VTX_CONSTANT_WORD2_0__SRF_MODE_ALL_bit         = 1 << 29,
+	SQ_VTX_CONSTANT_WORD2_0__ENDIAN_SWAP_mask         = 0x03 << 30,
+	SQ_VTX_CONSTANT_WORD2_0__ENDIAN_SWAP_shift        = 30,
+/* 	    SQ_ENDIAN_NONE                                = 0x00, */
+/* 	    SQ_ENDIAN_8IN16                               = 0x01, */
+/* 	    SQ_ENDIAN_8IN32                               = 0x02, */
+    SQ_TEX_RESOURCE_WORD2_0                               = 0x00038008,
+    SQ_VTX_CONSTANT_WORD3_0                               = 0x0003800c,
+	MEM_REQUEST_SIZE_mask                             = 0x03 << 0,
+	MEM_REQUEST_SIZE_shift                            = 0,
+    SQ_TEX_RESOURCE_WORD3_0                               = 0x0003800c,
+    SQ_TEX_RESOURCE_WORD4_0                               = 0x00038010,
+	FORMAT_COMP_X_mask                                = 0x03 << 0,
+	FORMAT_COMP_X_shift                               = 0,
+	    SQ_FORMAT_COMP_UNSIGNED                       = 0x00,
+	    SQ_FORMAT_COMP_SIGNED                         = 0x01,
+	    SQ_FORMAT_COMP_UNSIGNED_BIASED                = 0x02,
+	FORMAT_COMP_Y_mask                                = 0x03 << 2,
+	FORMAT_COMP_Y_shift                               = 2,
+/* 	    SQ_FORMAT_COMP_UNSIGNED                       = 0x00, */
+/* 	    SQ_FORMAT_COMP_SIGNED                         = 0x01, */
+/* 	    SQ_FORMAT_COMP_UNSIGNED_BIASED                = 0x02, */
+	FORMAT_COMP_Z_mask                                = 0x03 << 4,
+	FORMAT_COMP_Z_shift                               = 4,
+/* 	    SQ_FORMAT_COMP_UNSIGNED                       = 0x00, */
+/* 	    SQ_FORMAT_COMP_SIGNED                         = 0x01, */
+/* 	    SQ_FORMAT_COMP_UNSIGNED_BIASED                = 0x02, */
+	FORMAT_COMP_W_mask                                = 0x03 << 6,
+	FORMAT_COMP_W_shift                               = 6,
+/* 	    SQ_FORMAT_COMP_UNSIGNED                       = 0x00, */
+/* 	    SQ_FORMAT_COMP_SIGNED                         = 0x01, */
+/* 	    SQ_FORMAT_COMP_UNSIGNED_BIASED                = 0x02, */
+	SQ_TEX_RESOURCE_WORD4_0__NUM_FORMAT_ALL_mask      = 0x03 << 8,
+	SQ_TEX_RESOURCE_WORD4_0__NUM_FORMAT_ALL_shift     = 8,
+/* 	    SQ_NUM_FORMAT_NORM                            = 0x00, */
+/* 	    SQ_NUM_FORMAT_INT                             = 0x01, */
+/* 	    SQ_NUM_FORMAT_SCALED                          = 0x02, */
+	SQ_TEX_RESOURCE_WORD4_0__SRF_MODE_ALL_bit         = 1 << 10,
+	SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit        = 1 << 11,
+	SQ_TEX_RESOURCE_WORD4_0__ENDIAN_SWAP_mask         = 0x03 << 12,
+	SQ_TEX_RESOURCE_WORD4_0__ENDIAN_SWAP_shift        = 12,
+/* 	    SQ_ENDIAN_NONE                                = 0x00, */
+/* 	    SQ_ENDIAN_8IN16                               = 0x01, */
+/* 	    SQ_ENDIAN_8IN32                               = 0x02, */
+	REQUEST_SIZE_mask                                 = 0x03 << 14,
+	REQUEST_SIZE_shift                                = 14,
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask           = 0x07 << 16,
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift          = 16,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask           = 0x07 << 19,
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift          = 19,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask           = 0x07 << 22,
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift          = 22,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask           = 0x07 << 25,
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift          = 25,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	BASE_LEVEL_mask                                   = 0x0f << 28,
+	BASE_LEVEL_shift                                  = 28,
+    SQ_TEX_RESOURCE_WORD5_0                               = 0x00038014,
+	LAST_LEVEL_mask                                   = 0x0f << 0,
+	LAST_LEVEL_shift                                  = 0,
+	BASE_ARRAY_mask                                   = 0x1fff << 4,
+	BASE_ARRAY_shift                                  = 4,
+	LAST_ARRAY_mask                                   = 0x1fff << 17,
+	LAST_ARRAY_shift                                  = 17,
+    SQ_TEX_RESOURCE_WORD6_0                               = 0x00038018,
+	MPEG_CLAMP_mask                                   = 0x03 << 0,
+	MPEG_CLAMP_shift                                  = 0,
+	    SQ_TEX_MPEG_CLAMP_OFF                         = 0x00,
+	    SQ_TEX_MPEG_9                                 = 0x01,
+	    SQ_TEX_MPEG_10                                = 0x02,
+	PERF_MODULATION_mask                              = 0x07 << 5,
+	PERF_MODULATION_shift                             = 5,
+	INTERLACED_bit                                    = 1 << 8,
+	SQ_TEX_RESOURCE_WORD6_0__TYPE_mask                = 0x03 << 30,
+	SQ_TEX_RESOURCE_WORD6_0__TYPE_shift               = 30,
+	    SQ_TEX_VTX_INVALID_TEXTURE                    = 0x00,
+	    SQ_TEX_VTX_INVALID_BUFFER                     = 0x01,
+	    SQ_TEX_VTX_VALID_TEXTURE                      = 0x02,
+	    SQ_TEX_VTX_VALID_BUFFER                       = 0x03,
+    SQ_VTX_CONSTANT_WORD6_0                               = 0x00038018,
+	SQ_VTX_CONSTANT_WORD6_0__TYPE_mask                = 0x03 << 30,
+	SQ_VTX_CONSTANT_WORD6_0__TYPE_shift               = 30,
+/* 	    SQ_TEX_VTX_INVALID_TEXTURE                    = 0x00, */
+/* 	    SQ_TEX_VTX_INVALID_BUFFER                     = 0x01, */
+/* 	    SQ_TEX_VTX_VALID_TEXTURE                      = 0x02, */
+/* 	    SQ_TEX_VTX_VALID_BUFFER                       = 0x03, */
+    SQ_TEX_SAMPLER_WORD0_0                                = 0x0003c000,
+	SQ_TEX_SAMPLER_WORD0_0__CLAMP_X_mask              = 0x07 << 0,
+	SQ_TEX_SAMPLER_WORD0_0__CLAMP_X_shift             = 0,
+	    SQ_TEX_WRAP                                   = 0x00,
+	    SQ_TEX_MIRROR                                 = 0x01,
+	    SQ_TEX_CLAMP_LAST_TEXEL                       = 0x02,
+	    SQ_TEX_MIRROR_ONCE_LAST_TEXEL                 = 0x03,
+	    SQ_TEX_CLAMP_HALF_BORDER                      = 0x04,
+	    SQ_TEX_MIRROR_ONCE_HALF_BORDER                = 0x05,
+	    SQ_TEX_CLAMP_BORDER                           = 0x06,
+	    SQ_TEX_MIRROR_ONCE_BORDER                     = 0x07,
+	CLAMP_Y_mask                                      = 0x07 << 3,
+	CLAMP_Y_shift                                     = 3,
+/* 	    SQ_TEX_WRAP                                   = 0x00, */
+/* 	    SQ_TEX_MIRROR                                 = 0x01, */
+/* 	    SQ_TEX_CLAMP_LAST_TEXEL                       = 0x02, */
+/* 	    SQ_TEX_MIRROR_ONCE_LAST_TEXEL                 = 0x03, */
+/* 	    SQ_TEX_CLAMP_HALF_BORDER                      = 0x04, */
+/* 	    SQ_TEX_MIRROR_ONCE_HALF_BORDER                = 0x05, */
+/* 	    SQ_TEX_CLAMP_BORDER                           = 0x06, */
+/* 	    SQ_TEX_MIRROR_ONCE_BORDER                     = 0x07, */
+	CLAMP_Z_mask                                      = 0x07 << 6,
+	CLAMP_Z_shift                                     = 6,
+/* 	    SQ_TEX_WRAP                                   = 0x00, */
+/* 	    SQ_TEX_MIRROR                                 = 0x01, */
+/* 	    SQ_TEX_CLAMP_LAST_TEXEL                       = 0x02, */
+/* 	    SQ_TEX_MIRROR_ONCE_LAST_TEXEL                 = 0x03, */
+/* 	    SQ_TEX_CLAMP_HALF_BORDER                      = 0x04, */
+/* 	    SQ_TEX_MIRROR_ONCE_HALF_BORDER                = 0x05, */
+/* 	    SQ_TEX_CLAMP_BORDER                           = 0x06, */
+/* 	    SQ_TEX_MIRROR_ONCE_BORDER                     = 0x07, */
+	XY_MAG_FILTER_mask                                = 0x07 << 9,
+	XY_MAG_FILTER_shift                               = 9,
+	    SQ_TEX_XY_FILTER_POINT                        = 0x00,
+	    SQ_TEX_XY_FILTER_BILINEAR                     = 0x01,
+	    SQ_TEX_XY_FILTER_BICUBIC                      = 0x02,
+	XY_MIN_FILTER_mask                                = 0x07 << 12,
+	XY_MIN_FILTER_shift                               = 12,
+/* 	    SQ_TEX_XY_FILTER_POINT                        = 0x00, */
+/* 	    SQ_TEX_XY_FILTER_BILINEAR                     = 0x01, */
+/* 	    SQ_TEX_XY_FILTER_BICUBIC                      = 0x02, */
+	Z_FILTER_mask                                     = 0x03 << 15,
+	Z_FILTER_shift                                    = 15,
+	    SQ_TEX_Z_FILTER_NONE                          = 0x00,
+	    SQ_TEX_Z_FILTER_POINT                         = 0x01,
+	    SQ_TEX_Z_FILTER_LINEAR                        = 0x02,
+	MIP_FILTER_mask                                   = 0x03 << 17,
+	MIP_FILTER_shift                                  = 17,
+/* 	    SQ_TEX_Z_FILTER_NONE                          = 0x00, */
+/* 	    SQ_TEX_Z_FILTER_POINT                         = 0x01, */
+/* 	    SQ_TEX_Z_FILTER_LINEAR                        = 0x02, */
+	BORDER_COLOR_TYPE_mask                            = 0x03 << 22,
+	BORDER_COLOR_TYPE_shift                           = 22,
+	    SQ_TEX_BORDER_COLOR_TRANS_BLACK               = 0x00,
+	    SQ_TEX_BORDER_COLOR_OPAQUE_BLACK              = 0x01,
+	    SQ_TEX_BORDER_COLOR_OPAQUE_WHITE              = 0x02,
+	    SQ_TEX_BORDER_COLOR_REGISTER                  = 0x03,
+	POINT_SAMPLING_CLAMP_bit                          = 1 << 24,
+	TEX_ARRAY_OVERRIDE_bit                            = 1 << 25,
+	DEPTH_COMPARE_FUNCTION_mask                       = 0x07 << 26,
+	DEPTH_COMPARE_FUNCTION_shift                      = 26,
+	    SQ_TEX_DEPTH_COMPARE_NEVER                    = 0x00,
+	    SQ_TEX_DEPTH_COMPARE_LESS                     = 0x01,
+	    SQ_TEX_DEPTH_COMPARE_EQUAL                    = 0x02,
+	    SQ_TEX_DEPTH_COMPARE_LESSEQUAL                = 0x03,
+	    SQ_TEX_DEPTH_COMPARE_GREATER                  = 0x04,
+	    SQ_TEX_DEPTH_COMPARE_NOTEQUAL                 = 0x05,
+	    SQ_TEX_DEPTH_COMPARE_GREATEREQUAL             = 0x06,
+	    SQ_TEX_DEPTH_COMPARE_ALWAYS                   = 0x07,
+	CHROMA_KEY_mask                                   = 0x03 << 29,
+	CHROMA_KEY_shift                                  = 29,
+	    SQ_TEX_CHROMA_KEY_DISABLED                    = 0x00,
+	    SQ_TEX_CHROMA_KEY_KILL                        = 0x01,
+	    SQ_TEX_CHROMA_KEY_BLEND                       = 0x02,
+	LOD_USES_MINOR_AXIS_bit                           = 1 << 31,
+    SQ_TEX_SAMPLER_WORD1_0                                = 0x0003c004,
+	MIN_LOD_mask                                      = 0x3ff << 0,
+	MIN_LOD_shift                                     = 0,
+	MAX_LOD_mask                                      = 0x3ff << 10,
+	MAX_LOD_shift                                     = 10,
+	SQ_TEX_SAMPLER_WORD1_0__LOD_BIAS_mask             = 0xfff << 20,
+	SQ_TEX_SAMPLER_WORD1_0__LOD_BIAS_shift            = 20,
+    SQ_TEX_SAMPLER_WORD2_0                                = 0x0003c008,
+	LOD_BIAS_SEC_mask                                 = 0xfff << 0,
+	LOD_BIAS_SEC_shift                                = 0,
+	MC_COORD_TRUNCATE_bit                             = 1 << 12,
+	SQ_TEX_SAMPLER_WORD2_0__FORCE_DEGAMMA_bit         = 1 << 13,
+	HIGH_PRECISION_FILTER_bit                         = 1 << 14,
+	PERF_MIP_mask                                     = 0x07 << 15,
+	PERF_MIP_shift                                    = 15,
+	PERF_Z_mask                                       = 0x03 << 18,
+	PERF_Z_shift                                      = 18,
+	FETCH_4_bit                                       = 1 << 26,
+	SAMPLE_IS_PCF_bit                                 = 1 << 27,
+	SQ_TEX_SAMPLER_WORD2_0__TYPE_bit                  = 1 << 31,
+    SQ_VTX_BASE_VTX_LOC                                   = 0x0003cff0,
+    SQ_VTX_START_INST_LOC                                 = 0x0003cff4,
+    SQ_LOOP_CONST_DX10_0                                  = 0x0003e200,
+    SQ_LOOP_CONST_0                                       = 0x0003e200,
+	SQ_LOOP_CONST_0__COUNT_mask                       = 0xfff << 0,
+	SQ_LOOP_CONST_0__COUNT_shift                      = 0,
+	INIT_mask                                         = 0xfff << 12,
+	INIT_shift                                        = 12,
+	INC_mask                                          = 0xff << 24,
+	INC_shift                                         = 24,
+    SQ_BOOL_CONST_0                                       = 0x0003e380,
+	SQ_BOOL_CONST_0_num                               = 3,
+
+} ;
+
+#endif /* _AUTOREGS */
+
diff --git a/src/mesa/drivers/dri/r600/r600_reg_r6xx.h b/src/mesa/drivers/dri/r600/r600_reg_r6xx.h
new file mode 100644
index 0000000000..f7702c46de
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_reg_r6xx.h
@@ -0,0 +1,492 @@
+/*
+ * RadeonHD R6xx, R7xx Register documentation
+ *
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ * Copyright (C) 2008-2009  Matthias Hopf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _R600_REG_R6xx_H_
+#define _R600_REG_R6xx_H_
+
+/*
+ * Registers for R6xx chips that are not documented yet
+ */
+
+enum {
+
+    MM_INDEX                                              = 0x0000,
+    MM_DATA                                               = 0x0004,
+
+    SRBM_STATUS                                           = 0x0e50,
+	RLC_RQ_PENDING_bit                                = 1 << 3,
+	RCU_RQ_PENDING_bit                                = 1 << 4,
+	GRBM_RQ_PENDING_bit                               = 1 << 5,
+	HI_RQ_PENDING_bit                                 = 1 << 6,
+	IO_EXTERN_SIGNAL_bit                              = 1 << 7,
+	VMC_BUSY_bit                                      = 1 << 8,
+	MCB_BUSY_bit                                      = 1 << 9,
+	MCDZ_BUSY_bit                                     = 1 << 10,
+	MCDY_BUSY_bit                                     = 1 << 11,
+	MCDX_BUSY_bit                                     = 1 << 12,
+	MCDW_BUSY_bit                                     = 1 << 13,
+	SEM_BUSY_bit                                      = 1 << 14,
+	SRBM_STATUS__RLC_BUSY_bit                         = 1 << 15,
+	PDMA_BUSY_bit                                     = 1 << 16,
+	IH_BUSY_bit                                       = 1 << 17,
+	CSC_BUSY_bit                                      = 1 << 20,
+	CMC7_BUSY_bit                                     = 1 << 21,
+	CMC6_BUSY_bit                                     = 1 << 22,
+	CMC5_BUSY_bit                                     = 1 << 23,
+	CMC4_BUSY_bit                                     = 1 << 24,
+	CMC3_BUSY_bit                                     = 1 << 25,
+	CMC2_BUSY_bit                                     = 1 << 26,
+	CMC1_BUSY_bit                                     = 1 << 27,
+	CMC0_BUSY_bit                                     = 1 << 28,
+	BIF_BUSY_bit                                      = 1 << 29,
+	IDCT_BUSY_bit                                     = 1 << 30,
+
+    SRBM_READ_ERROR                                       = 0x0e98,
+	READ_ADDRESS_mask                                 = 0xffff << 2,
+	READ_ADDRESS_shift                                = 2,
+	READ_REQUESTER_HI_bit                             = 1 << 24,
+	READ_REQUESTER_GRBM_bit                           = 1 << 25,
+	READ_REQUESTER_RCU_bit                            = 1 << 26,
+	READ_REQUESTER_RLC_bit                            = 1 << 27,
+	READ_ERROR_bit                                    = 1 << 31,
+
+    SRBM_INT_STATUS                                       = 0x0ea4,
+	RDERR_INT_STAT_bit                                = 1 << 0,
+	GFX_CNTX_SWITCH_INT_STAT_bit                      = 1 << 1,
+    SRBM_INT_ACK                                          = 0x0ea8,
+	RDERR_INT_ACK_bit                                 = 1 << 0,
+	GFX_CNTX_SWITCH_INT_ACK_bit                       = 1 << 1,
+
+    R6XX_MC_VM_FB_LOCATION                                = 0x2180,
+
+    VENDOR_DEVICE_ID                                      = 0x4000,
+
+    D1GRPH_PRIMARY_SURFACE_ADDRESS                        = 0x6110,
+    D1GRPH_PITCH                                          = 0x6120,
+    D1GRPH_Y_END                                          = 0x6138,
+
+    GRBM_STATUS                                           = 0x8010,
+	CMDFIFO_AVAIL_mask                                = 0x1f << 0,
+	CMDFIFO_AVAIL_shift                               = 0,
+	SRBM_RQ_PENDING_bit                               = 1 << 5,
+	CP_RQ_PENDING_bit                                 = 1 << 6,
+	CF_RQ_PENDING_bit                                 = 1 << 7,
+	PF_RQ_PENDING_bit                                 = 1 << 8,
+	GRBM_EE_BUSY_bit                                  = 1 << 10,
+	GRBM_STATUS__VC_BUSY_bit                          = 1 << 11,
+	DB03_CLEAN_bit                                    = 1 << 12,
+	CB03_CLEAN_bit                                    = 1 << 13,
+	VGT_BUSY_NO_DMA_bit                               = 1 << 16,
+	GRBM_STATUS__VGT_BUSY_bit                         = 1 << 17,
+	TA03_BUSY_bit                                     = 1 << 18,
+	GRBM_STATUS__TC_BUSY_bit                          = 1 << 19,
+	SX_BUSY_bit                                       = 1 << 20,
+	SH_BUSY_bit                                       = 1 << 21,
+	SPI03_BUSY_bit                                    = 1 << 22,
+	SMX_BUSY_bit                                      = 1 << 23,
+	SC_BUSY_bit                                       = 1 << 24,
+	PA_BUSY_bit                                       = 1 << 25,
+	DB03_BUSY_bit                                     = 1 << 26,
+	CR_BUSY_bit                                       = 1 << 27,
+	CP_COHERENCY_BUSY_bit                             = 1 << 28,
+	GRBM_STATUS__CP_BUSY_bit                          = 1 << 29,
+	CB03_BUSY_bit                                     = 1 << 30,
+	GUI_ACTIVE_bit                                    = 1 << 31,
+    GRBM_STATUS2                                          = 0x8014,
+	CR_CLEAN_bit                                      = 1 << 0,
+	SMX_CLEAN_bit                                     = 1 << 1,
+	SPI0_BUSY_bit                                     = 1 << 8,
+	SPI1_BUSY_bit                                     = 1 << 9,
+	SPI2_BUSY_bit                                     = 1 << 10,
+	SPI3_BUSY_bit                                     = 1 << 11,
+	TA0_BUSY_bit                                      = 1 << 12,
+	TA1_BUSY_bit                                      = 1 << 13,
+	TA2_BUSY_bit                                      = 1 << 14,
+	TA3_BUSY_bit                                      = 1 << 15,
+	DB0_BUSY_bit                                      = 1 << 16,
+	DB1_BUSY_bit                                      = 1 << 17,
+	DB2_BUSY_bit                                      = 1 << 18,
+	DB3_BUSY_bit                                      = 1 << 19,
+	CB0_BUSY_bit                                      = 1 << 20,
+	CB1_BUSY_bit                                      = 1 << 21,
+	CB2_BUSY_bit                                      = 1 << 22,
+	CB3_BUSY_bit                                      = 1 << 23,
+    GRBM_SOFT_RESET                                       = 0x8020,
+	SOFT_RESET_CP_bit                                 = 1 << 0,
+	SOFT_RESET_CB_bit                                 = 1 << 1,
+	SOFT_RESET_CR_bit                                 = 1 << 2,
+	SOFT_RESET_DB_bit                                 = 1 << 3,
+	SOFT_RESET_PA_bit                                 = 1 << 5,
+	SOFT_RESET_SC_bit                                 = 1 << 6,
+	SOFT_RESET_SMX_bit                                = 1 << 7,
+	SOFT_RESET_SPI_bit                                = 1 << 8,
+	SOFT_RESET_SH_bit                                 = 1 << 9,
+	SOFT_RESET_SX_bit                                 = 1 << 10,
+	SOFT_RESET_TC_bit                                 = 1 << 11,
+	SOFT_RESET_TA_bit                                 = 1 << 12,
+	SOFT_RESET_VC_bit                                 = 1 << 13,
+	SOFT_RESET_VGT_bit                                = 1 << 14,
+	SOFT_RESET_GRBM_GCA_bit                           = 1 << 15,
+
+    WAIT_UNTIL                                            = 0x8040,
+	WAIT_CP_DMA_IDLE_bit                              = 1 << 8,
+	WAIT_CMDFIFO_bit                                  = 1 << 10,
+	WAIT_2D_IDLE_bit                                  = 1 << 14,
+	WAIT_3D_IDLE_bit                                  = 1 << 15,
+	WAIT_2D_IDLECLEAN_bit                             = 1 << 16,
+	WAIT_3D_IDLECLEAN_bit                             = 1 << 17,
+	WAIT_EXTERN_SIG_bit                               = 1 << 19,
+	CMDFIFO_ENTRIES_mask                              = 0x1f << 20,
+	CMDFIFO_ENTRIES_shift                             = 20,
+
+    GRBM_READ_ERROR                                       = 0x8058,
+/* 	READ_ADDRESS_mask                                 = 0xffff << 2, */
+/* 	READ_ADDRESS_shift                                = 2, */
+	READ_REQUESTER_SRBM_bit                           = 1 << 28,
+	READ_REQUESTER_CP_bit                             = 1 << 29,
+	READ_REQUESTER_WU_POLL_bit                        = 1 << 30,
+/* 	READ_ERROR_bit                                    = 1 << 31, */
+
+    SCRATCH_REG0		                          = 0x8500,
+    SCRATCH_REG1		                          = 0x8504,
+    SCRATCH_REG2		                          = 0x8508,
+    SCRATCH_REG3		                          = 0x850c,
+    SCRATCH_REG4		                          = 0x8510,
+    SCRATCH_REG5		                          = 0x8514,
+    SCRATCH_REG6		                          = 0x8518,
+    SCRATCH_REG7		                          = 0x851c,
+    SCRATCH_UMSK		                          = 0x8540,
+    SCRATCH_ADDR		                          = 0x8544,
+
+    CP_COHER_CNTL                                         = 0x85f0,
+	DEST_BASE_0_ENA_bit                               = 1 << 0,
+	DEST_BASE_1_ENA_bit                               = 1 << 1,
+	SO0_DEST_BASE_ENA_bit                             = 1 << 2,
+	SO1_DEST_BASE_ENA_bit                             = 1 << 3,
+	SO2_DEST_BASE_ENA_bit                             = 1 << 4,
+	SO3_DEST_BASE_ENA_bit                             = 1 << 5,
+	CB0_DEST_BASE_ENA_bit                             = 1 << 6,
+	CB1_DEST_BASE_ENA_bit                             = 1 << 7,
+	CB2_DEST_BASE_ENA_bit                             = 1 << 8,
+	CB3_DEST_BASE_ENA_bit                             = 1 << 9,
+	CB4_DEST_BASE_ENA_bit                             = 1 << 10,
+	CB5_DEST_BASE_ENA_bit                             = 1 << 11,
+	CB6_DEST_BASE_ENA_bit                             = 1 << 12,
+	CB7_DEST_BASE_ENA_bit                             = 1 << 13,
+	DB_DEST_BASE_ENA_bit                              = 1 << 14,
+	CR_DEST_BASE_ENA_bit                              = 1 << 15,
+	TC_ACTION_ENA_bit                                 = 1 << 23,
+	VC_ACTION_ENA_bit                                 = 1 << 24,
+	CB_ACTION_ENA_bit                                 = 1 << 25,
+	DB_ACTION_ENA_bit                                 = 1 << 26,
+	SH_ACTION_ENA_bit                                 = 1 << 27,
+	SMX_ACTION_ENA_bit                                = 1 << 28,
+	CR0_ACTION_ENA_bit                                = 1 << 29,
+	CR1_ACTION_ENA_bit                                = 1 << 30,
+	CR2_ACTION_ENA_bit                                = 1 << 31,
+    CP_COHER_SIZE                                         = 0x85f4,
+    CP_COHER_BASE                                         = 0x85f8,
+    CP_COHER_STATUS                                       = 0x85fc,
+	MATCHING_GFX_CNTX_mask                            = 0xff << 0,
+	MATCHING_GFX_CNTX_shift                           = 0,
+	MATCHING_CR_CNTX_mask                             = 0xffff << 8,
+	MATCHING_CR_CNTX_shift                            = 8,
+	STATUS_bit                                        = 1 << 31,
+
+    CP_STALLED_STAT1                                      = 0x8674,
+	RBIU_TO_DMA_NOT_RDY_TO_RCV_bit                    = 1 << 0,
+	RBIU_TO_IBS_NOT_RDY_TO_RCV_bit                    = 1 << 1,
+	RBIU_TO_SEM_NOT_RDY_TO_RCV_bit                    = 1 << 2,
+	RBIU_TO_2DREGS_NOT_RDY_TO_RCV_bit                 = 1 << 3,
+	RBIU_TO_MEMWR_NOT_RDY_TO_RCV_bit                  = 1 << 4,
+	RBIU_TO_MEMRD_NOT_RDY_TO_RCV_bit                  = 1 << 5,
+	RBIU_TO_EOPD_NOT_RDY_TO_RCV_bit                   = 1 << 6,
+	RBIU_TO_RECT_NOT_RDY_TO_RCV_bit                   = 1 << 7,
+	RBIU_TO_STRMO_NOT_RDY_TO_RCV_bit                  = 1 << 8,
+	RBIU_TO_PSTAT_NOT_RDY_TO_RCV_bit                  = 1 << 9,
+	MIU_WAITING_ON_RDREQ_FREE_bit                     = 1 << 16,
+	MIU_WAITING_ON_WRREQ_FREE_bit                     = 1 << 17,
+	MIU_NEEDS_AVAIL_WRREQ_PHASE_bit                   = 1 << 18,
+	RCIU_WAITING_ON_GRBM_FREE_bit                     = 1 << 24,
+	RCIU_WAITING_ON_VGT_FREE_bit                      = 1 << 25,
+	RCIU_STALLED_ON_ME_READ_bit                       = 1 << 26,
+	RCIU_STALLED_ON_DMA_READ_bit                      = 1 << 27,
+	RCIU_HALTED_BY_REG_VIOLATION_bit                  = 1 << 28,
+    CP_STALLED_STAT2                                      = 0x8678,
+	PFP_TO_CSF_NOT_RDY_TO_RCV_bit                     = 1 << 0,
+	PFP_TO_MEQ_NOT_RDY_TO_RCV_bit                     = 1 << 1,
+	PFP_TO_VGT_NOT_RDY_TO_RCV_bit                     = 1 << 2,
+	PFP_HALTED_BY_INSTR_VIOLATION_bit                 = 1 << 3,
+	MULTIPASS_IB_PENDING_IN_PFP_bit                   = 1 << 4,
+	ME_BRUSH_WC_NOT_RDY_TO_RCV_bit                    = 1 << 8,
+	ME_STALLED_ON_BRUSH_LOGIC_bit                     = 1 << 9,
+	CR_CNTX_NOT_AVAIL_TO_ME_bit                       = 1 << 10,
+	GFX_CNTX_NOT_AVAIL_TO_ME_bit                      = 1 << 11,
+	ME_RCIU_NOT_RDY_TO_RCV_bit                        = 1 << 12,
+	ME_TO_CONST_NOT_RDY_TO_RCV_bit                    = 1 << 13,
+	ME_WAITING_DATA_FROM_PFP_bit                      = 1 << 14,
+	ME_WAITING_ON_PARTIAL_FLUSH_bit                   = 1 << 15,
+	RECT_FIFO_NEEDS_CR_RECT_DONE_bit                  = 1 << 16,
+	RECT_FIFO_NEEDS_WR_CONFIRM_bit                    = 1 << 17,
+	EOPD_FIFO_NEEDS_SC_EOP_DONE_bit                   = 1 << 18,
+	EOPD_FIFO_NEEDS_SMX_EOP_DONE_bit                  = 1 << 19,
+	EOPD_FIFO_NEEDS_WR_CONFIRM_bit                    = 1 << 20,
+	EOPD_FIFO_NEEDS_SIGNAL_SEM_bit                    = 1 << 21,
+	SO_NUMPRIM_FIFO_NEEDS_SOADDR_bit                  = 1 << 22,
+	SO_NUMPRIM_FIFO_NEEDS_NUMPRIM_bit                 = 1 << 23,
+	PIPE_STATS_FIFO_NEEDS_SAMPLE_bit                  = 1 << 24,
+	SURF_SYNC_NEEDS_IDLE_CNTXS_bit                    = 1 << 30,
+	SURF_SYNC_NEEDS_ALL_CLEAN_bit                     = 1 << 31,
+    CP_BUSY_STAT                                          = 0x867c,
+	REG_BUS_FIFO_BUSY_bit                             = 1 << 0,
+	RING_FETCHING_DATA_bit                            = 1 << 1,
+	INDR1_FETCHING_DATA_bit                           = 1 << 2,
+	INDR2_FETCHING_DATA_bit                           = 1 << 3,
+	STATE_FETCHING_DATA_bit                           = 1 << 4,
+	PRED_FETCHING_DATA_bit                            = 1 << 5,
+	COHER_CNTR_NEQ_ZERO_bit                           = 1 << 6,
+	PFP_PARSING_PACKETS_bit                           = 1 << 7,
+	ME_PARSING_PACKETS_bit                            = 1 << 8,
+	RCIU_PFP_BUSY_bit                                 = 1 << 9,
+	RCIU_ME_BUSY_bit                                  = 1 << 10,
+	OUTSTANDING_READ_TAGS_bit                         = 1 << 11,
+	SEM_CMDFIFO_NOT_EMPTY_bit                         = 1 << 12,
+	SEM_FAILED_AND_HOLDING_bit                        = 1 << 13,
+	SEM_POLLING_FOR_PASS_bit                          = 1 << 14,
+	_3D_BUSY_bit                                      = 1 << 15,
+	_2D_BUSY_bit                                      = 1 << 16,
+    CP_STAT                                               = 0x8680,
+	CSF_RING_BUSY_bit                                 = 1 << 0,
+	CSF_WPTR_POLL_BUSY_bit                            = 1 << 1,
+	CSF_INDIRECT1_BUSY_bit                            = 1 << 2,
+	CSF_INDIRECT2_BUSY_bit                            = 1 << 3,
+	CSF_STATE_BUSY_bit                                = 1 << 4,
+	CSF_PREDICATE_BUSY_bit                            = 1 << 5,
+	CSF_BUSY_bit                                      = 1 << 6,
+	MIU_RDREQ_BUSY_bit                                = 1 << 7,
+	MIU_WRREQ_BUSY_bit                                = 1 << 8,
+	ROQ_RING_BUSY_bit                                 = 1 << 9,
+	ROQ_INDIRECT1_BUSY_bit                            = 1 << 10,
+	ROQ_INDIRECT2_BUSY_bit                            = 1 << 11,
+	ROQ_STATE_BUSY_bit                                = 1 << 12,
+	ROQ_PREDICATE_BUSY_bit                            = 1 << 13,
+	ROQ_ALIGN_BUSY_bit                                = 1 << 14,
+	PFP_BUSY_bit                                      = 1 << 15,
+	MEQ_BUSY_bit                                      = 1 << 16,
+	ME_BUSY_bit                                       = 1 << 17,
+	QUERY_BUSY_bit                                    = 1 << 18,
+	SEMAPHORE_BUSY_bit                                = 1 << 19,
+	INTERRUPT_BUSY_bit                                = 1 << 20,
+	SURFACE_SYNC_BUSY_bit                             = 1 << 21,
+	DMA_BUSY_bit                                      = 1 << 22,
+	RCIU_BUSY_bit                                     = 1 << 23,
+	CP_STAT__CP_BUSY_bit                              = 1 << 31,
+
+    CP_ME_CNTL                                            = 0x86d8,
+	ME_STATMUX_mask                                   = 0xff << 0,
+	ME_STATMUX_shift                                  = 0,
+	ME_HALT_bit                                       = 1 << 28,
+    CP_ME_STATUS                                          = 0x86dc,
+
+    CP_RB_RPTR                                            = 0x8700,
+	RB_RPTR_mask                                      = 0xfffff << 0,
+	RB_RPTR_shift                                     = 0,
+    CP_RB_WPTR_DELAY                                      = 0x8704,
+	PRE_WRITE_TIMER_mask                              = 0xfffffff << 0,
+	PRE_WRITE_TIMER_shift                             = 0,
+	PRE_WRITE_LIMIT_mask                              = 0x0f << 28,
+	PRE_WRITE_LIMIT_shift                             = 28,
+
+    CP_ROQ_RB_STAT                                        = 0x8780,
+	ROQ_RPTR_PRIMARY_mask                             = 0x3ff << 0,
+	ROQ_RPTR_PRIMARY_shift                            = 0,
+	ROQ_WPTR_PRIMARY_mask                             = 0x3ff << 16,
+	ROQ_WPTR_PRIMARY_shift                            = 16,
+    CP_ROQ_IB1_STAT                                       = 0x8784,
+	ROQ_RPTR_INDIRECT1_mask                           = 0x3ff << 0,
+	ROQ_RPTR_INDIRECT1_shift                          = 0,
+	ROQ_WPTR_INDIRECT1_mask                           = 0x3ff << 16,
+	ROQ_WPTR_INDIRECT1_shift                          = 16,
+    CP_ROQ_IB2_STAT                                       = 0x8788,
+	ROQ_RPTR_INDIRECT2_mask                           = 0x3ff << 0,
+	ROQ_RPTR_INDIRECT2_shift                          = 0,
+	ROQ_WPTR_INDIRECT2_mask                           = 0x3ff << 16,
+	ROQ_WPTR_INDIRECT2_shift                          = 16,
+
+    CP_MEQ_STAT                                           = 0x8794,
+	MEQ_RPTR_mask                                     = 0x3ff << 0,
+	MEQ_RPTR_shift                                    = 0,
+	MEQ_WPTR_mask                                     = 0x3ff << 16,
+	MEQ_WPTR_shift                                    = 16,
+
+    CC_GC_SHADER_PIPE_CONFIG                              = 0x8950,
+	INACTIVE_QD_PIPES_mask                            = 0xff << 8,
+	INACTIVE_QD_PIPES_shift                           = 8,
+	    R6XX_MAX_QD_PIPES                             = 8,
+	INACTIVE_SIMDS_mask                               = 0xff << 16,
+	INACTIVE_SIMDS_shift                              = 16,
+	    R6XX_MAX_SIMDS                                = 8,
+    GC_USER_SHADER_PIPE_CONFIG                            = 0x8954,
+
+    VC_ENHANCE                                            = 0x9714,
+    DB_DEBUG                                              = 0x9830,
+        PREZ_MUST_WAIT_FOR_POSTZ_DONE                     = 1 << 31,
+
+    DB_WATERMARKS                                         = 0x00009838,
+	DEPTH_FREE_mask                                   = 0x1f << 0,
+	DEPTH_FREE_shift                                  = 0,
+	DEPTH_FLUSH_mask                                  = 0x3f << 5,
+	DEPTH_FLUSH_shift                                 = 5,
+	FORCE_SUMMARIZE_mask                              = 0x0f << 11,
+	FORCE_SUMMARIZE_shift                             = 11,
+	DEPTH_PENDING_FREE_mask                           = 0x1f << 15,
+	DEPTH_PENDING_FREE_shift                          = 15,
+	DEPTH_CACHELINE_FREE_mask                         = 0x1f << 20,
+	DEPTH_CACHELINE_FREE_shift                        = 20,
+	EARLY_Z_PANIC_DISABLE_bit                         = 1 << 25,
+	LATE_Z_PANIC_DISABLE_bit                          = 1 << 26,
+	RE_Z_PANIC_DISABLE_bit                            = 1 << 27,
+	DB_EXTRA_DEBUG_mask                               = 0x0f << 28,
+	DB_EXTRA_DEBUG_shift                              = 28,
+
+    CP_RB_BASE                                            = 0xc100,
+    CP_RB_CNTL                                            = 0xc104,
+        RB_BUFSZ_mask                                     = 0x3f << 0,	/* no RB_BUFSZ_shift is defined here; the field starts at bit 0 */
+    CP_RB_WPTR                                            = 0xc114,	/* NOTE: listed out of address order (0xc114 precedes 0xc108 below) */
+	RB_WPTR_mask                                      = 0xfffff << 0,
+	RB_WPTR_shift                                     = 0,
+    CP_RB_RPTR_WR                                         = 0xc108,
+	RB_RPTR_WR_mask                                   = 0xfffff << 0,
+	RB_RPTR_WR_shift                                  = 0,
+
+    CP_INT_STATUS                                         = 0xc128,
+	DISABLE_CNTX_SWITCH_INT_STAT_bit                  = 1 << 0,
+	ENABLE_CNTX_SWITCH_INT_STAT_bit                   = 1 << 1,
+	SEM_SIGNAL_INT_STAT_bit                           = 1 << 18,
+	CNTX_BUSY_INT_STAT_bit                            = 1 << 19,
+	CNTX_EMPTY_INT_STAT_bit                           = 1 << 20,
+	WAITMEM_SEM_INT_STAT_bit                          = 1 << 21,
+	PRIV_INSTR_INT_STAT_bit                           = 1 << 22,
+	PRIV_REG_INT_STAT_bit                             = 1 << 23,
+	OPCODE_ERROR_INT_STAT_bit                         = 1 << 24,
+	SCRATCH_INT_STAT_bit                              = 1 << 25,
+	TIME_STAMP_INT_STAT_bit                           = 1 << 26,
+	RESERVED_BIT_ERROR_INT_STAT_bit                   = 1 << 27,
+	DMA_INT_STAT_bit                                  = 1 << 28,
+	IB2_INT_STAT_bit                                  = 1 << 29,
+	IB1_INT_STAT_bit                                  = 1 << 30,
+	RB_INT_STAT_bit                                   = 1 << 31,
+
+//  SX_ALPHA_TEST_CONTROL                                 = 0x00028410,  (commented out here; only unshifted field values follow)
+	ALPHA_FUNC__REF_NEVER                             = 0,
+	ALPHA_FUNC__REF_ALWAYS                            = 7,
+//  DB_SHADER_CONTROL                                     = 0x0002880c,  (commented out here; only an unshifted field value follows)
+	Z_ORDER__EARLY_Z_THEN_LATE_Z                      = 2,
+//  PA_SU_SC_MODE_CNTL                                    = 0x00028814,  (commented out here; only unshifted field values follow)
+//	POLY_MODE_mask                                    = 0x03 << 3,
+	POLY_MODE__TRIANGLES = 0, POLY_MODE__DUAL_MODE,	/* implicit: POLY_MODE__DUAL_MODE = 1 */
+//	POLYMODE_FRONT_PTYPE_mask                         = 0x07 << 5,
+	POLYMODE_PTYPE__POINTS = 0, POLYMODE_PTYPE__LINES, POLYMODE_PTYPE__TRIANGLES,	/* implicit: LINES = 1, TRIANGLES = 2 */
+    PA_SC_AA_SAMPLE_LOCS_8S_WD1_M                         = 0x00028c20,
+    DB_SRESULTS_COMPARE_STATE0                            = 0x00028d28,	/* See autoregs: DB_SRESULTS_COMPARE_STATE1 */
+//  DB_SRESULTS_COMPARE_STATE1                            = 0x00028d2c,
+    DB_ALPHA_TO_MASK                                      = 0x00028d44,	/* four 2-bit OFFSETn fields; each _mask must equal 0x03 << its _shift */
+	ALPHA_TO_MASK_ENABLE                              = 1 << 0,
+	ALPHA_TO_MASK_OFFSET0_mask                        = 0x03 << 8,	/* bits 9:8 */
+	ALPHA_TO_MASK_OFFSET0_shift                       = 8,
+	ALPHA_TO_MASK_OFFSET1_mask                        = 0x03 << 10,	/* bits 11:10 (was 0x03 << 8, aliasing OFFSET0) */
+	ALPHA_TO_MASK_OFFSET1_shift                       = 10,
+	ALPHA_TO_MASK_OFFSET2_mask                        = 0x03 << 12,	/* bits 13:12 (was 0x03 << 8, aliasing OFFSET0) */
+	ALPHA_TO_MASK_OFFSET2_shift                       = 12,
+	ALPHA_TO_MASK_OFFSET3_mask                        = 0x03 << 14,	/* bits 15:14 (was 0x03 << 8, aliasing OFFSET0) */
+	ALPHA_TO_MASK_OFFSET3_shift                       = 14,
+
+//  SQ_VTX_CONSTANT_WORD2_0                               = 0x00038008,
+//    	SQ_VTX_CONSTANT_WORD2_0__DATA_FORMAT_mask         = 0x3f << 20,  (FMT_* below are presumably the unshifted values of this 6-bit field -- confirm against autoregs)
+	FMT_INVALID=0,      FMT_8,          FMT_4_4,            FMT_3_3_2,	/* 0..3 (4 is skipped) */
+	                    FMT_16=5,       FMT_16_FLOAT,       FMT_8_8,	/* 5..7 */
+	FMT_5_6_5,          FMT_6_5_5,      FMT_1_5_5_5,        FMT_4_4_4_4,	/* 8..11 */
+	FMT_5_5_5_1,        FMT_32,         FMT_32_FLOAT,       FMT_16_16,	/* 12..15 */
+	FMT_16_16_FLOAT=16, FMT_8_24,       FMT_8_24_FLOAT,     FMT_24_8,	/* 16..19 */
+	FMT_24_8_FLOAT,     FMT_10_11_11,   FMT_10_11_11_FLOAT, FMT_11_11_10,	/* 20..23 */
+	FMT_11_11_10_FLOAT, FMT_2_10_10_10, FMT_8_8_8_8,        FMT_10_10_10_2,	/* 24..27 */
+	FMT_X24_8_32_FLOAT, FMT_32_32,      FMT_32_32_FLOAT,    FMT_16_16_16_16,	/* 28..31 */
+	FMT_16_16_16_16_FLOAT=32,           FMT_32_32_32_32=34, FMT_32_32_32_32_FLOAT,	/* 32, 34, 35 (33 is skipped) */
+	                    FMT_1 = 37,                         FMT_GB_GR=39,	/* 37, 39 (36 and 38 are skipped) */
+	FMT_BG_RG,          FMT_32_AS_8,    FMT_32_AS_8_8,      FMT_5_9_9_9_SHAREDEXP,	/* 40..43 */
+	FMT_8_8_8,          FMT_16_16_16,   FMT_16_16_16_FLOAT, FMT_32_32_32,	/* 44..47 */
+	FMT_32_32_32_FLOAT=48,	/* 48 */
+
+//  High level register file lengths: per file, a base alias, per-stage entry counts (_num), their total (_all_num), an _offset, and cumulative per-stage start indices
+    SQ_ALU_CONSTANT                                       = SQ_ALU_CONSTANT0_0,	/* 256 PS, 256 VS */
+    SQ_ALU_CONSTANT_ps_num                                = 256,
+    SQ_ALU_CONSTANT_vs_num                                = 256,
+    SQ_ALU_CONSTANT_all_num                               = 512,	/* = ps_num + vs_num */
+    SQ_ALU_CONSTANT_offset                                = 16,	/* NOTE(review): presumably the byte stride between consecutive entries -- confirm */
+    SQ_ALU_CONSTANT_ps                                    = 0,
+    SQ_ALU_CONSTANT_vs                                    = SQ_ALU_CONSTANT_ps + SQ_ALU_CONSTANT_ps_num,
+    SQ_TEX_RESOURCE                                       = SQ_TEX_RESOURCE_WORD0_0,	/* 160 PS, 160 VS, 16 FS, 160 GS */
+    SQ_TEX_RESOURCE_ps_num                                = 160,
+    SQ_TEX_RESOURCE_vs_num                                = 160,
+    SQ_TEX_RESOURCE_fs_num                                = 16,
+    SQ_TEX_RESOURCE_gs_num                                = 160,
+    SQ_TEX_RESOURCE_all_num                               = 496,	/* = ps_num + vs_num + fs_num + gs_num */
+    SQ_TEX_RESOURCE_offset                                = 28,	/* NOTE(review): presumably the byte stride between consecutive entries -- confirm */
+    SQ_TEX_RESOURCE_ps                                    = 0,
+    SQ_TEX_RESOURCE_vs                                    = SQ_TEX_RESOURCE_ps + SQ_TEX_RESOURCE_ps_num,
+    SQ_TEX_RESOURCE_fs                                    = SQ_TEX_RESOURCE_vs + SQ_TEX_RESOURCE_vs_num,
+    SQ_TEX_RESOURCE_gs                                    = SQ_TEX_RESOURCE_fs + SQ_TEX_RESOURCE_fs_num,
+    SQ_VTX_RESOURCE                                       = SQ_VTX_CONSTANT_WORD0_0,	/* 160 PS, 160 VS, 16 FS, 160 GS */
+    SQ_VTX_RESOURCE_ps_num                                = 160,
+    SQ_VTX_RESOURCE_vs_num                                = 160,
+    SQ_VTX_RESOURCE_fs_num                                = 16,
+    SQ_VTX_RESOURCE_gs_num                                = 160,
+    SQ_VTX_RESOURCE_all_num                               = 496,	/* = ps_num + vs_num + fs_num + gs_num */
+    SQ_VTX_RESOURCE_offset                                = 28,	/* NOTE(review): presumably the byte stride between consecutive entries -- confirm */
+    SQ_VTX_RESOURCE_ps                                    = 0,
+    SQ_VTX_RESOURCE_vs                                    = SQ_VTX_RESOURCE_ps + SQ_VTX_RESOURCE_ps_num,
+    SQ_VTX_RESOURCE_fs                                    = SQ_VTX_RESOURCE_vs + SQ_VTX_RESOURCE_vs_num,
+    SQ_VTX_RESOURCE_gs                                    = SQ_VTX_RESOURCE_fs + SQ_VTX_RESOURCE_fs_num,
+    SQ_TEX_SAMPLER_WORD                                   = SQ_TEX_SAMPLER_WORD0_0,	/* 18 per PS, VS, GS */
+    SQ_TEX_SAMPLER_WORD_ps_num                            = 18,
+    SQ_TEX_SAMPLER_WORD_vs_num                            = 18,
+    SQ_TEX_SAMPLER_WORD_gs_num                            = 18,
+    SQ_TEX_SAMPLER_WORD_all_num                           = 54,	/* = ps_num + vs_num + gs_num */
+    SQ_TEX_SAMPLER_WORD_offset                            = 12,	/* NOTE(review): presumably the byte stride between consecutive entries -- confirm */
+    SQ_TEX_SAMPLER_WORD_ps                                = 0,
+    SQ_TEX_SAMPLER_WORD_vs                                = SQ_TEX_SAMPLER_WORD_ps + SQ_TEX_SAMPLER_WORD_ps_num,
+    SQ_TEX_SAMPLER_WORD_gs                                = SQ_TEX_SAMPLER_WORD_vs + SQ_TEX_SAMPLER_WORD_vs_num,
+    SQ_LOOP_CONST                                         = SQ_LOOP_CONST_0,		/* 32 per PS, VS, GS */
+    SQ_LOOP_CONST_ps_num                                  = 32,
+    SQ_LOOP_CONST_vs_num                                  = 32,
+    SQ_LOOP_CONST_gs_num                                  = 32,
+    SQ_LOOP_CONST_all_num                                 = 96,	/* = ps_num + vs_num + gs_num */
+    SQ_LOOP_CONST_offset                                  = 4,	/* NOTE(review): presumably the byte stride between consecutive entries -- confirm */
+    SQ_LOOP_CONST_ps                                      = 0,
+    SQ_LOOP_CONST_vs                                      = SQ_LOOP_CONST_ps + SQ_LOOP_CONST_ps_num,
+    SQ_LOOP_CONST_gs                                      = SQ_LOOP_CONST_vs + SQ_LOOP_CONST_vs_num,
+} ;
+
+
+#endif
diff --git a/src/mesa/drivers/dri/r600/r600_reg_r7xx.h b/src/mesa/drivers/dri/r600/r600_reg_r7xx.h
new file mode 100644
index 0000000000..e5c01c861a
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_reg_r7xx.h
@@ -0,0 +1,149 @@
+/*
+ * RadeonHD R6xx, R7xx Register documentation
+ *
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ * Copyright (C) 2008-2009  Matthias Hopf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _R600_REG_R7xx_H_
+#define _R600_REG_R7xx_H_
+
+/*
+ * Register update for R7xx chips
+ */
+
+enum {
+    /* Entries commented out with // name the register or field that the values below them belong to; presumably they are defined in the base R6xx header -- TODO confirm. */
+    R7XX_MC_VM_FB_LOCATION                                = 0x00002024,
+
+//  GRBM_STATUS                                           = 0x00008010,
+	R7XX_TA_BUSY_bit                                  = 1 << 14,
+
+    R7xx_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ                     = 0x00008d8c,
+	RING0_OFFSET_mask                                 = 0xff << 0,
+	RING0_OFFSET_shift                                = 0,
+	ISOLATE_ES_ENABLE_bit                             = 1 << 12,
+	ISOLATE_GS_ENABLE_bit                             = 1 << 13,
+	VS_PC_LIMIT_ENABLE_bit                            = 1 << 14,
+
+//  SQ_ALU_WORD0                                          = 0x00008dfc,
+//	SRC0_SEL_mask                                     = 0x1ff << 0,
+// 	SRC1_SEL_mask                                     = 0x1ff << 13,
+	    R7xx_SQ_ALU_SRC_1_DBL_L                       = 0xf4,
+	    R7xx_SQ_ALU_SRC_1_DBL_M                       = 0xf5,
+	    R7xx_SQ_ALU_SRC_0_5_DBL_L                     = 0xf6,
+	    R7xx_SQ_ALU_SRC_0_5_DBL_M                     = 0xf7,
+// 	INDEX_MODE_mask                                   = 0x07 << 26,
+	    R7xx_SQ_INDEX_GLOBAL                          = 0x05,
+	    R7xx_SQ_INDEX_GLOBAL_AR_X                     = 0x06,
+    R6xx_SQ_ALU_WORD1_OP2                                 = 0x00008dfc,
+    R7xx_SQ_ALU_WORD1_OP2_V2                              = 0x00008dfc,	/* same offset as R6xx_SQ_ALU_WORD1_OP2 */
+	R6xx_FOG_MERGE_bit                                = 1 << 5,
+	R6xx_OMOD_mask                                    = 0x03 << 6,
+	R7xx_OMOD_mask                                    = 0x03 << 5,	/* one bit lower than R6xx_OMOD_mask */
+	R6xx_OMOD_shift                                   = 6,
+	R7xx_OMOD_shift                                   = 5,
+	R6xx_SQ_ALU_WORD1_OP2__ALU_INST_mask              = 0x3ff << 8,
+	R7xx_SQ_ALU_WORD1_OP2_V2__ALU_INST_mask           = 0x7ff << 7,
+	R6xx_SQ_ALU_WORD1_OP2__ALU_INST_shift             = 8,
+	R7xx_SQ_ALU_WORD1_OP2_V2__ALU_INST_shift          = 7,
+	    R7xx_SQ_OP2_INST_FREXP_64                     = 0x07,
+	    R7xx_SQ_OP2_INST_ADD_64                       = 0x17,
+	    R7xx_SQ_OP2_INST_MUL_64                       = 0x1b,
+	    R7xx_SQ_OP2_INST_FLT64_TO_FLT32               = 0x1c,
+	    R7xx_SQ_OP2_INST_FLT32_TO_FLT64               = 0x1d,
+	    R7xx_SQ_OP2_INST_LDEXP_64                     = 0x7a,
+	    R7xx_SQ_OP2_INST_FRACT_64                     = 0x7b,
+	    R7xx_SQ_OP2_INST_PRED_SETGT_64                = 0x7c,
+	    R7xx_SQ_OP2_INST_PRED_SETE_64                 = 0x7d,
+	    R7xx_SQ_OP2_INST_PRED_SETGE_64                = 0x7e,
+//  SQ_ALU_WORD1_OP3                                      = 0x00008dfc,
+//	SRC2_SEL_mask                                     = 0x1ff << 0,
+//	    R7xx_SQ_ALU_SRC_1_DBL_L                       = 0xf4,
+//	    R7xx_SQ_ALU_SRC_1_DBL_M                       = 0xf5,
+//	    R7xx_SQ_ALU_SRC_0_5_DBL_L                     = 0xf6,
+//	    R7xx_SQ_ALU_SRC_0_5_DBL_M                     = 0xf7,
+// 	SQ_ALU_WORD1_OP3__ALU_INST_mask                   = 0x1f << 13,
+	    R7xx_SQ_OP3_INST_MULADD_64                    = 0x08,
+	    R7xx_SQ_OP3_INST_MULADD_64_M2                 = 0x09,
+	    R7xx_SQ_OP3_INST_MULADD_64_M4                 = 0x0a,
+	    R7xx_SQ_OP3_INST_MULADD_64_D2                 = 0x0b,
+//  SQ_CF_ALU_WORD1                                       = 0x00008dfc,
+	R6xx_USES_WATERFALL_bit                           = 1 << 25,
+	R7xx_SQ_CF_ALU_WORD1__ALT_CONST_bit               = 1 << 25,	/* same bit position as R6xx_USES_WATERFALL_bit */
+//  SQ_CF_ALLOC_EXPORT_WORD0                              = 0x00008dfc,
+//	ARRAY_BASE_mask                                   = 0x1fff << 0,
+//	TYPE_mask                                         = 0x03 << 13,
+//	    SQ_EXPORT_PARAM                               = 0x02,
+//	    X_UNUSED_FOR_SX_EXPORTS                       = 0x03,
+//	ELEM_SIZE_mask                                    = 0x03 << 30,
+//  SQ_CF_ALLOC_EXPORT_WORD1                              = 0x00008dfc,
+//	SQ_CF_ALLOC_EXPORT_WORD1__CF_INST_mask            = 0x7f << 23,
+	    R7xx_SQ_CF_INST_MEM_EXPORT                    = 0x3a,
+//  SQ_CF_WORD1                                           = 0x00008dfc,
+//	SQ_CF_WORD1__COUNT_mask                           = 0x07 << 10,
+	R7xx_COUNT_3_bit                                  = 1 << 19,
+//	SQ_CF_WORD1__CF_INST_mask                         = 0x7f << 23,
+	    R7xx_SQ_CF_INST_END_PROGRAM                   = 0x19,
+	    R7xx_SQ_CF_INST_WAIT_ACK                      = 0x1a,
+	    R7xx_SQ_CF_INST_TEX_ACK                       = 0x1b,
+	    R7xx_SQ_CF_INST_VTX_ACK                       = 0x1c,
+	    R7xx_SQ_CF_INST_VTX_TC_ACK                    = 0x1d,
+//  SQ_VTX_WORD0                                          = 0x00008dfc,
+//	VTX_INST_mask                                     = 0x1f << 0,
+	    R7xx_SQ_VTX_INST_MEM                          = 0x02,
+//  SQ_VTX_WORD2                                          = 0x00008dfc,
+	R7xx_SQ_VTX_WORD2__ALT_CONST_bit                  = 1 << 20,
+
+//  SQ_TEX_WORD0                                          = 0x00008dfc,
+//	TEX_INST_mask                                     = 0x1f << 0,
+	    R7xx_X_MEMORY_READ                            = 0x02,
+	    R7xx_SQ_TEX_INST_KEEP_GRADIENTS               = 0x0a,
+	    R7xx_X_FETCH4_LOAD4_INSTRUCTION_FOR_DX10_1    = 0x0f,
+	R7xx_SQ_TEX_WORD0__ALT_CONST_bit                  = 1 << 24,
+
+    R7xx_PA_SC_EDGERULE                                   = 0x00028230,
+    R7xx_SPI_THREAD_GROUPING                              = 0x000286c8,
+	PS_GROUPING_mask                                  = 0x1f << 0,
+	PS_GROUPING_shift                                 = 0,
+	VS_GROUPING_mask                                  = 0x1f << 8,
+	VS_GROUPING_shift                                 = 8,
+	GS_GROUPING_mask                                  = 0x1f << 16,
+	GS_GROUPING_shift                                 = 16,
+	ES_GROUPING_mask                                  = 0x1f << 24,
+	ES_GROUPING_shift                                 = 24,
+    R7xx_CB_SHADER_CONTROL                                = 0x000287a0,
+	RT0_ENABLE_bit                                    = 1 << 0,
+	RT1_ENABLE_bit                                    = 1 << 1,
+	RT2_ENABLE_bit                                    = 1 << 2,
+	RT3_ENABLE_bit                                    = 1 << 3,
+	RT4_ENABLE_bit                                    = 1 << 4,
+	RT5_ENABLE_bit                                    = 1 << 5,
+	RT6_ENABLE_bit                                    = 1 << 6,
+	RT7_ENABLE_bit                                    = 1 << 7,
+//  DB_ALPHA_TO_MASK                                      = 0x00028d44,
+	R7xx_OFFSET_ROUND_bit                             = 1 << 16,
+//  SQ_TEX_SAMPLER_MISC_0                                 = 0x0003d03c,
+	R7xx_TRUNCATE_COORD_bit                           = 1 << 9,
+	R7xx_DISABLE_CUBE_WRAP_bit                        = 1 << 10,
+
+} ;
+
+#endif /* _R600_REG_R7xx_H_ */
diff --git a/src/mesa/drivers/dri/r600/r600_tex.c b/src/mesa/drivers/dri/r600/r600_tex.c
new file mode 100644
index 0000000000..d105b90cd1
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_tex.c
@@ -0,0 +1,440 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "main/glheader.h"
+#include "main/imports.h"
+#include "main/colormac.h"
+#include "main/context.h"
+#include "main/enums.h"
+#include "main/image.h"
+#include "main/mipmap.h"
+#include "main/simple_list.h"
+#include "main/texformat.h"
+#include "main/texstore.h"
+#include "main/teximage.h"
+#include "main/texobj.h"
+
+#include "texmem.h"
+
+#include "r600_context.h"
+#include "r700_state.h"
+#include "radeon_mipmap_tree.h"
+#include "r600_tex.h"
+
+#include "xmlpool.h"
+
+
+static unsigned int translate_wrap_mode(GLenum wrapmode)
+{
+	/* Map a GL wrap enum onto the SQ_TEX_* clamp code; unknown enums log an error and yield 0. */
+	if (wrapmode == GL_REPEAT) return SQ_TEX_WRAP;
+	if (wrapmode == GL_CLAMP) return SQ_TEX_CLAMP_HALF_BORDER;
+	if (wrapmode == GL_CLAMP_TO_EDGE) return SQ_TEX_CLAMP_LAST_TEXEL;
+	if (wrapmode == GL_CLAMP_TO_BORDER) return SQ_TEX_CLAMP_BORDER;
+	if (wrapmode == GL_MIRRORED_REPEAT) return SQ_TEX_MIRROR;
+	if (wrapmode == GL_MIRROR_CLAMP_EXT) return SQ_TEX_MIRROR_ONCE_HALF_BORDER;
+	if (wrapmode == GL_MIRROR_CLAMP_TO_EDGE_EXT) return SQ_TEX_MIRROR_ONCE_LAST_TEXEL;
+	if (wrapmode == GL_MIRROR_CLAMP_TO_BORDER_EXT) return SQ_TEX_MIRROR_ONCE_BORDER;
+
+	/* Not one of the wrap enums handled above. */
+	radeon_error("bad wrap mode in %s", __FUNCTION__);
+	return 0;
+}
+
+
+/**
+ * Program the clamp fields of the cached SQ_TEX_SAMPLER0 word from the GL
+ * wrap modes: S always, T unless the target is 1D, R only for 3D targets.
+ *
+ * \param t Texture object whose wrap modes are to be set
+ */
+static void r600UpdateTexWrap(radeonTexObjPtr t)
+{
+	struct gl_texture_object *obj = &t->base;
+
+	/* S coordinate: programmed for every target. */
+	SETfield(t->SQ_TEX_SAMPLER0, translate_wrap_mode(obj->WrapS),
+		 SQ_TEX_SAMPLER_WORD0_0__CLAMP_X_shift, SQ_TEX_SAMPLER_WORD0_0__CLAMP_X_mask);
+	if (obj->Target == GL_TEXTURE_1D)
+		return;
+
+	SETfield(t->SQ_TEX_SAMPLER0, translate_wrap_mode(obj->WrapT), CLAMP_Y_shift, CLAMP_Y_mask);
+	if (obj->Target == GL_TEXTURE_3D)
+		SETfield(t->SQ_TEX_SAMPLER0, translate_wrap_mode(obj->WrapR),
+			 CLAMP_Z_shift, CLAMP_Z_mask);
+}
+
+static void r600SetTexDefaultState(radeonTexObjPtr t)
+{
+        /* Reset every cached resource and sampler word of t to its default value. */
+        t->SQ_TEX_RESOURCE0              = 0;
+        SETfield(t->SQ_TEX_RESOURCE0, SQ_TEX_DIM_2D, DIM_shift, DIM_mask);
+        SETfield(t->SQ_TEX_RESOURCE0, ARRAY_LINEAR_GENERAL,
+                 SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_shift, SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_mask);
+        CLEARbit(t->SQ_TEX_RESOURCE0, TILE_TYPE_bit);
+
+        t->SQ_TEX_RESOURCE1                = 0;
+        SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+                 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+        t->SQ_TEX_RESOURCE2                = 0;
+        t->SQ_TEX_RESOURCE3                = 0;
+
+        t->SQ_TEX_RESOURCE4                   = 0;
+        SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_UNSIGNED,
+                 FORMAT_COMP_X_shift, FORMAT_COMP_X_mask);
+        SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_UNSIGNED,
+                 FORMAT_COMP_Y_shift, FORMAT_COMP_Y_mask);
+        SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_UNSIGNED,
+                 FORMAT_COMP_Z_shift, FORMAT_COMP_Z_mask);
+        SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_UNSIGNED,
+                 FORMAT_COMP_W_shift, FORMAT_COMP_W_mask);
+        SETfield(t->SQ_TEX_RESOURCE4, SQ_NUM_FORMAT_NORM,
+                 SQ_TEX_RESOURCE_WORD4_0__NUM_FORMAT_ALL_shift, SQ_TEX_RESOURCE_WORD4_0__NUM_FORMAT_ALL_mask);
+        CLEARbit(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__SRF_MODE_ALL_bit);
+        CLEARbit(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit);
+        SETfield(t->SQ_TEX_RESOURCE4, SQ_ENDIAN_NONE,
+                 SQ_TEX_RESOURCE_WORD4_0__ENDIAN_SWAP_shift, SQ_TEX_RESOURCE_WORD4_0__ENDIAN_SWAP_mask);
+        SETfield(t->SQ_TEX_RESOURCE4, 1, REQUEST_SIZE_shift, REQUEST_SIZE_mask);
+        SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,	/* identity swizzle: X, Y, Z, W select X, Y, Z, W */
+		 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift,
+		 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+		 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift,
+		 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+		 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift,
+		 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+		 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift,
+		 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+        SETfield(t->SQ_TEX_RESOURCE4, 0, BASE_LEVEL_shift, BASE_LEVEL_mask); /* base mip level 0 */
+
+        t->SQ_TEX_RESOURCE5 = 0;
+        t->SQ_TEX_RESOURCE6 = 0;
+
+        SETfield(t->SQ_TEX_RESOURCE6, SQ_TEX_VTX_VALID_TEXTURE,
+                 SQ_TEX_RESOURCE_WORD6_0__TYPE_shift, SQ_TEX_RESOURCE_WORD6_0__TYPE_mask);
+
+        /* Initialize sampler registers: wrap on all three axes, point min/mag filters, no Z or mip filter */
+        t->SQ_TEX_SAMPLER0                           = 0;
+        SETfield(t->SQ_TEX_SAMPLER0, SQ_TEX_WRAP, SQ_TEX_SAMPLER_WORD0_0__CLAMP_X_shift,
+		 SQ_TEX_SAMPLER_WORD0_0__CLAMP_X_mask);
+        SETfield(t->SQ_TEX_SAMPLER0, SQ_TEX_WRAP, CLAMP_Y_shift, CLAMP_Y_mask);
+        SETfield(t->SQ_TEX_SAMPLER0, SQ_TEX_WRAP, CLAMP_Z_shift, CLAMP_Z_mask);
+        SETfield(t->SQ_TEX_SAMPLER0, SQ_TEX_XY_FILTER_POINT, XY_MAG_FILTER_shift, XY_MAG_FILTER_mask);
+        SETfield(t->SQ_TEX_SAMPLER0, SQ_TEX_XY_FILTER_POINT, XY_MIN_FILTER_shift, XY_MIN_FILTER_mask);
+        SETfield(t->SQ_TEX_SAMPLER0, SQ_TEX_Z_FILTER_NONE, Z_FILTER_shift, Z_FILTER_mask);
+        SETfield(t->SQ_TEX_SAMPLER0, SQ_TEX_Z_FILTER_NONE, MIP_FILTER_shift, MIP_FILTER_mask); /* NOTE(review): Z-filter constant written to the MIP field; presumably both "none" values are 0 -- confirm */
+        SETfield(t->SQ_TEX_SAMPLER0, SQ_TEX_BORDER_COLOR_TRANS_BLACK, BORDER_COLOR_TYPE_shift, BORDER_COLOR_TYPE_mask);
+
+        t->SQ_TEX_SAMPLER1                           = 0;
+        SETfield(t->SQ_TEX_SAMPLER1, 0x3ff, MAX_LOD_shift, MAX_LOD_mask);
+
+        t->SQ_TEX_SAMPLER2                          = 0;
+        SETbit(t->SQ_TEX_SAMPLER2, SQ_TEX_SAMPLER_WORD2_0__TYPE_bit);
+}
+
+
+#if 0
+/* Compiled out.  Returns the R300_TX_MAX_ANISO_* value for the highest of the
+ * thresholds 16, 8, 4, 2 that anisotropy reaches, or the 1_TO_1 value otherwise. */
+static GLuint aniso_filter(GLfloat anisotropy)
+{
+	if (anisotropy >= 16.0)
+		return R300_TX_MAX_ANISO_16_TO_1;
+	if (anisotropy >= 8.0)
+		return R300_TX_MAX_ANISO_8_TO_1;
+	if (anisotropy >= 4.0)
+		return R300_TX_MAX_ANISO_4_TO_1;
+	if (anisotropy >= 2.0)
+		return R300_TX_MAX_ANISO_2_TO_1;
+
+	return R300_TX_MAX_ANISO_1_TO_1;
+}
+#endif
+
+/**
+ * Set the texture magnification and minification modes in SQ_TEX_SAMPLER0.
+ * Also clears t->validated; unrecognised minf/magf values leave the fields as they are.
+ * \param t Texture whose filter modes are to be set
+ * \param minf Texture minification mode
+ * \param magf Texture magnification mode
+ * \param anisotropy Maximum anisotropy level
+ */
+static void r600SetTexFilter(radeonTexObjPtr t, GLenum minf, GLenum magf, GLfloat anisotropy)
+{
+	/* Force revalidation to account for switches from/to mipmapping. */
+	t->validated = GL_FALSE;
+
+	/* Note that EXT_texture_filter_anisotropic is extremely vague about
+	 * how anisotropic filtering interacts with the "normal" filter modes.
+	 * When anisotropic filtering is enabled, we override min and mag
+	 * filter settings completely. This includes driconf's settings.
+	 */
+	if (anisotropy >= 2.0 && (minf != GL_NEAREST) && (magf != GL_NEAREST)) {
+		/*t->pp_txfilter |= R300_TX_MAG_FILTER_ANISO
+			| R300_TX_MIN_FILTER_ANISO
+			| R300_TX_MIN_FILTER_MIP_LINEAR
+			| aniso_filter(anisotropy);*/
+		radeon_print(RADEON_TEXTURE, RADEON_NORMAL, "Using maximum anisotropy of %f\n", anisotropy);
+		return;	/* NOTE(review): returns before any SQ_TEX_SAMPLER0 filter field is written, so earlier filter bits stay in effect -- confirm this is intended */
+	}
+
+	switch (minf) {
+	case GL_NEAREST:
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_XYFilter_Point,
+			 XY_MIN_FILTER_shift, XY_MIN_FILTER_mask);
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_MipFilter_None,
+			 MIP_FILTER_shift, MIP_FILTER_mask);
+		break;
+	case GL_LINEAR:
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_XYFilter_Linear,
+			 XY_MIN_FILTER_shift, XY_MIN_FILTER_mask);
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_MipFilter_None,
+			 MIP_FILTER_shift, MIP_FILTER_mask);
+		break;
+	case GL_NEAREST_MIPMAP_NEAREST:
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_XYFilter_Point,
+			 XY_MIN_FILTER_shift, XY_MIN_FILTER_mask);
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_MipFilter_Point,
+			 MIP_FILTER_shift, MIP_FILTER_mask);
+		break;
+	case GL_NEAREST_MIPMAP_LINEAR:
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_XYFilter_Point,
+			 XY_MIN_FILTER_shift, XY_MIN_FILTER_mask);
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_MipFilter_Linear,
+			 MIP_FILTER_shift, MIP_FILTER_mask);
+		break;
+	case GL_LINEAR_MIPMAP_NEAREST:
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_XYFilter_Linear,
+			 XY_MIN_FILTER_shift, XY_MIN_FILTER_mask);
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_MipFilter_Point,
+			 MIP_FILTER_shift, MIP_FILTER_mask);
+		break;
+	case GL_LINEAR_MIPMAP_LINEAR:
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_XYFilter_Linear,
+			 XY_MIN_FILTER_shift, XY_MIN_FILTER_mask);
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_MipFilter_Linear,
+			 MIP_FILTER_shift, MIP_FILTER_mask);
+		break;
+	}	/* no default: any other minf leaves the min and mip fields untouched */
+
+	/* Note we don't have 3D mipmaps so only use the mag filter setting
+	 * to set the 3D texture filter mode.
+	 */
+	switch (magf) {
+	case GL_NEAREST:
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_XYFilter_Point,
+			 XY_MAG_FILTER_shift, XY_MAG_FILTER_mask);
+		break;
+	case GL_LINEAR:
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_XYFilter_Linear,
+			 XY_MAG_FILTER_shift, XY_MAG_FILTER_mask);
+		break;
+	}	/* no default: any other magf leaves the mag field untouched */
+}
+
+static void r600SetTexBorderColor(radeonTexObjPtr t, const GLfloat color[4])
+{	/* Store the raw float bits of the border colour and select the register as the border source. */
+	union { GLfloat f[4]; uint32_t u[4]; } bits = { { color[0], color[1], color[2], color[3] } };	/* pun via a union, not a uint32_t* cast */
+	t->TD_PS_SAMPLER0_BORDER_ALPHA = bits.u[3];
+	t->TD_PS_SAMPLER0_BORDER_RED = bits.u[2];	/* NOTE(review): RED takes color[2] and BLUE takes color[0], kept as in the original -- confirm the swap is intended */
+	t->TD_PS_SAMPLER0_BORDER_GREEN = bits.u[1];
+	t->TD_PS_SAMPLER0_BORDER_BLUE = bits.u[0];
+	SETfield(t->SQ_TEX_SAMPLER0, SQ_TEX_BORDER_COLOR_REGISTER, BORDER_COLOR_TYPE_shift, BORDER_COLOR_TYPE_mask);
+}
+
+/**
+ * Refresh the cached sampler/resource state affected by a texture parameter
+ * change; the new values are read from texObj rather than from params.
+ *
+ * \param texObj texture object whose parameter changed
+ * \param pname  the parameter that changed
+ * ctx, target and params are not used here.
+ */
+
+static void r600TexParameter(GLcontext * ctx, GLenum target,
+			     struct gl_texture_object *texObj,
+			     GLenum pname, const GLfloat * params)
+{
+	radeonTexObj *tex = radeon_tex_obj(texObj);
+	const struct gl_texture_image *base;
+
+	radeon_print(RADEON_STATE | RADEON_TEXTURE, RADEON_VERBOSE,
+			"%s( %s )\n", __FUNCTION__,
+			_mesa_lookup_enum_by_nr(pname));
+
+	switch (pname) {
+	case GL_TEXTURE_MIN_FILTER:
+	case GL_TEXTURE_MAG_FILTER:
+	case GL_TEXTURE_MAX_ANISOTROPY_EXT:
+		r600SetTexFilter(tex, texObj->MinFilter, texObj->MagFilter, texObj->MaxAnisotropy);
+		return;
+
+	case GL_TEXTURE_WRAP_S:
+	case GL_TEXTURE_WRAP_T:
+	case GL_TEXTURE_WRAP_R:
+		r600UpdateTexWrap(tex);
+		return;
+
+	case GL_TEXTURE_BORDER_COLOR:
+		r600SetTexBorderColor(tex, texObj->BorderColor);
+		return;
+
+	case GL_TEXTURE_BASE_LEVEL:
+	case GL_TEXTURE_MAX_LEVEL:
+	case GL_TEXTURE_MIN_LOD:
+	case GL_TEXTURE_MAX_LOD:
+		/* There is no LOD clamping here, so a clamped LOD is simulated
+		 * by loading the right subset of mipmap levels: drop the
+		 * miptree (if any) and force revalidation.
+		 */
+		if (!tex->mt)
+			return;
+		radeon_miptree_unreference(tex->mt);
+		tex->mt = 0;
+		tex->validated = GL_FALSE;
+		return;
+
+	case GL_DEPTH_TEXTURE_MODE:
+		/* Only depth textures have hardware state that depends on
+		 * this mode; anything else (or a missing base image) is a
+		 * no-op.
+		 */
+		base = texObj->Image[0][texObj->BaseLevel];
+		if (base && base->TexFormat->BaseFormat == GL_DEPTH_COMPONENT)
+			r600SetDepthTexMode(texObj);
+		return;
+
+	default:
+		/* Not handled by this hook. */
+		return;
+	}
+}
+
+static void r600DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
+{
+	/* Release the driver-side references held for texObj, then hand it to core Mesa for deletion. */
+	context_t *r700 = R700_CONTEXT(ctx);
+	radeonTexObj *tex = radeon_tex_obj(texObj);
+	radeon_print(RADEON_STATE | RADEON_TEXTURE, RADEON_NORMAL,
+		"%s( %p (target = %s) )\n", __FUNCTION__,
+			(void *)texObj,
+			_mesa_lookup_enum_by_nr(texObj->Target));
+
+	if (r700) {
+		int unit = 0;
+		/* Call radeon_firevertices first, then clear any hw.textures slot that points at this texture. */
+		radeon_firevertices(&r700->radeon);
+		while (unit < R700_MAX_TEXTURE_UNITS) {
+			if (r700->hw.textures[unit] == tex)
+				r700->hw.textures[unit] = 0;
+			unit++;
+		}
+	}
+	if (tex->bo) {
+		radeon_bo_unref(tex->bo);
+		tex->bo = NULL;
+	}
+	if (tex->mt) {
+		radeon_miptree_unreference(tex->mt);
+		tex->mt = 0;
+	}
+	_mesa_delete_texture_object(ctx, texObj);
+}
+
+/**
+ * Allocate and initialise a new texture object.
+ * Called via ctx->Driver.NewTextureObject, including during context
+ * creation to allocate the default texture objects.
+ * MaxAnisotropy is seeded from the context's initialMaxAnisotropy.
+ * \return the embedded gl_texture_object, or NULL when allocation fails.
+ */
+static struct gl_texture_object *r600NewTextureObject(GLcontext * ctx,
+						      GLuint name,
+						      GLenum target)
+{
+	context_t* rmesa = R700_CONTEXT(ctx);
+	radeonTexObj* t = CALLOC_STRUCT(radeon_tex_obj);
+	if (!t)	/* allocation failed: report it to the caller instead of dereferencing NULL below */
+		return NULL;
+	radeon_print(RADEON_STATE | RADEON_TEXTURE, RADEON_NORMAL,
+		"%s( %p (target = %s) )\n", __FUNCTION__,
+			(void *)t, _mesa_lookup_enum_by_nr(target));
+
+	_mesa_initialize_texture_object(&t->base, name, target);
+	t->base.MaxAnisotropy = rmesa->radeon.initialMaxAnisotropy;
+
+	/* Initialize hardware state */
+	r600SetTexDefaultState(t);
+	r600UpdateTexWrap(t);
+	r600SetTexFilter(t, t->base.MinFilter, t->base.MagFilter, t->base.MaxAnisotropy);
+	r600SetTexBorderColor(t, t->base.BorderColor);
+
+	return &t->base;
+}
+
+void r600InitTextureFuncs(struct dd_function_table *functions)
+{
+	/* Only the hooks this driver implements are overridden here;
+	 * _mesa_init_driver_functions() was already called for the rest.
+	 */
+	/* Texture object lifetime, parameters and residency. */
+	functions->NewTextureObject = r600NewTextureObject;
+	functions->DeleteTexture = r600DeleteTexture;
+	functions->TexParameter = r600TexParameter;
+	functions->IsTextureResident = driIsTextureResident;
+
+	/* Image storage, mapping and format selection (radeon* helpers). */
+	functions->NewTextureImage = radeonNewTextureImage;
+	functions->FreeTexImageData = radeonFreeTexImageData;
+	functions->MapTexture = radeonMapTexture;
+	functions->UnmapTexture = radeonUnmapTexture;
+	functions->ChooseTextureFormat = radeonChooseTextureFormat_mesa;
+
+	/* Image upload, read-back and mipmap generation. */
+	functions->TexImage1D = radeonTexImage1D;
+	functions->TexImage2D = radeonTexImage2D;
+	functions->TexImage3D = radeonTexImage3D;
+	functions->TexSubImage1D = radeonTexSubImage1D;
+	functions->TexSubImage2D = radeonTexSubImage2D;
+	functions->TexSubImage3D = radeonTexSubImage3D;
+	functions->CompressedTexImage2D = radeonCompressedTexImage2D;
+	functions->CompressedTexSubImage2D = radeonCompressedTexSubImage2D;
+	functions->GetTexImage = radeonGetTexImage;
+	functions->GetCompressedTexImage = radeonGetCompressedTexImage;
+	functions->GenerateMipmap = radeonGenerateMipmap;
+	driInitTextureFormats();
+}
diff --git a/src/mesa/drivers/dri/r300/radeon_ioctl.h b/src/mesa/drivers/dri/r600/r600_tex.h
index 3add775b82..fb0e1a023e 100644
--- a/src/mesa/drivers/dri/r300/radeon_ioctl.h
+++ b/src/mesa/drivers/dri/r600/r600_tex.h
@@ -32,26 +32,32 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *   Keith Whitwell <keith@tungstengraphics.com>
  */
 
-#ifndef __RADEON_IOCTL_H__
-#define __RADEON_IOCTL_H__
-
-#include "main/simple_list.h"
-#include "radeon_dri.h"
-#include "radeon_lock.h"
-
-#include "xf86drm.h"
-#include "drm.h"
-#if 0
-#include "r200context.h"
-#endif
-#include "radeon_drm.h"
-
-extern void radeonCopyBuffer(__DRIdrawablePrivate * drawable,
-			     const drm_clip_rect_t	* rect);
-extern void radeonPageFlip(__DRIdrawablePrivate * drawable);
-extern void radeonFlush(GLcontext * ctx);
-extern void radeonFinish(GLcontext * ctx);
-extern void radeonWaitForIdleLocked(radeonContextPtr radeon);
-extern uint32_t radeonGetAge(radeonContextPtr radeon);
-
-#endif				/* __RADEON_IOCTL_H__ */
+#ifndef __r600_TEX_H__
+#define __r600_TEX_H__
+
+/* TODO : review this after texture load code. */
+#define R700_BLIT_WIDTH_BYTES 1024
+/* The BASE_ADDRESS and MIP_ADDRESS fields are 256-byte-aligned, so the mask is 256 - 1 (0xff, not 0x255). */
+#define R700_TEXTURE_ALIGNMENT_MASK     0xff
+/* Texel pitch is aligned to 8, so the mask is 8 - 1. */
+#define R700_TEXEL_PITCH_ALIGNMENT_MASK 0x7
+
+#define R700_MAX_TEXTURE_UNITS 8 /* TODO : should be 16, lets make it work, review later */
+
+extern void r600SetDepthTexMode(struct gl_texture_object *tObj);
+
+extern void r600SetTexBuffer(__DRIcontext *pDRICtx, GLint target,
+			     __DRIdrawable *dPriv);
+
+extern void r600SetTexBuffer2(__DRIcontext *pDRICtx, GLint target,
+			      GLint format, __DRIdrawable *dPriv);
+
+extern void r600SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
+			     unsigned long long offset, GLint depth,
+			     GLuint pitch);
+
+extern GLboolean r600ValidateBuffers(GLcontext * ctx);
+
+extern void r600InitTextureFuncs(struct dd_function_table *functions);
+
+#endif				/* __r600_TEX_H__ */
diff --git a/src/mesa/drivers/dri/r600/r600_texstate.c b/src/mesa/drivers/dri/r600/r600_texstate.c
new file mode 100644
index 0000000000..7d7e77d355
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_texstate.c
@@ -0,0 +1,1003 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ *
+ * \todo Enable R300 texture tiling code?
+ */
+
+#include "main/glheader.h"
+#include "main/imports.h"
+#include "main/context.h"
+#include "main/macros.h"
+#include "main/texformat.h"
+#include "main/teximage.h"
+#include "main/texobj.h"
+#include "main/enums.h"
+#include "main/simple_list.h"
+
+#include "r600_context.h"
+#include "r700_state.h"
+#include "radeon_mipmap_tree.h"
+#include "r600_tex.h"
+#include "r700_fragprog.h"
+#include "r700_vertprog.h"
+
+void r600UpdateTextureState(GLcontext * ctx);
+
+void r600UpdateTextureState(GLcontext * ctx)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *chip = (R700_CHIP_CONTEXT*)(&context->hw);
+	GLuint i;
+
+	/* Flag the tx, tx_smplr and tx_brdr_clr state as changed. */
+	R600_STATECHANGE(context, tx);
+	R600_STATECHANGE(context, tx_smplr);
+	R600_STATECHANGE(context, tx_brdr_clr);
+
+	/* Rebuild the per-unit table: a slot holds the unit's current
+	 * texture object while the unit is enabled, NULL otherwise. */
+	for (i = 0; i < R700_MAX_TEXTURE_UNITS; i++) {
+		struct gl_texture_unit *unitState = &ctx->Texture.Unit[i];
+		struct radeon_tex_obj *obj = radeon_tex_obj(unitState->_Current);
+
+		if (unitState->_ReallyEnabled)
+			chip->textures[i] = obj;
+		else
+			chip->textures[i] = NULL;
+	}
+}
+
+/**
+ * Program the cached texture resource words for a Mesa texture format.
+ *
+ * Writes DATA_FORMAT in SQ_TEX_RESOURCE1 and the DST_SEL_X/Y/Z/W swizzle
+ * in SQ_TEX_RESOURCE4; the sRGB formats additionally set FORCE_DEGAMMA.
+ *
+ * \param tObj        texture object whose cached words are updated
+ * \param mesa_format MESA_FORMAT_* value describing the texel layout
+ * \return GL_TRUE when the format was programmed, GL_FALSE when the format
+ *         (or, for depth formats, tObj->DepthMode) is not supported
+ */
+static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, GLuint mesa_format)
+{
+	radeonTexObj *t = radeon_tex_obj(tObj);
+
+	/* Drop state left by a previous format: the swizzle, and the degamma
+	 * flag that only the sRGB cases below set again. */
+	CLEARfield(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	CLEARfield(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	CLEARfield(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	CLEARfield(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+	CLEARbit(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit);
+
+	switch (mesa_format) /* This is mesa format. */
+	{
+	case MESA_FORMAT_RGBA8888:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_RGBA8888_REV:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_ARGB8888:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_ARGB8888_REV:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_RGB888:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_RGB565:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_5_6_5,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_RGB565_REV:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_5_6_5,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_ARGB4444:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_4_4_4_4,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_ARGB4444_REV:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_4_4_4_4,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_ARGB1555:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_1_5_5_5,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_ARGB1555_REV:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_1_5_5_5,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_AL88:
+	case MESA_FORMAT_AL88_REV: /* TODO : Check this. */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_RGB332:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_3_3_2,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_A8: /* ZERO, ZERO, ZERO, X */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_0,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_0,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_0,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_L8: /* X, X, X, ONE */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_I8: /* X, X, X, X */
+	case MESA_FORMAT_CI8:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+		/* TODO: YUV422 (MESA_FORMAT_YCBCR) and VUY422 (MESA_FORMAT_YCBCR_REV)
+		 * still need a conversion; noted swizzle/format for both:
+		 * X, Y, Z, ONE, G8R8_G8B8.  Until then they hit the default case. */
+	/* Compressed formats are not programmed yet; report them as
+	 * unsupported instead of returning success with nothing set up. */
+	case MESA_FORMAT_RGB_DXT1:
+	case MESA_FORMAT_RGBA_DXT1:
+	case MESA_FORMAT_RGBA_DXT3:
+	case MESA_FORMAT_RGBA_DXT5:
+		return GL_FALSE;
+	case MESA_FORMAT_RGBA_FLOAT32:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_32_32_32_32_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_RGBA_FLOAT16:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_16_16_16_16_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_RGB_FLOAT32: /* X, Y, Z, ONE */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_32_32_32_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_RGB_FLOAT16:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_16_16_16_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_ALPHA_FLOAT32: /* ZERO, ZERO, ZERO, X */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_32_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_0,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_0,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_0,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_ALPHA_FLOAT16: /* ZERO, ZERO, ZERO, X */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_16_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_0,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_0,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_0,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_LUMINANCE_FLOAT32: /* X, X, X, ONE */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_32_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_LUMINANCE_FLOAT16: /* X, X, X, ONE */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_16_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT32:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_32_32_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT16:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_16_16_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_INTENSITY_FLOAT32: /* X, X, X, X */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_32_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_INTENSITY_FLOAT16: /* X, X, X, X */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_16_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		break;
+	case MESA_FORMAT_Z16:
+	case MESA_FORMAT_Z24_S8:
+	case MESA_FORMAT_Z32:
+		switch (mesa_format) {
+		case MESA_FORMAT_Z16:
+			SETfield(t->SQ_TEX_RESOURCE1, FMT_16,
+				 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+			break;
+		case MESA_FORMAT_Z24_S8:
+			SETfield(t->SQ_TEX_RESOURCE1, FMT_24_8,
+				 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+			break;
+		case MESA_FORMAT_Z32:
+			SETfield(t->SQ_TEX_RESOURCE1, FMT_32,
+				 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+			break;
+		};
+		switch (tObj->DepthMode) {
+		case GL_LUMINANCE:  /* X, X, X, ONE */
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+			break;
+		case GL_INTENSITY:  /* X, X, X, X */
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+			break;
+		case GL_ALPHA:     /* ZERO, ZERO, ZERO, X */
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_0,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_0,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_0,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+			break;
+		default:
+			return GL_FALSE;
+		}
+		break;
+	/* EXT_texture_sRGB */
+	case MESA_FORMAT_SRGBA8:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		SETbit(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit);
+		break;
+	case MESA_FORMAT_SLA8:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		SETbit(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit);
+		break;
+	case MESA_FORMAT_SL8: /* X, X, X, ONE */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		SETbit(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit);
+		break;
+	default:
+		/* Not supported format */
+		return GL_FALSE;
+	};
+
+	return GL_TRUE;
+}
+
+/**
+ * Recompute tObj's cached format words from its base-level image.
+ * A NULL tObj or a missing base-level image is a no-op.
+ */
+void r600SetDepthTexMode(struct gl_texture_object *tObj)
+{
+	const struct gl_texture_image *baseImage =
+		tObj ? tObj->Image[0][tObj->BaseLevel] : NULL;
+	if (!baseImage)
+		return;
+	r600GetTexFormat(tObj, baseImage->TexFormat->MesaFormat);
+}
+
+/**
+ * Compute the cached hardware register values for the given texture object.
+ * Sets format/swizzle (unless the image is overridden), dimension, pitch,
+ * width, height and, for multi-level miptrees, the mip level range.
+ *
+ * \param rmesa  Context pointer
+ * \param texObj texture object whose cached resource words are updated
+ */
+static void setup_hardware_state(context_t *rmesa, struct gl_texture_object *texObj)
+{
+	radeonTexObj *t = radeon_tex_obj(texObj);
+	const struct gl_texture_image *firstImage;
+	int firstlevel = t->mt ? t->mt->firstLevel : 0;
+	GLuint uTexelPitch, row_align;
+
+	if (rmesa->radeon.radeonScreen->driScreen->dri2.enabled &&
+	    t->image_override && t->bo)
+		return;
+
+	firstImage = t->base.Image[0][firstlevel];
+	if (!t->image_override) {
+		if (!r600GetTexFormat(texObj, firstImage->TexFormat->MesaFormat)) {
+			radeon_error("unexpected texture format in %s\n", __FUNCTION__);
+			return;
+		}
+	}
+
+	switch (texObj->Target) {
+	case GL_TEXTURE_1D:
+		SETfield(t->SQ_TEX_RESOURCE0, SQ_TEX_DIM_1D, DIM_shift, DIM_mask);
+		SETfield(t->SQ_TEX_RESOURCE1, 0, TEX_DEPTH_shift, TEX_DEPTH_mask);
+		break;
+	case GL_TEXTURE_2D:
+	case GL_TEXTURE_RECTANGLE_NV:
+		SETfield(t->SQ_TEX_RESOURCE0, SQ_TEX_DIM_2D, DIM_shift, DIM_mask);
+		SETfield(t->SQ_TEX_RESOURCE1, 0, TEX_DEPTH_shift, TEX_DEPTH_mask);
+		break;
+	case GL_TEXTURE_3D:
+		SETfield(t->SQ_TEX_RESOURCE0, SQ_TEX_DIM_3D, DIM_shift, DIM_mask);
+		SETfield(t->SQ_TEX_RESOURCE1, firstImage->Depth - 1, /* ??? */
+			 TEX_DEPTH_shift, TEX_DEPTH_mask);
+		break;
+	case GL_TEXTURE_CUBE_MAP:
+		SETfield(t->SQ_TEX_RESOURCE0, SQ_TEX_DIM_CUBEMAP, DIM_shift, DIM_mask);
+		SETfield(t->SQ_TEX_RESOURCE1, 0, TEX_DEPTH_shift, TEX_DEPTH_mask);
+		break;
+	default:
+		radeon_error("unexpected texture target type in %s\n", __FUNCTION__);
+		return;
+	}
+
+	/* t->mt may be NULL (see firstlevel above); everything below reads it. */
+	if (!t->mt)
+		return;
+
+	row_align = rmesa->radeon.texture_row_align - 1;
+	uTexelPitch = ((firstImage->Width * t->mt->bpp + row_align) & ~row_align) / t->mt->bpp;
+	uTexelPitch = (uTexelPitch + R700_TEXEL_PITCH_ALIGNMENT_MASK) & ~R700_TEXEL_PITCH_ALIGNMENT_MASK;
+
+	/* min pitch is 8 */
+	if (uTexelPitch < 8)
+		uTexelPitch = 8;
+
+	SETfield(t->SQ_TEX_RESOURCE0, (uTexelPitch/8)-1, PITCH_shift, PITCH_mask);
+	SETfield(t->SQ_TEX_RESOURCE0, firstImage->Width - 1, TEX_WIDTH_shift, TEX_WIDTH_mask);
+	SETfield(t->SQ_TEX_RESOURCE1, firstImage->Height - 1, TEX_HEIGHT_shift, TEX_HEIGHT_mask);
+
+	if ((t->mt->lastLevel - t->mt->firstLevel) > 0) {
+		t->SQ_TEX_RESOURCE3 = t->mt->levels[0].size / 256;
+		SETfield(t->SQ_TEX_RESOURCE4, t->mt->firstLevel, BASE_LEVEL_shift, BASE_LEVEL_mask);
+		SETfield(t->SQ_TEX_RESOURCE5, t->mt->lastLevel, LAST_LEVEL_shift, LAST_LEVEL_mask);
+	}
+}
+
+/**
+ * Ensure the given texture is ready for rendering.
+ *
+ * Mostly this means populating the texture object's mipmap tree.
+ */
+static GLboolean r600_validate_texture(GLcontext * ctx, struct gl_texture_object *texObj)
+{
+	radeonTexObj *tex;
+
+	/* Nothing can be set up until the mipmap tree validates. */
+	if (!radeon_validate_texture_miptree(ctx, texObj))
+		return GL_FALSE;
+
+	/* Refresh the cached copy of the hardware registers for this
+	 * texture, then record that it went through validation. */
+	setup_hardware_state(R700_CONTEXT(ctx), texObj);
+	tex = radeon_tex_obj(texObj);
+	tex->validated = GL_TRUE;
+	return GL_TRUE;
+}
+
+/**
+ * Validate all enabled textures and register every buffer object the next
+ * draw will touch (color, depth, textures, shaders) with the command stream.
+ * \return GL_TRUE if the final space check succeeds, GL_FALSE otherwise.
+ */
+GLboolean r600ValidateBuffers(GLcontext * ctx)
+{
+	context_t *rmesa = R700_CONTEXT(ctx);
+	struct radeon_renderbuffer *rrb;
+	struct radeon_bo *pbo;
+	int i, ret;
+
+	radeon_cs_space_reset_bos(rmesa->radeon.cmdbuf.cs);
+
+	/* color buffer */
+	rrb = radeon_get_colorbuffer(&rmesa->radeon);
+	if (rrb && rrb->bo) {
+		radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+						  rrb->bo, 0,
+						  RADEON_GEM_DOMAIN_VRAM);
+	}
+
+	/* depth buffer */
+	rrb = radeon_get_depthbuffer(&rmesa->radeon);
+	if (rrb && rrb->bo) {
+		radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+						  rrb->bo, 0,
+						  RADEON_GEM_DOMAIN_VRAM);
+	}
+
+	for (i = 0; i < ctx->Const.MaxTextureImageUnits; ++i) {
+		radeonTexObj *t;
+
+		if (!ctx->Texture.Unit[i]._ReallyEnabled)
+			continue;
+
+		if (!r600_validate_texture(ctx, ctx->Texture.Unit[i]._Current)) {
+			radeon_warning("failed to validate texture for unit %d.\n", i);
+		}
+		t = radeon_tex_obj(ctx->Texture.Unit[i]._Current);
+		/* t->mt may still be NULL when validation failed; check before use. */
+		if (t->image_override && t->bo)
+			radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+							  t->bo,
+							  RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
+		else if (t->mt && t->mt->bo)
+			radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+							  t->mt->bo,
+							  RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
+	}
+
+	pbo = (struct radeon_bo *)r700GetActiveFpShaderBo(ctx);
+	if (pbo) {
+		radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, pbo,
+						  RADEON_GEM_DOMAIN_GTT, 0);
+	}
+
+	pbo = (struct radeon_bo *)r700GetActiveVpShaderBo(ctx);
+	if (pbo) {
+		radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, pbo,
+						  RADEON_GEM_DOMAIN_GTT, 0);
+	}
+
+	ret = radeon_cs_space_check_with_bo(rmesa->radeon.cmdbuf.cs, first_elem(&rmesa->radeon.dma.reserved)->bo, RADEON_GEM_DOMAIN_GTT, 0);
+	return ret ? GL_FALSE : GL_TRUE;
+}
+
+/**
+ * Redirect a texture object to a caller-supplied memory offset.
+ *
+ * The object is always flagged image_override.  For a non-zero offset its BO
+ * is replaced by a fake BO (sized by pitch) at that offset, and the cached
+ * data format, channel selects and pitch are derived from depth and pitch.
+ *
+ * \param depth  32 and 16 have their own layout; any other value is treated
+ *               like 24 (FMT_8_8_8_8 with W forced to 1)
+ * \param pitch  presumably the row pitch in bytes; divided by the texel size
+ */
+void r600SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
+		      unsigned long long offset, GLint depth, GLuint pitch)
+{
+	context_t *rmesa = pDRICtx->driverPrivate;
+	struct gl_texture_object *tObj =
+	    _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
+	radeonTexObjPtr t = radeon_tex_obj(tObj);
+	uint32_t data_format;     /* value for the DATA_FORMAT field */
+	uint32_t w_select;        /* source selected for the W channel */
+	uint32_t bytes_per_texel; /* divisor turning pitch into texels */
+	uint32_t texel_pitch;
+
+	if (!tObj)
+		return;
+
+	t->image_override = GL_TRUE;
+
+	if (!offset)
+		return;
+
+	/* Drop any BO the object still holds before wrapping the new offset. */
+	if (t->bo) {
+		radeon_bo_unref(t->bo);
+		t->bo = NULL;
+	}
+	t->bo = radeon_legacy_bo_alloc_fake(rmesa->radeon.radeonScreen->bom, pitch, offset);
+	t->override_offset = offset;
+
+	/* Only these three values differ between the supported depths. */
+	switch (depth) {
+	case 32:
+		data_format = FMT_8_8_8_8;
+		w_select = SQ_SEL_W;
+		bytes_per_texel = 4;
+		break;
+	case 16:
+		data_format = FMT_5_6_5;
+		w_select = SQ_SEL_1;
+		bytes_per_texel = 2;
+		break;
+	default: /* 24 and everything else */
+		data_format = FMT_8_8_8_8;
+		w_select = SQ_SEL_1;
+		bytes_per_texel = 4;
+		break;
+	}
+
+	SETfield(t->SQ_TEX_RESOURCE1, data_format,
+		 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+	/* X and Z are swapped for every depth; Y passes straight through. */
+	SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+		 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+	SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+		 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+	SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+		 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	SETfield(t->SQ_TEX_RESOURCE4, w_select,
+		 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+
+	texel_pitch = pitch / bytes_per_texel;
+	texel_pitch = (texel_pitch + R700_TEXEL_PITCH_ALIGNMENT_MASK)
+		& ~R700_TEXEL_PITCH_ALIGNMENT_MASK;
+
+	/* min pitch is 8 */
+	if (texel_pitch < 8)
+		texel_pitch = 8;
+
+	SETfield(t->SQ_TEX_RESOURCE0, (texel_pitch/8)-1, PITCH_shift, PITCH_mask);
+}
+
+void r600SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_format, __DRIdrawable *dPriv)
+{
+	struct gl_texture_unit *texUnit;
+	struct gl_texture_object *texObj;
+	struct gl_texture_image *texImage;
+	struct radeon_renderbuffer *rb;
+	radeon_texture_image *rImage;
+	radeonContextPtr radeon;
+	context_t *rmesa;
+	struct radeon_framebuffer *rfb;
+	radeonTexObjPtr t;
+	uint32_t pitch_val;
+	uint32_t internalFormat, type, format;
+	/* Bind color_rb[0] of dPriv as image level 0 of the texture bound to target. */
+	type = GL_BGRA; /* NOTE(review): GL_BGRA is a format enum; the type/format names look swapped */
+	format = GL_UNSIGNED_BYTE; /* NOTE(review): confirm against radeonChooseTextureFormat's parameter order */
+	internalFormat = (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT ? 3 : 4);
+
+	radeon = pDRICtx->driverPrivate;
+	rmesa = pDRICtx->driverPrivate; /* NOTE(review): rmesa is not read again in this function */
+
+	rfb = dPriv->driverPrivate;
+        texUnit = &radeon->glCtx->Texture.Unit[radeon->glCtx->Texture.CurrentUnit];
+	texObj = _mesa_select_tex_object(radeon->glCtx, texUnit, target);
+        texImage = _mesa_get_tex_image(radeon->glCtx, texObj, target, 0);
+
+	rImage = get_radeon_texture_image(texImage);
+	t = radeon_tex_obj(texObj);
+        if (t == NULL) {
+    	    return;
+    	}
+
+	radeon_update_renderbuffers(pDRICtx, dPriv);
+	/* The back and depth buffers are not needed here; release their BOs right away. */
+	rb = (void*)rfb->base.Attachment[BUFFER_DEPTH].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+        rb->bo = NULL;
+	}
+	rb = (void*)rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+		rb->bo = NULL;
+	}
+	rb = rfb->color_rb[0]; /* NOTE(review): rb itself is not checked for NULL before rb->bo */
+	if (rb->bo == NULL) {
+		/* No BO backs the color buffer, so there is nothing to bind. */
+		return;
+	}
+
+	_mesa_lock_texture(radeon->glCtx, texObj);
+	if (t->bo) { /* release whatever storage the object and image held before */
+		radeon_bo_unref(t->bo);
+		t->bo = NULL;
+	}
+	if (rImage->bo) {
+		radeon_bo_unref(rImage->bo);
+		rImage->bo = NULL;
+	}
+	if (t->mt) {
+		radeon_miptree_unreference(t->mt);
+		t->mt = NULL;
+	}
+	if (rImage->mt) {
+		radeon_miptree_unreference(rImage->mt);
+		rImage->mt = NULL;
+	}
+	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
+				   rb->base.Width, rb->base.Height, 1, 0, rb->cpp);
+	texImage->RowStride = rb->pitch / rb->cpp;
+	texImage->TexFormat = radeonChooseTextureFormat(radeon->glCtx,
+							internalFormat,
+							type, format, 0);
+	rImage->bo = rb->bo; /* image and object share the renderbuffer BO ... */
+	radeon_bo_ref(rImage->bo); /* ... and each takes its own reference */
+	t->bo = rb->bo;
+	radeon_bo_ref(t->bo);
+	t->image_override = GL_TRUE;
+	t->override_offset = 0;
+	pitch_val = rb->pitch;
+	switch (rb->cpp) {
+	case 4:
+		if (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT) { /* RGB: W is forced to 1 */
+			SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+				 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		} else { /* any other GLX format: W comes from the texel */
+			SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+				 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+			SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+				 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		}
+		pitch_val /= 4;
+		break;
+	case 3:
+	default:
+		// FMT_8_8_8 ???
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		pitch_val /= 4; /* NOTE(review): divides by 4 although this arm also covers cpp == 3 */
+		break;
+	case 2:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_5_6_5,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_1,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
+		pitch_val /= 2;
+		break;
+	}
+
+	pitch_val = (pitch_val + R700_TEXEL_PITCH_ALIGNMENT_MASK)
+		& ~R700_TEXEL_PITCH_ALIGNMENT_MASK;
+
+	/* min pitch is 8 */
+	if (pitch_val < 8)
+		pitch_val = 8;
+
+	SETfield(t->SQ_TEX_RESOURCE0, (pitch_val/8)-1, PITCH_shift, PITCH_mask); /* field holds pitch/8 - 1 */
+	SETfield(t->SQ_TEX_RESOURCE0, rb->base.Width - 1,
+		 TEX_WIDTH_shift, TEX_WIDTH_mask);
+	SETfield(t->SQ_TEX_RESOURCE1, rb->base.Height - 1,
+		 TEX_HEIGHT_shift, TEX_HEIGHT_mask);
+
+	t->validated = GL_TRUE;
+	_mesa_unlock_texture(radeon->glCtx, texObj);
+	return;
+}
+
+void r600SetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
+{
+        r600SetTexBuffer2(pDRICtx, target, GLX_TEXTURE_FORMAT_RGBA_EXT, dPriv); /* same as r600SetTexBuffer2 with the format fixed to RGBA */
+}
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.c b/src/mesa/drivers/dri/r600/r700_assembler.c
new file mode 100644
index 0000000000..fda6692725
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_assembler.c
@@ -0,0 +1,4118 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "main/mtypes.h"
+#include "main/imports.h"
+
+#include "radeon_debug.h"
+#include "r600_context.h"
+
+#include "r700_assembler.h"
+
+BITS addrmode_PVSDST(PVSDST * pPVSDST)
+{
+	return pPVSDST->addrmode0 | ((BITS)pPVSDST->addrmode1 << 1); /* addrmode0 is bit 0, addrmode1 is bit 1 of the result */
+}
+
+void setaddrmode_PVSDST(PVSDST * pPVSDST, BITS addrmode) 
+{
+	pPVSDST->addrmode0 = addrmode & 1;        /* bit 0 of the mode */
+	pPVSDST->addrmode1 = (addrmode >> 1) & 1; /* bit 1 of the mode; higher bits are ignored */
+}
+
+void nomask_PVSDST(PVSDST * pPVSDST) 
+{
+	pPVSDST->writex = pPVSDST->writey = pPVSDST->writez = pPVSDST->writew = 1; /* set all four per-component write flags */
+}
+
+BITS addrmode_PVSSRC(PVSSRC* pPVSSRC) 
+{
+	return pPVSSRC->addrmode0 | ((BITS)pPVSSRC->addrmode1 << 1); /* addrmode0 is bit 0, addrmode1 is bit 1 of the result */
+}
+
+void setaddrmode_PVSSRC(PVSSRC* pPVSSRC, BITS addrmode) 
+{
+	pPVSSRC->addrmode0 = addrmode & 1;        /* bit 0 of the mode */
+	pPVSSRC->addrmode1 = (addrmode >> 1) & 1; /* bit 1 of the mode; higher bits are ignored */
+}
+
+/* Broadcast one channel select to all four swizzle fields of the operand. */
+void setswizzle_PVSSRC(PVSSRC* pPVSSRC, BITS swz) 
+{
+	pPVSSRC->swizzlew = swz;
+	pPVSSRC->swizzlez = pPVSSRC->swizzlew;
+	pPVSSRC->swizzley = pPVSSRC->swizzlez;
+	pPVSSRC->swizzlex = pPVSSRC->swizzley;
+}
+
+void noswizzle_PVSSRC(PVSSRC* pPVSSRC) 
+{
+	pPVSSRC->swizzlex = SQ_SEL_X; /* identity swizzle: every channel selects itself */
+	pPVSSRC->swizzley = SQ_SEL_Y;
+	pPVSSRC->swizzlez = SQ_SEL_Z;
+	pPVSSRC->swizzlew = SQ_SEL_W;
+}
+
+/*
+ * Resolve one requested channel select against the swizzle currently stored
+ * in *pPVSSRC.
+ *
+ * pPVSSRC  operand whose swizzlex/y/z/w fields are consulted (not modified)
+ * sel      requested select
+ *
+ * Returns the operand's stored select for SQ_SEL_X/Y/Z/W; any other value of
+ * sel is returned unchanged.
+ */
+static BITS
+resolve_select_PVSSRC(const PVSSRC * pPVSSRC, BITS sel)
+{
+    if (sel == SQ_SEL_X)
+    {
+        return pPVSSRC->swizzlex;
+    }
+    if (sel == SQ_SEL_Y)
+    {
+        return pPVSSRC->swizzley;
+    }
+    if (sel == SQ_SEL_Z)
+    {
+        return pPVSSRC->swizzlez;
+    }
+    if (sel == SQ_SEL_W)
+    {
+        return pPVSSRC->swizzlew;
+    }
+
+    return sel;
+}
+
+/*
+ * Apply a second swizzle on top of the one already stored in *pPVSSRC.
+ *
+ * pPVSSRC     operand to update in place
+ * x, y, z, w  selects requested for the four result channels
+ *
+ * All four selects are resolved against the old swizzle before any field is
+ * written back, so no lookup sees a partially updated operand.
+ *
+ * Example: if the operand currently holds (Y, Y, Z, W), requesting
+ * (X, X, X, X) leaves it holding (Y, Y, Y, Y).
+ */
+void
+swizzleagain_PVSSRC(PVSSRC * pPVSSRC, BITS x, BITS y, BITS z, BITS w)
+{
+    /* read phase */
+    const BITS new_x = resolve_select_PVSSRC(pPVSSRC, x);
+    const BITS new_y = resolve_select_PVSSRC(pPVSSRC, y);
+    const BITS new_z = resolve_select_PVSSRC(pPVSSRC, z);
+    const BITS new_w = resolve_select_PVSSRC(pPVSSRC, w);
+
+    /* write phase */
+    pPVSSRC->swizzlex = new_x;
+    pPVSSRC->swizzley = new_y;
+    pPVSSRC->swizzlez = new_z;
+    pPVSSRC->swizzlew = new_w;
+}
+
+void neg_PVSSRC(PVSSRC* pPVSSRC) 
+{
+	pPVSSRC->negx = 1; /* set the negate flag of all four components */
+	pPVSSRC->negy = 1;
+	pPVSSRC->negz = 1;
+	pPVSSRC->negw = 1;
+}
+
+void noneg_PVSSRC(PVSSRC* pPVSSRC) 
+{
+	pPVSSRC->negx = 0; /* clear the negate flag of all four components */
+	pPVSSRC->negy = 0;
+	pPVSSRC->negz = 0;
+	pPVSSRC->negw = 0;
+}
+
+// Invert the negate flag of every component (e.g. to get SUB out of ADD and the like).
+void flipneg_PVSSRC(PVSSRC* pPVSSRC) 
+{
+	pPVSSRC->negx = !pPVSSRC->negx;
+	pPVSSRC->negy = !pPVSSRC->negy;
+	pPVSSRC->negz = !pPVSSRC->negz;
+	pPVSSRC->negw = !pPVSSRC->negw;
+}
+
+void zerocomp_PVSSRC(PVSSRC* pPVSSRC, int c) 
+{
+	/*
+	 * c picks the component (0..3 = x..w): its swizzle becomes SQ_SEL_0 and
+	 * its negate flag is cleared.  Any other value of c changes nothing.
+	 */
+	if (c == 0)      { pPVSSRC->swizzlex = SQ_SEL_0; pPVSSRC->negx = 0; }
+	else if (c == 1) { pPVSSRC->swizzley = SQ_SEL_0; pPVSSRC->negy = 0; }
+	else if (c == 2) { pPVSSRC->swizzlez = SQ_SEL_0; pPVSSRC->negz = 0; }
+	else if (c == 3) { pPVSSRC->swizzlew = SQ_SEL_0; pPVSSRC->negw = 0; }
+}
+
+void onecomp_PVSSRC(PVSSRC* pPVSSRC, int c) 
+{
+	/*
+	 * c picks the component (0..3 = x..w): its swizzle becomes SQ_SEL_1 and
+	 * its negate flag is cleared.  Any other value of c changes nothing.
+	 */
+	if (c == 0)      { pPVSSRC->swizzlex = SQ_SEL_1; pPVSSRC->negx = 0; }
+	else if (c == 1) { pPVSSRC->swizzley = SQ_SEL_1; pPVSSRC->negy = 0; }
+	else if (c == 2) { pPVSSRC->swizzlez = SQ_SEL_1; pPVSSRC->negz = 0; }
+	else if (c == 3) { pPVSSRC->swizzlew = SQ_SEL_1; pPVSSRC->negw = 0; }
+}
+
+BITS is_misc_component_exported(VAP_OUT_VTX_FMT_0* pOutVTXFmt0)  
+{
+	/* OR of the point_size, edge_flag, rta_index, kill_flag and
+	 * viewport_index fields: non-zero if any of them is non-zero. */
+	BITS uExported = pOutVTXFmt0->point_size | pOutVTXFmt0->edge_flag |
+			 pOutVTXFmt0->rta_index  | pOutVTXFmt0->kill_flag;
+	return uExported | pOutVTXFmt0->viewport_index;
+}
+
+BITS is_depth_component_exported(OUT_FRAGMENT_FMT_0* pFPOutFmt) 
+{
+	/* OR of the depth, stencil_ref, mask and coverage_to_mask fields. */
+	BITS uExported = pFPOutFmt->depth | pFPOutFmt->stencil_ref;
+
+	return uExported | pFPOutFmt->mask | pFPOutFmt->coverage_to_mask;
+}
+
+GLboolean is_reduction_opcode(PVSDWORD* dest)
+{
+    /*
+     * Only the two DOT4 variants of the OP2 encoding count as reduction
+     * opcodes; an instruction with op3 set is never one.
+     */
+    GLboolean bDot4 = (dest->dst.opcode == SQ_OP2_INST_DOT4) ||
+                      (dest->dst.opcode == SQ_OP2_INST_DOT4_IEEE);
+
+    return ((dest->dst.op3 == 0) && bDot4) ? GL_TRUE : GL_FALSE;
+}
+
+/**
+ * Translate a GL data type and channel count into a surface format.
+ *
+ * \param eType         GL_BYTE, GL_UNSIGNED_BYTE, GL_SHORT, GL_UNSIGNED_SHORT,
+ *                      GL_INT, GL_UNSIGNED_INT, GL_FLOAT or GL_DOUBLE
+ * \param nChannels     number of channels; 1 to 4 select a format
+ * \param pClient_size  if not NULL, receives element size * nChannels, where
+ *                      the element size is 0 for an unhandled eType
+ *
+ * \return the matching FMT_* value, or FMT_INVALID when eType is not handled
+ *         or nChannels is outside 1..4.
+ *
+ * NOTE(review): GL_DOUBLE reports 8-byte elements but maps to the 32-bit
+ * float formats -- confirm this is intended.
+ */
+GLuint GetSurfaceFormat(GLenum eType, GLuint nChannels, GLuint * pClient_size)
+{
+    /* Each table is indexed by (nChannels - 1). */
+    /* 1-byte integer elements */
+    static const GLuint aFmt8[4] =
+    {
+        FMT_8,
+        FMT_8_8,
+        FMT_8_8_8,
+        FMT_8_8_8_8
+    };
+    /* 2-byte integer elements */
+    static const GLuint aFmt16[4] =
+    {
+        FMT_16,
+        FMT_16_16,
+        FMT_16_16_16,
+        FMT_16_16_16_16
+    };
+    /* 4-byte integer elements */
+    static const GLuint aFmt32[4] =
+    {
+        FMT_32,
+        FMT_32_32,
+        FMT_32_32_32,
+        FMT_32_32_32_32
+    };
+    /* floating point elements */
+    static const GLuint aFmtFloat[4] =
+    {
+        FMT_32_FLOAT,
+        FMT_32_32_FLOAT,
+        FMT_32_32_32_FLOAT,
+        FMT_32_32_32_32_FLOAT
+    };
+
+    const GLuint * pTable = NULL;
+    GLuint uiElemSize = 0;
+    GLuint format = FMT_INVALID;
+
+    switch (eType)
+    {
+        case GL_BYTE:
+        case GL_UNSIGNED_BYTE:
+            uiElemSize = 1;
+            pTable = aFmt8;
+            break;
+
+        case GL_UNSIGNED_SHORT:
+        case GL_SHORT:
+            uiElemSize = 2;
+            pTable = aFmt16;
+            break;
+
+        case GL_UNSIGNED_INT:
+        case GL_INT:
+            uiElemSize = 4;
+            pTable = aFmt32;
+            break;
+
+        case GL_FLOAT:
+            uiElemSize = 4;
+            pTable = aFmtFloat;
+            break;
+
+        case GL_DOUBLE:
+            /* 8-byte client elements, 32-bit float formats (see NOTE) */
+            uiElemSize = 8;
+            pTable = aFmtFloat;
+            break;
+
+        default:
+            /* unhandled type: size stays 0 and the format stays invalid */
+            break;
+    }
+
+    /* Channel counts outside 1..4 keep FMT_INVALID. */
+    if ((pTable != NULL) && (nChannels >= 1) && (nChannels <= 4))
+    {
+        format = pTable[nChannels - 1];
+    }
+
+    /* The client size is reported even when no format was found. */
+    if (pClient_size != NULL)
+    {
+        *pClient_size = uiElemSize * nChannels;
+    }
+
+    return format;
+}
+
+unsigned int r700GetNumOperands(r700_AssemblerBase* pAsm)
+{
+    /*
+     * Number of source operands of the instruction described by pAsm->D.
+     * op3 instructions take three; an OP2 opcode missing from the lists below
+     * is reported via radeon_error() and also counted as three.
+     * (SQ_OP2_INST_MAX_DX10 / SQ_OP2_INST_MIN_DX10 are not listed.)
+     */
+    unsigned int uNumOperands = 3;
+
+    if(pAsm->D.dst.op3)
+    {
+        return uNumOperands;
+    }
+
+    switch (pAsm->D.dst.opcode)
+    {
+    /* two source operands */
+    case SQ_OP2_INST_ADD:            case SQ_OP2_INST_MUL:
+    case SQ_OP2_INST_MAX:            case SQ_OP2_INST_MIN:
+    case SQ_OP2_INST_SETGT:          case SQ_OP2_INST_SETGE:
+    case SQ_OP2_INST_PRED_SETE:      case SQ_OP2_INST_PRED_SETGT:
+    case SQ_OP2_INST_PRED_SETGE:     case SQ_OP2_INST_PRED_SETNE:
+    case SQ_OP2_INST_DOT4:           case SQ_OP2_INST_DOT4_IEEE:
+        uNumOperands = 2;
+        break;
+
+    /* one source operand */
+    case SQ_OP2_INST_MOV:            case SQ_OP2_INST_FRACT:
+    case SQ_OP2_INST_FLOOR:          case SQ_OP2_INST_KILLGT:
+    case SQ_OP2_INST_EXP_IEEE:       case SQ_OP2_INST_LOG_CLAMPED:
+    case SQ_OP2_INST_LOG_IEEE:       case SQ_OP2_INST_RECIP_IEEE:
+    case SQ_OP2_INST_RECIPSQRT_IEEE: case SQ_OP2_INST_FLT_TO_INT:
+    case SQ_OP2_INST_SIN:            case SQ_OP2_INST_COS:
+        uNumOperands = 1;
+        break;
+
+    default:
+        radeon_error(
+		    "Need instruction operand number for %x.\n", pAsm->D.dst.opcode);
+        break;
+    }
+
+    return uNumOperands;
+}
+
+int Init_r700_AssemblerBase(SHADER_PIPE_TYPE spt, r700_AssemblerBase* pAsm, R700_Shader* pShader)
+{
+    GLuint i;
+    /* Put pAsm into its initial state and attach the freshly initialised pShader. */
+    Init_R700_Shader(pShader);
+    pAsm->pR700Shader = pShader;
+    pAsm->currentShaderType = spt;
+
+	pAsm->cf_last_export_ptr   = NULL;
+
+	pAsm->cf_current_export_clause_ptr = NULL;
+	pAsm->cf_current_alu_clause_ptr    = NULL;
+	pAsm->cf_current_tex_clause_ptr    = NULL;
+	pAsm->cf_current_vtx_clause_ptr    = NULL;
+	pAsm->cf_current_cf_clause_ptr     = NULL;
+
+	// No clause has been created yet
+	pAsm->cf_current_clause_type = CF_EMPTY_CLAUSE;
+
+	pAsm->number_of_colorandz_exports = 0;
+	pAsm->number_of_exports           = 0;
+	pAsm->number_of_export_opcodes    = 0;
+
+	// Clear the destination (D) and the three source (S[0..2]) descriptors
+	pAsm->D.bits = 0;
+	pAsm->S[0].bits = 0;
+	pAsm->S[1].bits = 0;
+	pAsm->S[2].bits = 0;
+
+	pAsm->uLastPosUpdate = 0; 
+	// NOTE(review): zeroes fp_stOutFmt0 through a BITS* cast; assumes it is exactly one BITS wide
+	*(BITS *) &pAsm->fp_stOutFmt0 = 0;
+
+	pAsm->uIIns = 0;
+	pAsm->uOIns = 0;
+	pAsm->number_used_registers = 0;
+	pAsm->uUsedConsts = 256; // NOTE(review): meaning of the initial value 256 is not visible here
+
+
+	// Fragment programs
+	pAsm->uBoolConsts = 0;
+	pAsm->uIntConsts = 0;
+	pAsm->uInsts = 0;
+	pAsm->uConsts = 0;
+
+	pAsm->FCSP = 0;
+	pAsm->fc_stack[0].type = FC_NONE;
+
+	pAsm->branch_depth     = 0;
+	pAsm->max_branch_depth = 0;
+
+	pAsm->aArgSubst[0] =
+	pAsm->aArgSubst[1] =
+	pAsm->aArgSubst[2] =
+	pAsm->aArgSubst[3] = (-1);
+
+	pAsm->uOutputs = 0;
+
+	for (i=0; i<NUMBER_OF_OUTPUT_COLORS; i++) 
+	{
+		pAsm->color_export_register_number[i] = (-1);
+	}
+
+	// -1 presumably marks "no register assigned" for the export register numbers (confirm)
+	pAsm->depth_export_register_number = (-1);
+	pAsm->stencil_export_register_number = (-1);
+	pAsm->coverage_to_mask_export_register_number = (-1);
+	pAsm->mask_export_register_number = (-1);
+
+	pAsm->starting_export_register_number = 0;
+	pAsm->starting_vfetch_register_number = 0;
+	pAsm->starting_temp_register_number   = 0;
+	pAsm->uFirstHelpReg = 0;
+
+	// All "input is used" flags start out false
+	pAsm->input_position_is_used = GL_FALSE;
+	pAsm->input_normal_is_used   = GL_FALSE;
+
+
+	for (i=0; i<NUMBER_OF_INPUT_COLORS; i++) 
+	{
+		pAsm->input_color_is_used[ i ] = GL_FALSE;
+	}
+
+	for (i=0; i<NUMBER_OF_TEXTURE_UNITS; i++) 
+	{
+		pAsm->input_texture_unit_is_used[ i ] = GL_FALSE;
+	}
+
+	for (i=0; i<VERT_ATTRIB_MAX; i++) 
+	{
+		pAsm->vfetch_instruction_ptr_array[ i ] = NULL;
+	}
+
+	pAsm->number_of_inputs = 0;
+
+	return 0; // this function has no failure path
+}
+
+GLboolean IsTex(gl_inst_opcode Opcode)
+{
+    /* Texture opcodes recognised here: TEX, TXP and TXB. */
+    GLboolean bTex = (OPCODE_TEX == Opcode) ||
+                     (OPCODE_TXP == Opcode) ||
+                     (OPCODE_TXB == Opcode);
+    return bTex ? GL_TRUE : GL_FALSE;
+}
+
+GLboolean IsAlu(gl_inst_opcode Opcode)
+{
+    /*
+     * Every opcode that IsTex() does not claim is treated as ALU.
+     * TODO : more for fc and ex for higher spec.
+     */
+    GLboolean bTex = IsTex(Opcode);
+    return bTex ? GL_FALSE : GL_TRUE;
+}
+
+/**
+ * Make new_clause_type the assembler's current control-flow clause type.
+ *
+ * If the type does not change nothing is done.  Otherwise the pointer to the
+ * currently open clause is reset and the new type is recorded; switching to
+ * CF_EXPORT_CLAUSE additionally allocates an export CF instruction and
+ * appends it to the shader.
+ *
+ * \param pAsm             assembler state to update
+ * \param new_clause_type  clause type the caller is about to emit into
+ * \return GL_TRUE on success, GL_FALSE on an unknown clause type or when the
+ *         export instruction cannot be allocated.
+ */
+int check_current_clause(r700_AssemblerBase* pAsm,
+					     CF_CLAUSE_TYPE      new_clause_type)
+{
+	if (pAsm->cf_current_clause_type != new_clause_type) 
+	{	//Close last open clause
+		switch (pAsm->cf_current_clause_type) 
+		{
+		case CF_ALU_CLAUSE:
+			pAsm->cf_current_alu_clause_ptr = NULL;
+			break;
+		case CF_VTX_CLAUSE:
+			pAsm->cf_current_vtx_clause_ptr = NULL;
+			break;
+		case CF_TEX_CLAUSE:
+			pAsm->cf_current_tex_clause_ptr = NULL;
+			break;
+		case CF_EXPORT_CLAUSE:
+			pAsm->cf_current_export_clause_ptr = NULL;
+			break;
+		case CF_OTHER_CLAUSE:
+			pAsm->cf_current_cf_clause_ptr = NULL;
+			break;
+		case CF_EMPTY_CLAUSE:
+			break;
+		default:
+			/* The unknown value is the current type being switched on,
+			 * so that is the one to report. */
+			radeon_error(
+			           "Unknown CF_CLAUSE_TYPE (%d) in check_current_clause. \n",
+			           (int) pAsm->cf_current_clause_type);
+			return GL_FALSE;
+		}
+
+		pAsm->cf_current_clause_type = CF_EMPTY_CLAUSE;
+
+		// Create new clause
+		switch (new_clause_type) 
+		{
+		case CF_ALU_CLAUSE:
+			pAsm->cf_current_clause_type = CF_ALU_CLAUSE;
+			break;
+		case CF_VTX_CLAUSE:
+			pAsm->cf_current_clause_type = CF_VTX_CLAUSE;
+			break;
+		case CF_TEX_CLAUSE:
+			pAsm->cf_current_clause_type = CF_TEX_CLAUSE;
+			break;
+		case CF_EXPORT_CLAUSE:
+			{
+				R700ControlFlowSXClause* pR700ControlFlowSXClause 
+				            = (R700ControlFlowSXClause*) CALLOC_STRUCT(R700ControlFlowSXClause); 
+
+				// Add new export instruction to control flow program
+				if (pR700ControlFlowSXClause != 0) 
+				{
+					pAsm->cf_current_export_clause_ptr = pR700ControlFlowSXClause;
+					Init_R700ControlFlowSXClause(pR700ControlFlowSXClause);
+					AddCFInstruction( pAsm->pR700Shader, 
+					                  (R700ControlFlowInstruction *)pR700ControlFlowSXClause );
+				}
+				else 
+				{
+					radeon_error(
+					           "Error allocating new EXPORT CF instruction in check_current_clause. \n");
+					return GL_FALSE;
+				}
+				pAsm->cf_current_clause_type = CF_EXPORT_CLAUSE;
+			}
+			break;
+		case CF_EMPTY_CLAUSE:
+			break;
+		case CF_OTHER_CLAUSE:
+			pAsm->cf_current_clause_type = CF_OTHER_CLAUSE;
+			break;
+		default:
+			radeon_error(
+			           "Unknown CF_CLAUSE_TYPE (%d) in check_current_clause. \n", (int) new_clause_type);
+			return GL_FALSE;
+		}
+	}
+
+	return GL_TRUE;
+}
+
+/**
+ * Append a vertex fetch instruction to the shader, opening a new VTX
+ * control-flow clause when none is open or the open one is full.
+ * \return GL_TRUE on success, GL_FALSE on a clause switch or allocation failure.
+ */
+GLboolean add_vfetch_instruction(r700_AssemblerBase*     pAsm,
+								 R700VertexInstruction*  vertex_instruction_ptr)
+{
+	R700ControlFlowGenericClause *pClause;
+
+	if (check_current_clause(pAsm, CF_VTX_CLAUSE) == GL_FALSE)
+		return GL_FALSE;
+
+	pClause = pAsm->cf_current_vtx_clause_ptr;
+	if (pClause != NULL &&
+	    pClause->m_Word1.f.count < GetCFMaxInstructions(pClause->m_ShaderInstType)-1)
+	{
+		// Still room in the open clause: account for one more fetch
+		pClause->m_Word1.f.count++;
+	}
+	else
+	{
+		// Start a new VTX control flow instruction for this clause
+		pClause = (R700ControlFlowGenericClause*) CALLOC_STRUCT(R700ControlFlowGenericClause);
+		pAsm->cf_current_vtx_clause_ptr = pClause;
+		if (pClause == NULL)
+		{
+			radeon_error("Could not allocate a new VFetch CF instruction.\n");
+			return GL_FALSE;
+		}
+		Init_R700ControlFlowGenericClause(pClause);
+		AddCFInstruction(pAsm->pR700Shader, (R700ControlFlowInstruction *)pClause);
+
+		pClause->m_Word1.f.pop_count        = 0x0;
+		pClause->m_Word1.f.cf_const         = 0x0;
+		pClause->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+		pClause->m_Word1.f.count            = 0x0;
+		pClause->m_Word1.f.end_of_program   = 0x0;
+		pClause->m_Word1.f.valid_pixel_mode = 0x0;
+		pClause->m_Word1.f.cf_inst          = SQ_CF_INST_VTX;
+		pClause->m_Word1.f.whole_quad_mode  = 0x0;
+		pClause->m_Word1.f.barrier          = 0x1;
+		LinkVertexInstruction(pClause, vertex_instruction_ptr);
+	}
+
+	AddVTXInstruction(pAsm->pR700Shader, vertex_instruction_ptr);
+
+	return GL_TRUE;
+}
+
+/**
+ * Append a texture instruction to the shader, opening a new TEX control-flow
+ * clause when none is open or the open one is full.  The clause's barrier bit
+ * is set when the current instruction has a recorded destination dependency.
+ * \return GL_TRUE on success, GL_FALSE on a clause switch or allocation failure.
+ */
+GLboolean add_tex_instruction(r700_AssemblerBase*     pAsm,
+                              R700TextureInstruction* tex_instruction_ptr)
+{
+    R700ControlFlowGenericClause *pClause;
+
+    if ( check_current_clause(pAsm, CF_TEX_CLAUSE) == GL_FALSE )
+        return GL_FALSE;
+
+    pClause = pAsm->cf_current_tex_clause_ptr;
+    if ( (pClause != NULL) &&
+         (pClause->m_Word1.f.count < GetCFMaxInstructions(pClause->m_ShaderInstType)-1) )
+    {
+        // The open clause can take one more instruction
+        pClause->m_Word1.f.count++;
+    }
+    else
+    {
+        // Open a new TEX control flow instruction for this clause
+        pClause = (R700ControlFlowGenericClause*) CALLOC_STRUCT(R700ControlFlowGenericClause);
+        pAsm->cf_current_tex_clause_ptr = pClause;
+        if (pClause == NULL)
+        {
+            radeon_error("Could not allocate a new TEX CF instruction.\n");
+            return GL_FALSE;
+        }
+        Init_R700ControlFlowGenericClause(pClause);
+        AddCFInstruction(pAsm->pR700Shader, (R700ControlFlowInstruction *)pClause);
+        pClause->m_Word1.f.pop_count        = 0x0;
+        pClause->m_Word1.f.cf_const         = 0x0;
+        pClause->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+        pClause->m_Word1.f.end_of_program   = 0x0;
+        pClause->m_Word1.f.valid_pixel_mode = 0x0;
+        pClause->m_Word1.f.cf_inst          = SQ_CF_INST_TEX;
+        pClause->m_Word1.f.whole_quad_mode  = 0x0;
+        pClause->m_Word1.f.barrier          = 0x0;   //0x1;
+    }
+
+    // A TEX instruction that depends on a previous instruction needs the barrier bit
+    if( pAsm->pInstDeps[pAsm->uiCurInst].nDstDep > (-1) )
+    {
+        pClause->m_Word1.f.barrier = 0x1;
+    }
+
+    // Link the instruction only if the clause has no linked TEX instruction yet
+    if(NULL == pClause->m_pLinkedTEXInstruction)
+    {
+        pClause->m_pLinkedTEXInstruction = tex_instruction_ptr;
+        tex_instruction_ptr->m_pLinkedGenericClause = pClause;
+    }
+
+    AddTEXInstruction(pAsm->pR700Shader, tex_instruction_ptr);
+
+    return GL_TRUE;
+}
+
+/**
+ * Program the vertex fetch instruction for client array gl_client_id.
+ *
+ * When pAsm->vfetch_instruction_ptr_array[gl_client_id] already holds an
+ * instruction it is re-programmed in place.  Otherwise a new instruction is
+ * allocated, initialized, handed to add_vfetch_instruction() and recorded in
+ * that array.
+ *
+ * \param pAsm                  assembler state
+ * \param gl_client_id          used as the fetch buffer_id and as the index
+ *                              into vfetch_instruction_ptr_array
+ * \param destination_register  written to the dst_gpr field
+ * \param number_of_elements    components present; absent x/y/z select
+ *                              SQ_SEL_0 and an absent w selects SQ_SEL_1
+ * \param dataElementType       GL type forwarded to GetSurfaceFormat()
+ * \param pFetchMethod          fetch method; mega_fetch_remainder is updated
+ *                              on the mega fetch path
+ * \return GL_TRUE on success, GL_FALSE if allocation or registration fails
+ */
+GLboolean assemble_vfetch_instruction(r700_AssemblerBase* pAsm,
+                                      GLuint gl_client_id,
+                                      GLuint destination_register,
+                                      GLuint number_of_elements,
+                                      GLenum dataElementType,
+                                      VTX_FETCH_METHOD* pFetchMethod)
+{
+    GLuint client_size_inbyte;
+    GLuint data_format;
+    /* Zero defaults: the mini fetch path below is still a TODO and would
+     * otherwise leave these two values uninitialized before they are
+     * encoded into the instruction words. */
+    GLuint mega_fetch_count   = 0;
+    GLuint is_mega_fetch_flag = 0;
+
+    R700VertexGenericFetch*   vfetch_instruction_ptr;
+    R700VertexGenericFetch*   assembled_vfetch_instruction_ptr = pAsm->vfetch_instruction_ptr_array[ gl_client_id ];
+
+    if (assembled_vfetch_instruction_ptr == NULL)
+    {
+        vfetch_instruction_ptr = (R700VertexGenericFetch*) CALLOC_STRUCT(R700VertexGenericFetch);
+        if (vfetch_instruction_ptr == NULL)
+        {
+            return GL_FALSE;
+        }
+        Init_R700VertexGenericFetch(vfetch_instruction_ptr);
+    }
+    else
+    {
+        vfetch_instruction_ptr = assembled_vfetch_instruction_ptr;
+    }
+
+    /* Also reports the per-vertex byte size through client_size_inbyte. */
+    data_format = GetSurfaceFormat(dataElementType, number_of_elements, &client_size_inbyte);
+
+    if(GL_TRUE == pFetchMethod->bEnableMini) //More conditions here
+    {
+        //TODO : mini fetch
+    }
+    else
+    {
+        mega_fetch_count = MEGA_FETCH_BYTES - 1;
+        is_mega_fetch_flag       = 0x1;
+        pFetchMethod->mega_fetch_remainder = MEGA_FETCH_BYTES - client_size_inbyte;
+    }
+
+    vfetch_instruction_ptr->m_Word0.f.vtx_inst         = SQ_VTX_INST_FETCH;
+    vfetch_instruction_ptr->m_Word0.f.fetch_type       = SQ_VTX_FETCH_VERTEX_DATA;
+    vfetch_instruction_ptr->m_Word0.f.fetch_whole_quad = 0x0;
+
+    vfetch_instruction_ptr->m_Word0.f.buffer_id        = gl_client_id;
+    vfetch_instruction_ptr->m_Word0.f.src_gpr          = 0x0;
+    vfetch_instruction_ptr->m_Word0.f.src_rel          = SQ_ABSOLUTE;
+    vfetch_instruction_ptr->m_Word0.f.src_sel_x        = SQ_SEL_X;
+    vfetch_instruction_ptr->m_Word0.f.mega_fetch_count = mega_fetch_count;
+
+    vfetch_instruction_ptr->m_Word1.f.dst_sel_x        = (number_of_elements < 1) ? SQ_SEL_0 : SQ_SEL_X;
+    vfetch_instruction_ptr->m_Word1.f.dst_sel_y        = (number_of_elements < 2) ? SQ_SEL_0 : SQ_SEL_Y;
+    vfetch_instruction_ptr->m_Word1.f.dst_sel_z        = (number_of_elements < 3) ? SQ_SEL_0 : SQ_SEL_Z;
+    vfetch_instruction_ptr->m_Word1.f.dst_sel_w        = (number_of_elements < 4) ? SQ_SEL_1 : SQ_SEL_W;
+
+    vfetch_instruction_ptr->m_Word1.f.use_const_fields = 1;
+
+    // Destination register
+    vfetch_instruction_ptr->m_Word1_GPR.f.dst_gpr = destination_register;
+    vfetch_instruction_ptr->m_Word1_GPR.f.dst_rel = SQ_ABSOLUTE;
+
+    vfetch_instruction_ptr->m_Word2.f.offset              = 0;
+    vfetch_instruction_ptr->m_Word2.f.const_buf_no_stride = 0x0;
+
+    vfetch_instruction_ptr->m_Word2.f.mega_fetch          = is_mega_fetch_flag;
+
+    /* Only a newly allocated instruction has to be registered. */
+    if (assembled_vfetch_instruction_ptr == NULL)
+    {
+        if ( GL_FALSE == add_vfetch_instruction(pAsm, (R700VertexInstruction *)vfetch_instruction_ptr) )
+        {
+            return GL_FALSE;
+        }
+
+        if (pAsm->vfetch_instruction_ptr_array[ gl_client_id ] != NULL)
+        {
+            return GL_FALSE;
+        }
+        else
+        {
+            pAsm->vfetch_instruction_ptr_array[ gl_client_id ] = vfetch_instruction_ptr;
+        }
+    }
+
+    return GL_TRUE;
+}
+
+/* Return the current helper register number, advance the helper cursor and
+ * raise number_used_registers if the cursor has moved past it. */
+GLuint gethelpr(r700_AssemblerBase* pAsm)
+{
+    const GLuint uAllocated = pAsm->uHelpReg;
+
+    pAsm->uHelpReg++;
+
+    if (pAsm->number_used_registers < pAsm->uHelpReg)
+    {
+        pAsm->number_used_registers = pAsm->uHelpReg;
+    }
+
+    return uAllocated;
+}
+/* Rewind the helper register cursor to uFirstHelpReg. */
+void resethelpr(r700_AssemblerBase* pAsm)
+{
+    pAsm->uHelpReg = pAsm->uFirstHelpReg;
+}
+
+/* Reset per-instruction operand state: rewind the helper registers and mark
+ * all four argument substitution slots as unused (-1). */
+void checkop_init(r700_AssemblerBase* pAsm)
+{
+    int iSlot;
+
+    resethelpr(pAsm);
+
+    for (iSlot = 0; iSlot < 4; iSlot++)
+    {
+        pAsm->aArgSubst[iSlot] = -1;
+    }
+}
+
+/*
+ * Emit a MOV of source operand `src` of the current instruction into a
+ * freshly obtained helper register, then record that register in
+ * aArgSubst[1 + src] so that assemble_src() uses it in place of the
+ * original operand.
+ *
+ * Returns GL_FALSE if assembling the source or emitting the instruction
+ * fails, GL_TRUE otherwise.
+ */
+GLboolean mov_temp(r700_AssemblerBase* pAsm, int src)
+{
+    const GLuint uHelper = gethelpr(pAsm);
+
+    /* Destination: the helper GPR, set up through nomask_PVSDST(). */
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg   = uHelper;
+    nomask_PVSDST(&(pAsm->D.dst));
+
+    /* Source: operand `src`, placed in source field 0. */
+    if (!assemble_src(pAsm, src, 0))
+    {
+        return GL_FALSE;
+    }
+    noswizzle_PVSSRC(&(pAsm->S[0].src));
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if (!next_ins(pAsm))
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->aArgSubst[1 + src] = uHelper;
+    return GL_TRUE;
+}
+
+/* Operand check for one-source instructions: only resets the per-instruction
+ * operand state and always succeeds. */
+GLboolean checkop1(r700_AssemblerBase* pAsm)
+{
+    checkop_init(pAsm);
+
+    return GL_TRUE;
+}
+
+/*
+ * Operand check for two-source instructions.
+ *
+ * If both sources come from a constant-like register file (CONSTANT,
+ * LOCAL_PARAM, ENV_PARAM or STATE_VAR) and their indices differ, source 1
+ * is first copied to a helper register with mov_temp().
+ *
+ * Returns GL_FALSE only when that copy fails.
+ */
+GLboolean checkop2(r700_AssemblerBase* pAsm)
+{
+    struct prog_instruction *pInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+    GLboolean bIsConst[2];
+    int iSrc;
+
+    checkop_init(pAsm);
+
+    for (iSrc = 0; iSrc < 2; iSrc++)
+    {
+        switch (pInst->SrcReg[iSrc].File)
+        {
+        case PROGRAM_CONSTANT:
+        case PROGRAM_LOCAL_PARAM:
+        case PROGRAM_ENV_PARAM:
+        case PROGRAM_STATE_VAR:
+            bIsConst[iSrc] = GL_TRUE;
+            break;
+        default:
+            bIsConst[iSrc] = GL_FALSE;
+            break;
+        }
+    }
+
+    if ( (bIsConst[0] != GL_TRUE) || (bIsConst[1] != GL_TRUE) )
+    {
+        return GL_TRUE;
+    }
+
+    if (pInst->SrcReg[0].Index == pInst->SrcReg[1].Index)
+    {
+        return GL_TRUE;
+    }
+
+    if (GL_FALSE == mov_temp(pAsm, 1))
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+
+/*
+ * Operand check for three-source instructions.
+ *
+ * A source counts as "constant" when its register file is CONSTANT,
+ * LOCAL_PARAM, ENV_PARAM or STATE_VAR.  Depending on which sources are
+ * constant, some of them are copied to helper registers with mov_temp():
+ *   - all three constant:  sources 1 and 2 are copied;
+ *   - 0 and 1 constant:    source 1 is copied if the indices differ;
+ *   - 0 and 2 constant:    source 2 is copied if the indices differ;
+ *   - 1 and 2 constant:    source 2 is copied if the indices differ.
+ *
+ * Returns GL_FALSE as soon as any mov_temp() fails, GL_TRUE otherwise.
+ */
+GLboolean checkop3(r700_AssemblerBase* pAsm)
+{
+    GLboolean bSrcConst[3];
+    int iSrc;
+    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+
+    checkop_init(pAsm);
+
+    for (iSrc = 0; iSrc < 3; iSrc++)
+    {
+        switch (pILInst->SrcReg[iSrc].File)
+        {
+        case PROGRAM_CONSTANT:
+        case PROGRAM_LOCAL_PARAM:
+        case PROGRAM_ENV_PARAM:
+        case PROGRAM_STATE_VAR:
+            bSrcConst[iSrc] = GL_TRUE;
+            break;
+        default:
+            bSrcConst[iSrc] = GL_FALSE;
+            break;
+        }
+    }
+
+    if( (GL_TRUE == bSrcConst[0]) &&
+        (GL_TRUE == bSrcConst[1]) &&
+        (GL_TRUE == bSrcConst[2]) )
+    {
+        if( GL_FALSE == mov_temp(pAsm, 1) )
+        {
+            return GL_FALSE;
+        }
+        if( GL_FALSE == mov_temp(pAsm, 2) )
+        {
+            return GL_FALSE;
+        }
+
+        return GL_TRUE;
+    }
+    else if( (GL_TRUE == bSrcConst[0]) &&
+             (GL_TRUE == bSrcConst[1]) )
+    {
+        if(pILInst->SrcReg[0].Index != pILInst->SrcReg[1].Index)
+        {
+            if( GL_FALSE == mov_temp(pAsm, 1) )
+            {
+                /* Was "return 1", which equals GL_TRUE and hid the failure. */
+                return GL_FALSE;
+            }
+        }
+
+        return GL_TRUE;
+    }
+    else if ( (GL_TRUE == bSrcConst[0]) &&
+              (GL_TRUE == bSrcConst[2]) )
+    {
+        if(pILInst->SrcReg[0].Index != pILInst->SrcReg[2].Index)
+        {
+            if( GL_FALSE == mov_temp(pAsm, 2) )
+            {
+                return GL_FALSE;
+            }
+        }
+
+        return GL_TRUE;
+    }
+    else if( (GL_TRUE == bSrcConst[1]) &&
+             (GL_TRUE == bSrcConst[2]) )
+    {
+        if(pILInst->SrcReg[1].Index != pILInst->SrcReg[2].Index)
+        {
+            if( GL_FALSE == mov_temp(pAsm, 2) )
+            {
+                return GL_FALSE;
+            }
+        }
+
+        return GL_TRUE;
+    }
+
+    return GL_TRUE;
+}
+
+/*
+ * Fill source field `fld` of the assembler (pAsm->S[fld].src) from source
+ * operand `src` of the current instruction.
+ *
+ * A `fld` of -1 means "use the same index as src".  If a substitution
+ * register has been recorded in aArgSubst[1 + src], that temporary is used
+ * instead of the operand's own register.  Swizzle and negate bits are always
+ * taken from the instruction operand.
+ *
+ * Returns GL_FALSE for an unsupported register file, GL_TRUE otherwise.
+ */
+GLboolean assemble_src(r700_AssemblerBase *pAsm,
+                       int src,
+                       int fld)
+{
+    struct prog_instruction *pInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+    PVSSRC *pOut;
+    GLuint uSwizzle;
+    GLuint uNegate;
+
+    if (-1 == fld)
+    {
+        fld = src;
+    }
+    pOut = &(pAsm->S[fld].src);
+
+    if (pAsm->aArgSubst[1 + src] >= 0)
+    {
+        /* Operand was previously moved to a helper register. */
+        setaddrmode_PVSSRC(pOut, ADDR_ABSOLUTE);
+        pOut->rtype = SRC_REG_TEMPORARY;
+        pOut->reg   = pAsm->aArgSubst[1 + src];
+    }
+    else
+    {
+        switch (pInst->SrcReg[src].File)
+        {
+        case PROGRAM_TEMPORARY:
+            setaddrmode_PVSSRC(pOut, ADDR_ABSOLUTE);
+            pOut->rtype = SRC_REG_TEMPORARY;
+            pOut->reg   = pInst->SrcReg[src].Index + pAsm->starting_temp_register_number;
+            break;
+
+        case PROGRAM_CONSTANT:
+        case PROGRAM_LOCAL_PARAM:
+        case PROGRAM_ENV_PARAM:
+        case PROGRAM_STATE_VAR:
+            /* Constants may be addressed relative to A0. */
+            setaddrmode_PVSSRC(pOut,
+                               (1 == pInst->SrcReg[src].RelAddr) ? ADDR_RELATIVE_A0
+                                                                 : ADDR_ABSOLUTE);
+            pOut->rtype = SRC_REG_CONSTANT;
+            pOut->reg   = pInst->SrcReg[src].Index;
+            break;
+
+        case PROGRAM_INPUT:
+            setaddrmode_PVSSRC(pOut, ADDR_ABSOLUTE);
+            pOut->rtype = SRC_REG_INPUT;
+            if (SPT_FP == pAsm->currentShaderType)
+            {
+                pOut->reg = pAsm->uiFP_AttributeMap[pInst->SrcReg[src].Index];
+            }
+            else if (SPT_VP == pAsm->currentShaderType)
+            {
+                pOut->reg = pAsm->ucVP_AttributeMap[pInst->SrcReg[src].Index];
+            }
+            break;
+
+        default:
+            radeon_error("Invalid source argument type\n");
+            return GL_FALSE;
+        }
+    }
+
+    /* Swizzle: 3 bits per component, x in the low bits. */
+    uSwizzle = pInst->SrcReg[src].Swizzle;
+    pOut->swizzlex = uSwizzle & 0x7;
+    pOut->swizzley = (uSwizzle >> 3) & 0x7;
+    pOut->swizzlez = (uSwizzle >> 6) & 0x7;
+    pOut->swizzlew = (uSwizzle >> 9) & 0x7;
+
+    /* Negate: 1 bit per component, x in bit 0. */
+    uNegate = pInst->SrcReg[src].Negate;
+    pOut->negx = uNegate & 0x1;
+    pOut->negy = (uNegate >> 1) & 0x1;
+    pOut->negz = (uNegate >> 2) & 0x1;
+    pOut->negw = (uNegate >> 3) & 0x1;
+
+    return GL_TRUE;
+}
+
+/*
+ * Fill pAsm->D.dst from the destination register of the current instruction.
+ * Temporary, address and output register files are accepted; any other file
+ * is reported and GL_FALSE is returned.  The write mask is copied bit by bit
+ * (x in bit 0).
+ */
+GLboolean assemble_dst(r700_AssemblerBase *pAsm)
+{
+    struct prog_instruction *pInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+    GLuint uMask;
+
+    if (PROGRAM_TEMPORARY == pInst->DstReg.File)
+    {
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+        pAsm->D.dst.reg   = pInst->DstReg.Index + pAsm->starting_temp_register_number;
+    }
+    else if (PROGRAM_ADDRESS == pInst->DstReg.File)
+    {
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        pAsm->D.dst.rtype = DST_REG_A0;
+        pAsm->D.dst.reg   = 0;
+    }
+    else if (PROGRAM_OUTPUT == pInst->DstReg.File)
+    {
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        pAsm->D.dst.rtype = DST_REG_OUT;
+        /* Output index is remapped per shader type. */
+        if (SPT_FP == pAsm->currentShaderType)
+        {
+            pAsm->D.dst.reg = pAsm->uiFP_OutputMap[pInst->DstReg.Index];
+        }
+        else if (SPT_VP == pAsm->currentShaderType)
+        {
+            pAsm->D.dst.reg = pAsm->ucVP_OutputMap[pInst->DstReg.Index];
+        }
+    }
+    else
+    {
+        radeon_error("Invalid destination output argument type\n");
+        return GL_FALSE;
+    }
+
+    uMask = pInst->DstReg.WriteMask;
+    pAsm->D.dst.writex = uMask & 0x1;
+    pAsm->D.dst.writey = (uMask >> 1) & 0x1;
+    pAsm->D.dst.writez = (uMask >> 2) & 0x1;
+    pAsm->D.dst.writew = (uMask >> 3) & 0x1;
+
+    return GL_TRUE;
+}
+
+/*
+ * Fill pAsm->D.dst for a texture instruction.  Only temporary and output
+ * destination register files are accepted; anything else is reported and
+ * GL_FALSE is returned.  The write mask is copied bit by bit (x in bit 0).
+ */
+GLboolean tex_dst(r700_AssemblerBase *pAsm)
+{
+    struct prog_instruction *pInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+    GLuint uMask;
+
+    switch (pInst->DstReg.File)
+    {
+    case PROGRAM_TEMPORARY:
+        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+        pAsm->D.dst.reg   = pInst->DstReg.Index + pAsm->starting_temp_register_number;
+
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        break;
+
+    case PROGRAM_OUTPUT:
+        pAsm->D.dst.rtype = DST_REG_OUT;
+        /* Output index is remapped per shader type. */
+        if (SPT_FP == pAsm->currentShaderType)
+        {
+            pAsm->D.dst.reg = pAsm->uiFP_OutputMap[pInst->DstReg.Index];
+        }
+        else if (SPT_VP == pAsm->currentShaderType)
+        {
+            pAsm->D.dst.reg = pAsm->ucVP_OutputMap[pInst->DstReg.Index];
+        }
+
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        break;
+
+    default:
+        radeon_error("Invalid destination output argument type\n");
+        return GL_FALSE;
+    }
+
+    uMask = pInst->DstReg.WriteMask;
+    pAsm->D.dst.writex = uMask & 0x1;
+    pAsm->D.dst.writey = (uMask >> 1) & 0x1;
+    pAsm->D.dst.writez = (uMask >> 2) & 0x1;
+    pAsm->D.dst.writew = (uMask >> 3) & 0x1;
+
+    return GL_TRUE;
+}
+
+/*
+ * Fill pAsm->S[0].src with the texture coordinate operand (source 0) of the
+ * current instruction.
+ *
+ * Accepted sources:
+ *   - constant-like files: read from the register in aArgSubst[1]
+ *     (NOTE(review): presumably set up beforehand via mov_temp - confirm);
+ *   - temporaries;
+ *   - fragment inputs COL0, COL1 and TEX0..TEX7.
+ * Anything else is reported and GL_FALSE is returned.
+ */
+GLboolean tex_src(r700_AssemblerBase *pAsm)
+{
+    struct prog_instruction *pInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+    PVSSRC *pCoord = &(pAsm->S[0].src);
+    GLboolean bCoordOk = GL_FALSE;
+    GLuint uSwizzle;
+    GLuint uNegate;
+
+    switch (pInst->SrcReg[0].File)
+    {
+    case PROGRAM_CONSTANT:
+    case PROGRAM_LOCAL_PARAM:
+    case PROGRAM_ENV_PARAM:
+    case PROGRAM_STATE_VAR:
+        bCoordOk = GL_TRUE;
+        setaddrmode_PVSSRC(pCoord, ADDR_ABSOLUTE);
+        pCoord->rtype = SRC_REG_TEMPORARY;
+        pCoord->reg   = pAsm->aArgSubst[1];
+        break;
+
+    case PROGRAM_TEMPORARY:
+        bCoordOk = GL_TRUE;
+        pCoord->reg   = pInst->SrcReg[0].Index + pAsm->starting_temp_register_number;
+        pCoord->rtype = SRC_REG_TEMPORARY;
+        break;
+
+    case PROGRAM_INPUT:
+        switch (pInst->SrcReg[0].Index)
+        {
+        case FRAG_ATTRIB_COL0:
+        case FRAG_ATTRIB_COL1:
+        case FRAG_ATTRIB_TEX0:
+        case FRAG_ATTRIB_TEX1:
+        case FRAG_ATTRIB_TEX2:
+        case FRAG_ATTRIB_TEX3:
+        case FRAG_ATTRIB_TEX4:
+        case FRAG_ATTRIB_TEX5:
+        case FRAG_ATTRIB_TEX6:
+        case FRAG_ATTRIB_TEX7:
+            bCoordOk = GL_TRUE;
+            pCoord->reg   = pAsm->uiFP_AttributeMap[pInst->SrcReg[0].Index];
+            pCoord->rtype = SRC_REG_INPUT;
+            break;
+        }
+        break;
+    }
+
+    if (GL_TRUE != bCoordOk)
+    {
+        radeon_error("Invalid source texcoord for TEX instruction\n");
+        return GL_FALSE;
+    }
+
+    setaddrmode_PVSSRC(pCoord, ADDR_ABSOLUTE);
+
+    /* Swizzle: 3 bits per component, x in the low bits. */
+    uSwizzle = pInst->SrcReg[0].Swizzle;
+    pCoord->swizzlex = uSwizzle & 0x7;
+    pCoord->swizzley = (uSwizzle >> 3) & 0x7;
+    pCoord->swizzlez = (uSwizzle >> 6) & 0x7;
+    pCoord->swizzlew = (uSwizzle >> 9) & 0x7;
+
+    /* Negate: 1 bit per component, x in bit 0. */
+    uNegate = pInst->SrcReg[0].Negate;
+    pCoord->negx = uNegate & 0x1;
+    pCoord->negy = (uNegate >> 1) & 0x1;
+    pCoord->negz = (uNegate >> 2) & 0x1;
+    pCoord->negw = (uNegate >> 3) & 0x1;
+
+    return GL_TRUE;
+}
+
+/*
+ * Build a texture instruction from the current assembler state and add it
+ * through add_tex_instruction().
+ *
+ * pAsm->S[0].src supplies the texture coordinate, pAsm->S[1].src supplies
+ * the texture unit (used for both resource_id and sampler_id) and
+ * pAsm->D.dst the destination and opcode.
+ *
+ * \param pAsm        assembler state
+ * \param normalized  non-zero selects SQ_TEX_NORMALIZED coordinates for all
+ *                    four components, zero selects SQ_TEX_UNNORMALIZED
+ * \return GL_TRUE on success; GL_FALSE if the destination is neither a
+ *         temporary nor an output register, if allocation fails, or if
+ *         add_tex_instruction() fails
+ */
+GLboolean assemble_tex_instruction(r700_AssemblerBase *pAsm, GLboolean normalized)
+{
+    PVSSRC *   texture_coordinate_source = &(pAsm->S[0].src);
+    PVSSRC *   texture_unit_source       = &(pAsm->S[1].src);
+    GLuint     coord_type;
+
+    R700TextureInstruction* tex_instruction_ptr;
+
+    /* Validate the destination before allocating, so that rejecting it does
+     * not leak a freshly allocated instruction. */
+    if ( (pAsm->D.dst.rtype != DST_REG_TEMPORARY) &&
+         (pAsm->D.dst.rtype != DST_REG_OUT) )
+    {
+        radeon_error("Only temp destination registers supported for TEX dest regs.\n");
+        return GL_FALSE;
+    }
+
+    tex_instruction_ptr = (R700TextureInstruction*) CALLOC_STRUCT(R700TextureInstruction);
+    if (tex_instruction_ptr == NULL)
+    {
+        return GL_FALSE;
+    }
+    Init_R700TextureInstruction(tex_instruction_ptr);
+
+    tex_instruction_ptr->m_Word0.f.tex_inst         = pAsm->D.dst.opcode;
+    tex_instruction_ptr->m_Word0.f.bc_frac_mode     = 0x0;
+    tex_instruction_ptr->m_Word0.f.fetch_whole_quad = 0x0;
+
+    tex_instruction_ptr->m_Word0.f.resource_id      = texture_unit_source->reg;
+
+    tex_instruction_ptr->m_Word1.f.lod_bias     = 0x0;
+
+    /* XXX: UNNORMALIZED tex coords have limited wrap modes */
+    coord_type = normalized ? SQ_TEX_NORMALIZED : SQ_TEX_UNNORMALIZED;
+    tex_instruction_ptr->m_Word1.f.coord_type_x = coord_type;
+    tex_instruction_ptr->m_Word1.f.coord_type_y = coord_type;
+    tex_instruction_ptr->m_Word1.f.coord_type_z = coord_type;
+    tex_instruction_ptr->m_Word1.f.coord_type_w = coord_type;
+
+    tex_instruction_ptr->m_Word2.f.offset_x   = 0x0;
+    tex_instruction_ptr->m_Word2.f.offset_y   = 0x0;
+    tex_instruction_ptr->m_Word2.f.offset_z   = 0x0;
+
+    tex_instruction_ptr->m_Word2.f.sampler_id = texture_unit_source->reg;
+
+    // src / dst registers
+    tex_instruction_ptr->m_Word0.f.src_gpr    = texture_coordinate_source->reg;
+    tex_instruction_ptr->m_Word0.f.src_rel    = SQ_ABSOLUTE;
+
+    tex_instruction_ptr->m_Word1.f.dst_gpr    = pAsm->D.dst.reg;
+    tex_instruction_ptr->m_Word1.f.dst_rel    = SQ_ABSOLUTE;
+
+    // Components that are not written are masked out.
+    tex_instruction_ptr->m_Word1.f.dst_sel_x  = (pAsm->D.dst.writex ? texture_unit_source->swizzlex : SQ_SEL_MASK);
+    tex_instruction_ptr->m_Word1.f.dst_sel_y  = (pAsm->D.dst.writey ? texture_unit_source->swizzley : SQ_SEL_MASK);
+    tex_instruction_ptr->m_Word1.f.dst_sel_z  = (pAsm->D.dst.writez ? texture_unit_source->swizzlez : SQ_SEL_MASK);
+    tex_instruction_ptr->m_Word1.f.dst_sel_w  = (pAsm->D.dst.writew ? texture_unit_source->swizzlew : SQ_SEL_MASK);
+
+    tex_instruction_ptr->m_Word2.f.src_sel_x  = texture_coordinate_source->swizzlex;
+    tex_instruction_ptr->m_Word2.f.src_sel_y  = texture_coordinate_source->swizzley;
+    tex_instruction_ptr->m_Word2.f.src_sel_z  = texture_coordinate_source->swizzlez;
+    tex_instruction_ptr->m_Word2.f.src_sel_w  = texture_coordinate_source->swizzlew;
+
+    if( GL_FALSE == add_tex_instruction(pAsm, tex_instruction_ptr) )
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+
+/* Mark every GPR read-port slot (per cycle and component) and every cfile
+ * read-port slot (address and channel) as unused by storing -1. */
+void initialize(r700_AssemblerBase *pAsm)
+{
+    GLuint uCycle;
+    GLuint uComp;
+
+    for (uCycle = 0; uCycle < NUMBER_OF_CYCLES; uCycle++)
+    {
+        for (uComp = 0; uComp < NUMBER_OF_COMPONENTS; uComp++)
+        {
+            pAsm->hw_gpr[uCycle][uComp] = (-1);
+        }
+    }
+
+    for (uComp = 0; uComp < NUMBER_OF_COMPONENTS; uComp++)
+    {
+        pAsm->hw_cfile_addr[uComp] = (-1);
+        pAsm->hw_cfile_chan[uComp] = (-1);
+    }
+}
+
+/*
+ * Encode one source operand of a scalar ALU instruction.
+ *
+ * For channel `scalar_channel_index` (0..3 = x..w) of `pSource`, this
+ * derives the select, relative-addressing, channel and negate fields and
+ * stores them into source slot `source_index` (0, 1 or 2) of
+ * `alu_instruction_ptr`.
+ *
+ * Source select encoding used here:
+ *   - a swizzle of SQ_SEL_0 / SQ_SEL_1 selects the special constants
+ *     SQ_ALU_SRC_0 / SQ_ALU_SRC_1;
+ *   - temporaries and inputs use the register number directly;
+ *   - constants use the register number plus CFILE_REGISTER_OFFSET.
+ *
+ * Returns GL_FALSE (after reporting) for an unsupported register type, an
+ * unknown swizzle value, or a source_index outside 0..2.
+ */
+GLboolean assemble_alu_src(R700ALUInstruction*  alu_instruction_ptr,
+                           int                  source_index,
+                           PVSSRC*              pSource,
+                           BITS                 scalar_channel_index)
+{
+    BITS channel_swizzle;
+    BITS src_sel;
+    BITS src_rel;
+    BITS src_chan;
+    BITS src_neg;
+
+    /* Pick the swizzle and negate bit that belong to the requested channel. */
+    switch (scalar_channel_index)
+    {
+        case 0:  channel_swizzle = pSource->swizzlex; src_neg = pSource->negx; break;
+        case 1:  channel_swizzle = pSource->swizzley; src_neg = pSource->negy; break;
+        case 2:  channel_swizzle = pSource->swizzlez; src_neg = pSource->negz; break;
+        case 3:  channel_swizzle = pSource->swizzlew; src_neg = pSource->negw; break;
+        default: channel_swizzle = SQ_SEL_MASK;       src_neg = 0;             break;
+    }
+
+    /* Source select. */
+    if (SQ_SEL_0 == channel_swizzle)
+    {
+        src_sel = SQ_ALU_SRC_0;
+    }
+    else if (SQ_SEL_1 == channel_swizzle)
+    {
+        src_sel = SQ_ALU_SRC_1;
+    }
+    else if ( (SRC_REG_TEMPORARY == pSource->rtype) ||
+              (SRC_REG_INPUT == pSource->rtype) )
+    {
+        src_sel = pSource->reg;
+    }
+    else if (SRC_REG_CONSTANT == pSource->rtype)
+    {
+        src_sel = pSource->reg + CFILE_REGISTER_OFFSET;
+    }
+    else
+    {
+        radeon_error("Source (%d) register type (%d) not one of TEMP, INPUT, or CONSTANT.\n",
+                 source_index, pSource->rtype);
+        return GL_FALSE;
+    }
+
+    /* Addressing mode. */
+    src_rel = (ADDR_ABSOLUTE == addrmode_PVSSRC(pSource)) ? SQ_ABSOLUTE : SQ_RELATIVE;
+
+    /* Source channel. */
+    if (SQ_SEL_X == channel_swizzle)
+    {
+        src_chan = SQ_CHAN_X;
+    }
+    else if (SQ_SEL_Y == channel_swizzle)
+    {
+        src_chan = SQ_CHAN_Y;
+    }
+    else if (SQ_SEL_Z == channel_swizzle)
+    {
+        src_chan = SQ_CHAN_Z;
+    }
+    else if (SQ_SEL_W == channel_swizzle)
+    {
+        src_chan = SQ_CHAN_W;
+    }
+    else if ( (SQ_SEL_0 == channel_swizzle) || (SQ_SEL_1 == channel_swizzle) )
+    {
+        /* Channel is irrelevant here because src_sel picks the constant. */
+        src_chan = SQ_CHAN_X;
+    }
+    else
+    {
+        radeon_error("Unknown source select value (%d) in assemble_alu_src().\n", channel_swizzle);
+        return GL_FALSE;
+    }
+
+    /* Store into the requested source slot; slot 2 lives in the OP3 word. */
+    switch (source_index)
+    {
+        case 0:
+            alu_instruction_ptr->m_Word0.f.src0_sel  = src_sel;
+            alu_instruction_ptr->m_Word0.f.src0_rel  = src_rel;
+            alu_instruction_ptr->m_Word0.f.src0_chan = src_chan;
+            alu_instruction_ptr->m_Word0.f.src0_neg  = src_neg;
+            return GL_TRUE;
+
+        case 1:
+            alu_instruction_ptr->m_Word0.f.src1_sel  = src_sel;
+            alu_instruction_ptr->m_Word0.f.src1_rel  = src_rel;
+            alu_instruction_ptr->m_Word0.f.src1_chan = src_chan;
+            alu_instruction_ptr->m_Word0.f.src1_neg  = src_neg;
+            return GL_TRUE;
+
+        case 2:
+            alu_instruction_ptr->m_Word1_OP3.f.src2_sel  = src_sel;
+            alu_instruction_ptr->m_Word1_OP3.f.src2_rel  = src_rel;
+            alu_instruction_ptr->m_Word1_OP3.f.src2_chan = src_chan;
+            alu_instruction_ptr->m_Word1_OP3.f.src2_neg  = src_neg;
+            return GL_TRUE;
+
+        default:
+            break;
+    }
+
+    radeon_error("Only three sources allowed in ALU opcodes.\n");
+    return GL_FALSE;
+}
+
+/*
+ * Append an ALU instruction to the shader, opening a new ALU control-flow
+ * clause first when there is no current clause or the current one cannot
+ * take `contiguous_slots_needed` more slots.
+ *
+ * Returns GL_FALSE if check_current_clause() fails or a new clause cannot
+ * be allocated, GL_TRUE otherwise.
+ */
+GLboolean add_alu_instruction(r700_AssemblerBase* pAsm,
+                              R700ALUInstruction* alu_instruction_ptr,
+                              GLuint              contiguous_slots_needed)
+{
+    GLboolean bOpenNewClause;
+
+    if (!check_current_clause(pAsm, CF_ALU_CLAUSE))
+    {
+        return GL_FALSE;
+    }
+
+    if (NULL == pAsm->cf_current_alu_clause_ptr)
+    {
+        bOpenNewClause = GL_TRUE;
+    }
+    else if (pAsm->cf_current_alu_clause_ptr->m_Word1.f.count >= (GetCFMaxInstructions(pAsm->cf_current_alu_clause_ptr->m_ShaderInstType)-contiguous_slots_needed-1) )
+    {
+        bOpenNewClause = GL_TRUE;
+    }
+    else
+    {
+        bOpenNewClause = GL_FALSE;
+    }
+
+    if (GL_FALSE == bOpenNewClause)
+    {
+        /* Room left: account for one more instruction in the current clause. */
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.count++;
+    }
+    else
+    {
+        /* Allocate a new CF instruction for this clause. */
+        pAsm->cf_current_alu_clause_ptr = (R700ControlFlowALUClause*) CALLOC_STRUCT(R700ControlFlowALUClause);
+
+        if (NULL == pAsm->cf_current_alu_clause_ptr)
+        {
+            radeon_error("Could not allocate a new ALU CF instruction.\n");
+            return GL_FALSE;
+        }
+
+        /* Link the new clause into the shader's CF list. */
+        Init_R700ControlFlowALUClause(pAsm->cf_current_alu_clause_ptr);
+        AddCFInstruction( pAsm->pR700Shader,
+                          (R700ControlFlowInstruction *)pAsm->cf_current_alu_clause_ptr );
+
+        pAsm->cf_current_alu_clause_ptr->m_Word0.f.kcache_bank0 = 0x0;
+        pAsm->cf_current_alu_clause_ptr->m_Word0.f.kcache_bank1 = 0x0;
+        pAsm->cf_current_alu_clause_ptr->m_Word0.f.kcache_mode0 = SQ_CF_KCACHE_NOP;
+
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.kcache_mode1 = SQ_CF_KCACHE_NOP;
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.kcache_addr0 = 0x0;
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.kcache_addr1 = 0x0;
+
+        /* count starts at 0 for the first instruction of the clause. */
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.count           = 0x0;
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.cf_inst         = SQ_CF_INST_ALU;
+
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.whole_quad_mode = 0x0;
+
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.barrier         = 0x1;
+    }
+
+    /* An instruction with a recorded destination dependency (nDstDep > -1)
+     * switches the whole clause to whole_quad_mode. */
+    if (pAsm->pInstDeps[pAsm->uiCurInst].nDstDep > (-1))
+    {
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.whole_quad_mode = 0x1;
+    }
+
+    /* Clause is now full: flag this instruction as the last one. */
+    if (pAsm->cf_current_alu_clause_ptr->m_Word1.f.count >= (GetCFMaxInstructions(pAsm->cf_current_alu_clause_ptr->m_ShaderInstType)-1) )
+    {
+        alu_instruction_ptr->m_Word0.f.last = 1;
+    }
+
+    /* First instruction of a clause: cross-link clause and instruction. */
+    if (NULL == pAsm->cf_current_alu_clause_ptr->m_pLinkedALUInstruction)
+    {
+        pAsm->cf_current_alu_clause_ptr->m_pLinkedALUInstruction = alu_instruction_ptr;
+        alu_instruction_ptr->m_pLinkedALUClause = pAsm->cf_current_alu_clause_ptr;
+    }
+
+    AddALUInstruction(pAsm->pR700Shader, alu_instruction_ptr);
+
+    return GL_TRUE;
+}
+
+/*
+ * Read back the select, relative, channel and negate fields of source slot
+ * `source_index` (0, 1 or 2) of an ALU instruction.  For any other index the
+ * output parameters are left untouched.
+ */
+void get_src_properties(R700ALUInstruction*  alu_instruction_ptr,
+                        int                  source_index,
+                        BITS*                psrc_sel,
+                        BITS*                psrc_rel,
+                        BITS*                psrc_chan,
+                        BITS*                psrc_neg)
+{
+    if (0 == source_index)
+    {
+        *psrc_sel  = alu_instruction_ptr->m_Word0.f.src0_sel;
+        *psrc_rel  = alu_instruction_ptr->m_Word0.f.src0_rel;
+        *psrc_chan = alu_instruction_ptr->m_Word0.f.src0_chan;
+        *psrc_neg  = alu_instruction_ptr->m_Word0.f.src0_neg;
+    }
+    else if (1 == source_index)
+    {
+        *psrc_sel  = alu_instruction_ptr->m_Word0.f.src1_sel;
+        *psrc_rel  = alu_instruction_ptr->m_Word0.f.src1_rel;
+        *psrc_chan = alu_instruction_ptr->m_Word0.f.src1_chan;
+        *psrc_neg  = alu_instruction_ptr->m_Word0.f.src1_neg;
+    }
+    else if (2 == source_index)
+    {
+        /* The third source is stored in the OP3 form of word 1. */
+        *psrc_sel  = alu_instruction_ptr->m_Word1_OP3.f.src2_sel;
+        *psrc_rel  = alu_instruction_ptr->m_Word1_OP3.f.src2_rel;
+        *psrc_chan = alu_instruction_ptr->m_Word1_OP3.f.src2_chan;
+        *psrc_neg  = alu_instruction_ptr->m_Word1_OP3.f.src2_neg;
+    }
+}
+
+/* Return 1 if `sel` lies in the range 256..511, otherwise 0. */
+int is_cfile(BITS sel)
+{
+    return (sel > 255 && sel < 512) ? 1 : 0;
+}
+
+/* Return 1 if `sel` is a cfile select (see is_cfile) or lies in the range
+ * SQ_ALU_SRC_0..SQ_ALU_SRC_LITERAL, otherwise 0. */
+int is_const(BITS sel)
+{
+    if (is_cfile(sel))
+    {
+        return 1;
+    }
+
+    return (sel >= SQ_ALU_SRC_0 && sel <= SQ_ALU_SRC_LITERAL) ? 1 : 0;
+}
+
+/* Return 1 if `sel` lies in the range 0..127, otherwise 0. */
+int is_gpr(BITS sel)
+{
+    return (sel >= 0 && sel < 128) ? 1 : 0;
+}
+
+/* Vector bank swizzle lookup, indexed by a 3-bit value (binary index shown
+ * next to each entry). */
+const GLuint BANK_SWIZZLE_VEC[8] = {
+    SQ_ALU_VEC_210,  /* 000 */
+    SQ_ALU_VEC_120,  /* 001 */
+    SQ_ALU_VEC_102,  /* 010 */
+    SQ_ALU_VEC_201,  /* 011 */
+    SQ_ALU_VEC_012,  /* 100 */
+    SQ_ALU_VEC_021,  /* 101 */
+    SQ_ALU_VEC_012,  /* 110 */
+    SQ_ALU_VEC_012   /* 111 */
+};
+
+/* Scalar bank swizzle lookup, indexed by a 3-bit value (binary index shown
+ * next to each entry). */
+const GLuint BANK_SWIZZLE_SCL[8] = {
+    SQ_ALU_SCL_210,  /* 000 */
+    SQ_ALU_SCL_122,  /* 001 */
+    SQ_ALU_SCL_122,  /* 010 */
+    SQ_ALU_SCL_221,  /* 011 */
+    SQ_ALU_SCL_212,  /* 100 */
+    SQ_ALU_SCL_122,  /* 101 */
+    SQ_ALU_SCL_122,  /* 110 */
+    SQ_ALU_SCL_122   /* 111 */
+};
+
+/*
+ * Reserve a cfile read port for constant `sel`, channel `chan`.
+ *
+ * The four entries of hw_cfile_addr / hw_cfile_chan are scanned.  If the
+ * same (sel, chan) pair is already reserved nothing changes; otherwise the
+ * lowest-numbered free entry (address < 0) is claimed.
+ *
+ * \return GL_TRUE on success, GL_FALSE (after reporting) when no entry
+ *         matches and none is free.
+ */
+GLboolean reserve_cfile(r700_AssemblerBase* pAsm,
+                        GLuint sel,
+                        GLuint chan)
+{
+    int res_match = (-1);
+    int res_empty = (-1);
+
+    GLint res;
+
+    /* Scan downwards so the lowest qualifying index is what remains. */
+    for (res=3; res>=0; res--)
+    {
+        if(pAsm->hw_cfile_addr[ res] < 0)
+        {
+            res_empty = res;
+        }
+        else if( (pAsm->hw_cfile_addr[res] == (int)sel)
+                 &&
+                 (pAsm->hw_cfile_chan[ res ] == (int) chan) )
+        {
+            res_match = res;
+        }
+    }
+
+    if(res_match >= 0)
+    {
+        // Read for this scalar component already reserved, nothing to do here.
+        ;
+    }
+    else if(res_empty >= 0)
+    {
+        pAsm->hw_cfile_addr[ res_empty ] = sel;
+        pAsm->hw_cfile_chan[ res_empty ] = chan;
+    }
+    else
+    {
+        /* The message used literal "$sel"/"$chan" text; print the values. */
+        radeon_error("All cfile read ports are used, cannot reference C%u, channel %u.\n",
+                     sel, chan);
+        return GL_FALSE;
+    }
+    return GL_TRUE;
+}
+
+/*
+ * Reserve the GPR read port for (cycle, chan) for register `sel`.
+ * Succeeds if the slot is free (negative) or already holds `sel`; otherwise
+ * reports the conflict and returns GL_FALSE.
+ */
+GLboolean reserve_gpr(r700_AssemblerBase* pAsm, GLuint sel, GLuint chan, GLuint cycle)
+{
+    if (pAsm->hw_gpr[cycle][chan] < 0)
+    {
+        /* Free slot: claim it. */
+        pAsm->hw_gpr[cycle][chan] = sel;
+        return GL_TRUE;
+    }
+
+    if (pAsm->hw_gpr[cycle][chan] == (int)sel)
+    {
+        return GL_TRUE;
+    }
+
+    radeon_error("Another scalar operation has already used GPR read port for given channel\n");
+    return GL_FALSE;
+}
+
+/*
+ * Look up the read cycle for source `sel` under scalar bank swizzle `swiz`
+ * and store it in *pCycle.  `sel` indexes a three-entry table and is not
+ * range-checked here.
+ *
+ * Returns GL_FALSE (after reporting) for an unknown swizzle value.
+ */
+GLboolean cycle_for_scalar_bank_swizzle(const int swiz, const int sel, GLuint* pCycle)
+{
+    static const int aCycles210[3] = {2, 1, 0};
+    static const int aCycles122[3] = {1, 2, 2};
+    static const int aCycles212[3] = {2, 1, 2};
+    static const int aCycles221[3] = {2, 2, 1};
+
+    const int *pTable;
+
+    switch (swiz)
+    {
+        case SQ_ALU_SCL_210:
+            pTable = aCycles210;
+            break;
+        case SQ_ALU_SCL_122:
+            pTable = aCycles122;
+            break;
+        case SQ_ALU_SCL_212:
+            pTable = aCycles212;
+            break;
+        case SQ_ALU_SCL_221:
+            pTable = aCycles221;
+            break;
+        default:
+            radeon_error("Bad Scalar bank swizzle value\n");
+            return GL_FALSE;
+    }
+
+    *pCycle = pTable[sel];
+    return GL_TRUE;
+}
+
+/*
+ * Look up the cycle value for source operand index `sel` under the vector
+ * bank swizzle `swiz` and store it in *pCycle.
+ *
+ * swiz   : one of SQ_ALU_VEC_012 / _021 / _120 / _102 / _201 / _210.
+ * sel    : source operand index, must be 0..2 (each table has 3 entries).
+ * pCycle : receives the looked-up cycle on success; untouched on failure.
+ *
+ * Returns GL_TRUE on success.  Logs an error and returns GL_FALSE when
+ * `swiz` is not a known vector swizzle or `sel` is outside 0..2.
+ */
+GLboolean cycle_for_vector_bank_swizzle(const int swiz, const int sel, GLuint* pCycle)
+{
+    static const int vec_012[3] = {0, 1, 2};
+    static const int vec_021[3] = {0, 2, 1};
+    static const int vec_120[3] = {1, 2, 0};
+    static const int vec_102[3] = {1, 0, 2};
+    static const int vec_201[3] = {2, 0, 1};
+    static const int vec_210[3] = {2, 1, 0};
+    const int *table;
+
+    switch (swiz)
+    {
+        case SQ_ALU_VEC_012:
+            table = vec_012;
+            break;
+        case SQ_ALU_VEC_021:
+            table = vec_021;
+            break;
+        case SQ_ALU_VEC_120:
+            table = vec_120;
+            break;
+        case SQ_ALU_VEC_102:
+            table = vec_102;
+            break;
+        case SQ_ALU_VEC_201:
+            table = vec_201;
+            break;
+        case SQ_ALU_VEC_210:
+            table = vec_210;
+            break;
+        default:
+            radeon_error("Bad Vec bank swizzle value\n");
+            return GL_FALSE;
+    }
+
+    /* Guard the 3-entry tables against an out-of-range operand index. */
+    if ((sel < 0) || (sel > 2))
+    {
+        radeon_error("Bad source index for Vec bank swizzle\n");
+        return GL_FALSE;
+    }
+
+    *pCycle = table[sel];
+    return GL_TRUE;
+}
+
+/*
+ * Pick the bank swizzle for a scalar ALU instruction and reserve the
+ * constant-file and GPR read ports its source operands need.
+ *
+ * Steps:
+ *   1. Fetch sel/rel/chan/neg of every source operand.
+ *   2. Build a 3-bit key (bit 2 = src0 is const, bit 1 = src1, bit 0 = src2)
+ *      and store BANK_SWIZZLE_SCL[key] into the instruction.
+ *   3. Count constant operands; reserve a cfile port for each cfile operand.
+ *   4. For each GPR operand, look up its cycle for the chosen swizzle and,
+ *      when that cycle is below the constant count, reserve the GPR port.
+ *
+ * Returns GL_FALSE as soon as any reservation or lookup fails (the callee
+ * has already logged the reason), GL_TRUE otherwise.
+ */
+GLboolean check_scalar(r700_AssemblerBase* pAsm,
+                       R700ALUInstruction* alu_instruction_ptr)
+{
+    GLuint cycle;
+    GLuint bank_swizzle;
+    GLuint const_count = 0;
+
+    BITS sel;
+    BITS chan;
+
+    GLuint src;
+
+    BITS src_sel [3] = {0,0,0};
+    BITS src_chan[3] = {0,0,0};
+    BITS src_rel [3] = {0,0,0};
+    BITS src_neg [3] = {0,0,0};
+
+    GLuint swizzle_key;
+
+    GLuint number_of_operands = r700GetNumOperands(pAsm);
+
+    for (src=0; src<number_of_operands; src++)
+    {
+        get_src_properties(alu_instruction_ptr,
+                           src,
+                           &(src_sel[src]),
+                           &(src_rel[src]),
+                           &(src_chan[src]),
+                           &(src_neg[src]) );
+    }
+
+    swizzle_key = ( (is_const( src_sel[0] ) ? 4 : 0) +
+                    (is_const( src_sel[1] ) ? 2 : 0) +
+                    (is_const( src_sel[2] ) ? 1 : 0) );
+
+    alu_instruction_ptr->m_Word1.f.bank_swizzle = BANK_SWIZZLE_SCL[ swizzle_key ];
+
+    for (src=0; src<number_of_operands; src++)
+    {
+        sel  = src_sel [src];
+        chan = src_chan[src];
+
+        if (is_const( sel ))
+        {
+            // Any constant, including literal and inline constants
+            const_count++;
+
+            if (is_cfile( sel ))
+            {
+                /* Propagate port exhaustion, as check_vector does. */
+                if( GL_FALSE == reserve_cfile(pAsm, sel, chan) )
+                {
+                    return GL_FALSE;
+                }
+            }
+        }
+    }
+
+    for (src=0; src<number_of_operands; src++)
+    {
+        sel  = src_sel [src];
+        chan = src_chan[src];
+
+        if( is_gpr(sel) )
+        {
+            bank_swizzle = alu_instruction_ptr->m_Word1.f.bank_swizzle;
+
+            if( GL_FALSE == cycle_for_scalar_bank_swizzle(bank_swizzle, src, &cycle) )
+            {
+                return GL_FALSE;
+            }
+
+            /* Only cycles below the constant count need a GPR reservation. */
+            if(cycle < const_count)
+            {
+                if( GL_FALSE == reserve_gpr(pAsm, sel, chan, cycle) )
+                {
+                    return GL_FALSE;
+                }
+            }
+        }
+    }
+
+    return GL_TRUE;
+}
+
+/*
+ * Pick the bank swizzle for a vector ALU instruction and reserve the read
+ * ports its source operands need.
+ *
+ * The swizzle comes from BANK_SWIZZLE_VEC, indexed by a 3-bit key whose bits
+ * say which of src0/src1/src2 are constants.  Each GPR operand then reserves
+ * a GPR port at the cycle given by that swizzle, except src1 when it reads
+ * the same register and channel as src0.  Each constant-file operand
+ * reserves a cfile port.
+ *
+ * Returns GL_FALSE on the first failed lookup or reservation, else GL_TRUE.
+ */
+GLboolean check_vector(r700_AssemblerBase* pAsm,
+                       R700ALUInstruction* alu_instruction_ptr)
+{
+    BITS src_sel [3] = {0,0,0};
+    BITS src_chan[3] = {0,0,0};
+    BITS src_rel [3] = {0,0,0};
+    BITS src_neg [3] = {0,0,0};
+
+    GLuint cycle;
+    GLuint bank_swizzle;
+    GLuint key;
+    GLuint i;
+    GLuint operand_count = r700GetNumOperands(pAsm);
+
+    for (i = 0; i < operand_count; i++)
+    {
+        get_src_properties(alu_instruction_ptr,
+                           i,
+                           &(src_sel[i]),
+                           &(src_rel[i]),
+                           &(src_chan[i]),
+                           &(src_neg[i]) );
+    }
+
+    key  = is_const( src_sel[0] ) ? 4 : 0;
+    key += is_const( src_sel[1] ) ? 2 : 0;
+    key += is_const( src_sel[2] ) ? 1 : 0;
+
+    alu_instruction_ptr->m_Word1.f.bank_swizzle = BANK_SWIZZLE_VEC[key];
+    bank_swizzle = alu_instruction_ptr->m_Word1.f.bank_swizzle;
+
+    for (i = 0; i < operand_count; i++)
+    {
+        BITS cur_sel  = src_sel [i];
+        BITS cur_chan = src_chan[i];
+
+        if( is_gpr(cur_sel) )
+        {
+            int same_as_src0;
+
+            if( !cycle_for_vector_bank_swizzle(bank_swizzle, i, &cycle) )
+            {
+                return GL_FALSE;
+            }
+
+            /* src1 reading exactly what src0 reads needs no port of its own. */
+            same_as_src0 = (i == 1)
+                        && (cur_sel  == src_sel[0])
+                        && (cur_chan == src_chan[0]);
+
+            if( !same_as_src0 && !reserve_gpr(pAsm, cur_sel, cur_chan, cycle) )
+            {
+                return GL_FALSE;
+            }
+        }
+        else if( is_const(cur_sel) )
+        {
+            if( is_cfile(cur_sel) && !reserve_cfile(pAsm, cur_sel, cur_chan) )
+            {
+                return GL_FALSE;
+            }
+        }
+    }
+
+    return GL_TRUE;
+}
+
+/*
+ * Emit the hardware ALU instruction(s) for the operation currently described
+ * by pAsm->D (destination/opcode) and pAsm->S[0..2] (sources).
+ *
+ * - When D.dst.math is 1 a single scalar instruction is emitted; its
+ *   destination channel is the first enabled channel of the write mask.
+ * - Otherwise four instructions are emitted, one per channel x/y/z/w.
+ *   Before that, GPR reads per channel are counted; if any channel would be
+ *   read more than 3 times, every instruction gets its `last` bit set
+ *   (bSplitInst) instead of only the fourth one.
+ * - For reduction opcodes the first emitted instruction asks
+ *   add_alu_instruction() for 4 contiguous slots.
+ *
+ * After each instruction is added, check_scalar()/check_vector() selects the
+ * bank swizzle and reserves read ports.
+ *
+ * Returns GL_TRUE on success, GL_FALSE on allocation failure, unsupported
+ * destination register type, or any failing helper.
+ *
+ * NOTE(review): the early `return GL_FALSE` paths taken before
+ * add_alu_instruction() do not release alu_instruction_ptr; ownership and
+ * the matching free routine are not visible here - confirm and fix.
+ */
+GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
+{
+    GLuint    number_of_scalar_operations;
+    GLboolean is_single_scalar_operation;
+    GLuint    scalar_channel_index;
+
+    PVSSRC * pcurrent_source;
+    int    current_source_index;
+    GLuint contiguous_slots_needed;
+
+    GLuint    uNumSrc = r700GetNumOperands(pAsm);
+    GLuint    channel_swizzle, j;
+    GLuint    chan_counter[4] = {0, 0, 0, 0};
+    PVSSRC *  pSource[3];
+    GLboolean bSplitInst = GL_FALSE;
+
+    if (1 == pAsm->D.dst.math)
+    {
+        is_single_scalar_operation = GL_TRUE;
+        number_of_scalar_operations = 1;
+    }
+    else
+    {
+        is_single_scalar_operation = GL_FALSE;
+        number_of_scalar_operations = 4;
+
+        /* check read port, only very preliminary algorithm, not count in
+           src0/1 same comp case and prev slot repeat case; also not count relative
+           addressing. TODO: improve performance. */
+        for(j=0; j<uNumSrc; j++)
+        {
+            pSource[j] = &(pAsm->S[j].src);
+        }
+        for(scalar_channel_index=0; scalar_channel_index<4; scalar_channel_index++)
+        {
+            for(j=0; j<uNumSrc; j++)
+            {
+                switch (scalar_channel_index)
+                {
+                    case 0: channel_swizzle = pSource[j]->swizzlex; break;
+                    case 1: channel_swizzle = pSource[j]->swizzley; break;
+                    case 2: channel_swizzle = pSource[j]->swizzlez; break;
+                    case 3: channel_swizzle = pSource[j]->swizzlew; break;
+                    default: channel_swizzle = SQ_SEL_MASK; break;
+                }
+                /* Count only real register reads (temp/input) of x..w. */
+                if ( ((pSource[j]->rtype == SRC_REG_TEMPORARY) ||
+                     (pSource[j]->rtype == SRC_REG_INPUT))
+                     && (channel_swizzle <= SQ_SEL_W) )
+                {
+                    chan_counter[channel_swizzle]++;
+                }
+            }
+        }
+        if(   (chan_counter[SQ_SEL_X] > 3)
+           || (chan_counter[SQ_SEL_Y] > 3)
+           || (chan_counter[SQ_SEL_Z] > 3)
+           || (chan_counter[SQ_SEL_W] > 3) ) /* each chan bank has only 3 ports. */
+        {
+            bSplitInst = GL_TRUE;
+        }
+    }
+
+    contiguous_slots_needed = 0;
+
+    if(GL_TRUE == is_reduction_opcode(&(pAsm->D)) )
+    {
+        contiguous_slots_needed = 4;
+    }
+
+    initialize(pAsm);
+
+    for (scalar_channel_index=0;
+            scalar_channel_index < number_of_scalar_operations;
+                scalar_channel_index++)
+    {
+        R700ALUInstruction* alu_instruction_ptr = (R700ALUInstruction*) CALLOC_STRUCT(R700ALUInstruction);
+        if (alu_instruction_ptr == NULL)
+        {
+            return GL_FALSE;
+        }
+        Init_R700ALUInstruction(alu_instruction_ptr);
+
+        //src 0
+        current_source_index = 0;
+        pcurrent_source = &(pAsm->S[0].src);
+
+        if (GL_FALSE == assemble_alu_src(alu_instruction_ptr,
+                                         current_source_index,
+                                         pcurrent_source,
+                                         scalar_channel_index) )
+        {
+            return GL_FALSE;
+        }
+
+        if (pAsm->D.dst.math == 0)
+        {
+            // Process source 1
+            current_source_index = 1;
+            pcurrent_source = &(pAsm->S[current_source_index].src);
+
+            if (GL_FALSE == assemble_alu_src(alu_instruction_ptr,
+                                             current_source_index,
+                                             pcurrent_source,
+                                             scalar_channel_index) )
+            {
+                return GL_FALSE;
+            }
+        }
+
+        //other bits
+        alu_instruction_ptr->m_Word0.f.index_mode = SQ_INDEX_LOOP;
+
+        if(   (is_single_scalar_operation == GL_TRUE)
+           || (GL_TRUE == bSplitInst) )
+        {
+            alu_instruction_ptr->m_Word0.f.last = 1;
+        }
+        else
+        {
+            alu_instruction_ptr->m_Word0.f.last = (scalar_channel_index == 3) ?  1 : 0;
+        }
+
+        alu_instruction_ptr->m_Word0.f.pred_sel                = 0x0;
+        alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x0;
+        alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x0;
+
+        // dst
+        if( (pAsm->D.dst.rtype == DST_REG_TEMPORARY) ||
+            (pAsm->D.dst.rtype == DST_REG_OUT) )
+        {
+            alu_instruction_ptr->m_Word1.f.dst_gpr  = pAsm->D.dst.reg;
+        }
+        else
+        {
+            radeon_error("Only temp destination registers supported for ALU dest regs.\n");
+            return GL_FALSE;
+        }
+
+        alu_instruction_ptr->m_Word1.f.dst_rel  = SQ_ABSOLUTE;  //D.rtype
+
+        if ( is_single_scalar_operation == GL_TRUE )
+        {
+            // Override scalar_channel_index since only one scalar value will be written
+            if(pAsm->D.dst.writex)
+            {
+                scalar_channel_index = 0;
+            }
+            else if(pAsm->D.dst.writey)
+            {
+                scalar_channel_index = 1;
+            }
+            else if(pAsm->D.dst.writez)
+            {
+                scalar_channel_index = 2;
+            }
+            else if(pAsm->D.dst.writew)
+            {
+                scalar_channel_index = 3;
+            }
+        }
+
+        alu_instruction_ptr->m_Word1.f.dst_chan = scalar_channel_index;
+
+        alu_instruction_ptr->m_Word1.f.clamp    = pAsm->pILInst[pAsm->uiCurInst].SaturateMode;
+
+        if (pAsm->D.dst.op3)
+        {
+            //op3
+
+            alu_instruction_ptr->m_Word1_OP3.f.alu_inst = pAsm->D.dst.opcode;
+
+            //There's 3rd src for op3
+            current_source_index = 2;
+            pcurrent_source = &(pAsm->S[current_source_index].src);
+
+            if ( GL_FALSE == assemble_alu_src(alu_instruction_ptr,
+                                              current_source_index,
+                                              pcurrent_source,
+                                              scalar_channel_index) )
+            {
+                return GL_FALSE;
+            }
+        }
+        else
+        {
+            //op2
+            if (pAsm->bR6xx)
+            {
+                /* R6xx uses the f6 layout of the OP2 word. */
+                alu_instruction_ptr->m_Word1_OP2.f6.alu_inst           = pAsm->D.dst.opcode;
+
+                alu_instruction_ptr->m_Word1_OP2.f6.src0_abs           = 0x0;
+                alu_instruction_ptr->m_Word1_OP2.f6.src1_abs           = 0x0;
+
+                switch (scalar_channel_index)
+                {
+                    case 0:
+                        alu_instruction_ptr->m_Word1_OP2.f6.write_mask = pAsm->D.dst.writex;
+                        break;
+                    case 1:
+                        alu_instruction_ptr->m_Word1_OP2.f6.write_mask = pAsm->D.dst.writey;
+                        break;
+                    case 2:
+                        alu_instruction_ptr->m_Word1_OP2.f6.write_mask = pAsm->D.dst.writez;
+                        break;
+                    case 3:
+                        alu_instruction_ptr->m_Word1_OP2.f6.write_mask = pAsm->D.dst.writew;
+                        break;
+                    default:
+                        alu_instruction_ptr->m_Word1_OP2.f6.write_mask = 1; //SQ_SEL_MASK;
+                        break;
+                }
+                alu_instruction_ptr->m_Word1_OP2.f6.omod               = SQ_ALU_OMOD_OFF;
+            }
+            else
+            {
+                alu_instruction_ptr->m_Word1_OP2.f.alu_inst           = pAsm->D.dst.opcode;
+
+                alu_instruction_ptr->m_Word1_OP2.f.src0_abs           = 0x0;
+                alu_instruction_ptr->m_Word1_OP2.f.src1_abs           = 0x0;
+
+                switch (scalar_channel_index)
+                {
+                    case 0:
+                        alu_instruction_ptr->m_Word1_OP2.f.write_mask = pAsm->D.dst.writex;
+                        break;
+                    case 1:
+                        alu_instruction_ptr->m_Word1_OP2.f.write_mask = pAsm->D.dst.writey;
+                        break;
+                    case 2:
+                        alu_instruction_ptr->m_Word1_OP2.f.write_mask = pAsm->D.dst.writez;
+                        break;
+                    case 3:
+                        alu_instruction_ptr->m_Word1_OP2.f.write_mask = pAsm->D.dst.writew;
+                        break;
+                    default:
+                        alu_instruction_ptr->m_Word1_OP2.f.write_mask = 1; //SQ_SEL_MASK;
+                        break;
+                }
+                alu_instruction_ptr->m_Word1_OP2.f.omod               = SQ_ALU_OMOD_OFF;
+            }
+        }
+
+        if(GL_FALSE == add_alu_instruction(pAsm, alu_instruction_ptr, contiguous_slots_needed) )
+        {
+            return GL_FALSE;
+        }
+
+        /*
+         * Judge the type of current instruction, is it vector or scalar
+         * instruction.
+         */
+        if (is_single_scalar_operation)
+        {
+            if(GL_FALSE == check_scalar(pAsm, alu_instruction_ptr) )
+            {
+                return GL_FALSE;
+            }
+        }
+        else
+        {
+            if(GL_FALSE == check_vector(pAsm, alu_instruction_ptr) )
+            {
+                /* Was `return 1` (== GL_TRUE), which reported success. */
+                return GL_FALSE;
+            }
+        }
+
+        /* Only the first emitted instruction requests contiguous slots. */
+        contiguous_slots_needed = 0;
+    }
+
+    return GL_TRUE;
+}
+
+/*
+ * Emit the instruction currently staged in pAsm->D / pAsm->S[] and clear the
+ * staging state for the next one.
+ *
+ * A texture instruction is assembled when the current IL opcode is a TEX
+ * opcode and the staged hardware opcode is not MOV; the second argument to
+ * assemble_tex_instruction() is GL_FALSE for TEXTURE_RECT_INDEX targets and
+ * GL_TRUE otherwise.  Everything else goes through the ALU path.
+ *
+ * When the destination is an export register, the per-output write mask in
+ * pucOutMask is updated: OP3 forces all four channels, OP2 ORs in the IL
+ * write mask.
+ *
+ * Returns GL_FALSE (after logging) if assembling fails, GL_TRUE otherwise.
+ */
+GLboolean next_ins(r700_AssemblerBase *pAsm)
+{
+    struct prog_instruction *pCurInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+
+    if( (GL_TRUE == IsTex(pCurInst->Opcode)) &&
+        /* handle const moves to temp register */
+        (pAsm->D.dst.opcode != SQ_OP2_INST_MOV) )
+    {
+        if( GL_FALSE == assemble_tex_instruction(pAsm,
+                            (pCurInst->TexSrcTarget == TEXTURE_RECT_INDEX) ? GL_FALSE : GL_TRUE) )
+        {
+            radeon_error("Error assembling TEX instruction\n");
+            return GL_FALSE;
+        }
+    }
+    else if( GL_FALSE == assemble_alu_instruction(pAsm) )
+    {
+        radeon_error("Error assembling ALU instruction\n");
+        return GL_FALSE;
+    }
+
+    if(DST_REG_OUT == pAsm->D.dst.rtype)
+    {
+        if(!pAsm->D.dst.op3)
+        {
+            pAsm->pucOutMask[pAsm->D.dst.reg - pAsm->starting_export_register_number]
+               |= (unsigned char)pCurInst->DstReg.WriteMask;
+        }
+        else
+        {
+            // There is no mask for OP3 instructions, so all channels are written
+            pAsm->pucOutMask[pAsm->D.dst.reg - pAsm->starting_export_register_number] = 0xF;
+        }
+    }
+
+    /* Clear destination and all three source descriptors. */
+    pAsm->D.bits    = 0;
+    pAsm->S[0].bits = 0;
+    pAsm->S[1].bits = 0;
+    pAsm->S[2].bits = 0;
+
+    return GL_TRUE;
+}
+
+/*
+ * Assemble a one-operand scalar math instruction in two steps:
+ *
+ *     opcode  tmp.x, a.x      (D.dst.math = 1, result in helper register)
+ *     MOV     dst,   tmp.x    (replicate tmp.x to the IL destination)
+ *
+ * pAsm   : assembler state; the current IL instruction supplies src and dst.
+ * opcode : hardware OP2 opcode to use for the first step.
+ *
+ * Returns GL_FALSE if any of assemble_src/assemble_dst/next_ins fails.
+ */
+GLboolean assemble_math_function(r700_AssemblerBase* pAsm, BITS opcode)
+{
+    BITS tmp;
+
+    checkop1(pAsm);
+
+    tmp = gethelpr(pAsm);
+
+    // opcode  tmp.x,    a.x
+    // MOV     dst,      tmp.x
+
+    pAsm->D.dst.opcode = opcode;
+    pAsm->D.dst.math = 1;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg    = tmp;
+    pAsm->D.dst.writex = 1;
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if ( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    // Now replicate result to all necessary channels in destination
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    /* Source operand: use the SRC_ register-type enum, not the DST_ one. */
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+
+/*
+ * ABS dst, a  is assembled as  MAX dst, a, a'  where a' is a copy of the
+ * first source descriptor with its negate state flipped.
+ */
+GLboolean assemble_ABS(r700_AssemblerBase *pAsm)
+{
+    checkop1(pAsm);
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MAX;
+
+    if( !assemble_dst(pAsm) || !assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    /* Second operand: same source, opposite sign. */
+    pAsm->S[1].bits = pAsm->S[0].bits;
+    flipneg_PVSSRC(&(pAsm->S[1].src));
+
+    if( !next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+    return GL_TRUE;
+}
+
+/*
+ * Assemble ADD, and SUB as ADD with the second operand's negate flipped.
+ * Returns GL_FALSE if operand checking or any assembling step fails.
+ */
+GLboolean assemble_ADD(r700_AssemblerBase *pAsm)
+{
+    if( !checkop2(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_ADD;
+
+    if(   !assemble_dst(pAsm)
+       || !assemble_src(pAsm, 0, -1)
+       || !assemble_src(pAsm, 1, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    /* a - b  ==  a + (-b) */
+    if(OPCODE_SUB == pAsm->pILInst[pAsm->uiCurInst].Opcode)
+    {
+        flipneg_PVSSRC(&(pAsm->S[1].src));
+    }
+
+    if( !next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+    return GL_TRUE;
+}
+
+/*
+ * Report an IL opcode that has no hardware translation yet.
+ * Logs the opcode name and always returns GL_FALSE.
+ */
+GLboolean assemble_BAD(char *opcode_str)
+{
+    const GLboolean result = GL_FALSE;
+
+    radeon_error("Not yet implemented instruction (%s)\n", opcode_str);
+
+    return result;
+}
+
+/*
+ * Assemble CMP using the OP3 instruction CNDGE.
+ *
+ * Hardware operand order differs from the IL: IL src0 -> S[0],
+ * IL src2 -> S[1], IL src1 -> S[2].
+ *
+ * OP3 has no write mask, so when the IL write mask is not 0xF the result
+ * goes to a helper register first and is then MOVed to the real
+ * destination (which does honour the mask).
+ */
+GLboolean assemble_CMP(r700_AssemblerBase *pAsm)
+{
+    int helper = (-1);
+
+    if( !checkop3(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP3_INST_CNDGE;
+    pAsm->D.dst.op3     = 1;
+
+    if(0xF == pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask)
+    {
+        /* Full write: target the real destination directly. */
+        if( !assemble_dst(pAsm) )
+        {
+            return GL_FALSE;
+        }
+    }
+    else
+    {
+        //OP3 has no support for write mask
+        helper = gethelpr(pAsm);
+
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+        pAsm->D.dst.reg   = helper;
+
+        nomask_PVSDST(&(pAsm->D.dst));
+    }
+
+    if(   !assemble_src(pAsm, 0, -1)
+       || !assemble_src(pAsm, 2, 1)
+       || !assemble_src(pAsm, 1, 2) )
+    {
+        return GL_FALSE;
+    }
+
+    if( !next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    if(0xF == pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask)
+    {
+        return GL_TRUE;
+    }
+
+    /* Partial write: copy helper -> dst through a masked MOV. */
+    if( !assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = helper;
+
+    noneg_PVSSRC(&(pAsm->S[0].src));
+    noswizzle_PVSSRC(&(pAsm->S[0].src));
+
+    if( !next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+    return GL_TRUE;
+}
+
+/* COS: scalar math function using the hardware COS opcode. */
+GLboolean assemble_COS(r700_AssemblerBase *pAsm)
+{
+    const BITS hw_opcode = SQ_OP2_INST_COS;
+
+    return assemble_math_function(pAsm, hw_opcode);
+}
+ 
+/*
+ * Assemble the dot-product family on top of the hardware DOT4 opcode.
+ *
+ *   DP3: both operands get their 4th component forced to 0, so the w term
+ *        drops out of the 4-component dot product.
+ *   DPH: the first operand's 4th component is forced to 1, giving
+ *        a.x*b.x + a.y*b.y + a.z*b.z + b.w  (the ARB program DPH definition).
+ *   Any other opcode routed here uses the operands unmodified.
+ *
+ * Returns GL_FALSE if operand checking or any assembling step fails.
+ */
+GLboolean assemble_DOT(r700_AssemblerBase *pAsm)
+{
+    if( GL_FALSE == checkop2(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_DOT4;
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 1, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if(OPCODE_DP3 == pAsm->pILInst[pAsm->uiCurInst].Opcode)
+    {
+        zerocomp_PVSSRC(&(pAsm->S[0].src), 3);
+        zerocomp_PVSSRC(&(pAsm->S[1].src), 3);
+    }
+    else if(pAsm->pILInst[pAsm->uiCurInst].Opcode == OPCODE_DPH)
+    {
+        /* Homogeneous term is 1 * b.w, so it is a.w that must become 1. */
+        onecomp_PVSSRC(&(pAsm->S[0].src), 3);
+    }
+
+    if ( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+/*
+ * Assemble DST (distance vector) as a single MUL with selected components
+ * of each operand forced to 1:
+ *
+ *     S[0] : components 0 and 3 forced to 1
+ *     S[1] : components 0 and 2 forced to 1
+ */
+GLboolean assemble_DST(r700_AssemblerBase *pAsm)
+{
+    if( !checkop2(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
+
+    if(   !assemble_dst(pAsm)
+       || !assemble_src(pAsm, 0, -1)
+       || !assemble_src(pAsm, 1, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    /* First operand. */
+    onecomp_PVSSRC(&(pAsm->S[0].src), 0);
+    onecomp_PVSSRC(&(pAsm->S[0].src), 3);
+
+    /* Second operand. */
+    onecomp_PVSSRC(&(pAsm->S[1].src), 0);
+    onecomp_PVSSRC(&(pAsm->S[1].src), 2);
+
+    if( !next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+    return GL_TRUE;
+}
+
+/* EX2: scalar math function using the hardware EXP_IEEE opcode. */
+GLboolean assemble_EX2(r700_AssemblerBase *pAsm)
+{
+    const BITS hw_opcode = SQ_OP2_INST_EXP_IEEE;
+
+    return assemble_math_function(pAsm, hw_opcode);
+}
+ 
+/* FLR: one-operand vector instruction using the hardware FLOOR opcode. */
+GLboolean assemble_FLR(r700_AssemblerBase *pAsm)
+{
+    checkop1(pAsm);
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_FLOOR;
+
+    if( !assemble_dst(pAsm) || !assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( !next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+    return GL_TRUE;
+}
+
+/* Scalar math function using the hardware FLT_TO_INT opcode. */
+GLboolean assemble_FLR_INT(r700_AssemblerBase *pAsm)
+{
+    const BITS hw_opcode = SQ_OP2_INST_FLT_TO_INT;
+
+    return assemble_math_function(pAsm, hw_opcode);
+}
+
+/* FRC: one-operand vector instruction using the hardware FRACT opcode. */
+GLboolean assemble_FRC(r700_AssemblerBase *pAsm)
+{
+    checkop1(pAsm);
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_FRACT;
+
+    if( !assemble_dst(pAsm) || !assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( !next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+    return GL_TRUE;
+}
+ 
+/*
+ * Assemble KIL with the hardware KILLGT opcode.
+ *
+ * All destination write enables are cleared.  The first hardware operand is
+ * temporary register 0 with every channel swizzled to SQ_SEL_0 (presumably
+ * the constant 0.0 - confirm against the SQ_SEL definitions).  The second
+ * operand is the register named by the IL instruction's DstReg: a temporary
+ * (offset by starting_temp_register_number) or, otherwise, the entry of
+ * uiFP_OutputMap for that index.
+ *
+ * On success the shader is flagged with killIsUsed.
+ */
+GLboolean assemble_KIL(r700_AssemblerBase *pAsm)
+{
+    checkop1(pAsm);
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_KILLGT;
+
+    if( !assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    /* KILL writes no result channel. */
+    pAsm->D.dst.writex = 0;
+    pAsm->D.dst.writey = 0;
+    pAsm->D.dst.writez = 0;
+    pAsm->D.dst.writew = 0;
+
+    /* Operand 0 */
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg = 0;
+
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_0);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    /* Operand 1: the register the IL instruction names as its "dst". */
+    pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
+
+    if(PROGRAM_TEMPORARY != pAsm->pILInst[pAsm->uiCurInst].DstReg.File)
+    {   //PROGRAM_OUTPUT
+        pAsm->S[1].src.reg = pAsm->uiFP_OutputMap[pAsm->pILInst[pAsm->uiCurInst].DstReg.Index];
+    }
+    else
+    {
+        pAsm->S[1].src.reg = pAsm->pILInst[pAsm->uiCurInst].DstReg.Index + pAsm->starting_temp_register_number;
+    }
+
+    setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
+    noswizzle_PVSSRC(&(pAsm->S[1].src));
+
+    if( !next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->pR700Shader->killIsUsed = GL_TRUE;
+
+    return GL_TRUE;
+}
+
+/* LG2: scalar math function using the hardware LOG_IEEE opcode. */
+GLboolean assemble_LG2(r700_AssemblerBase *pAsm)
+{
+    const BITS hw_opcode = SQ_OP2_INST_LOG_IEEE;
+
+    return assemble_math_function(pAsm, hw_opcode);
+}
+
+/*
+ * Assemble LRP (linear interpolation) as three hardware instructions that
+ * share one helper register:
+ *
+ *     ADD     tmp, src1, -src2
+ *     MULADD  tmp, tmp, src0, src2
+ *     MOV     dst, tmp
+ *
+ * The final MOV exists because the OP3 MULADD writes all channels of tmp;
+ * the MOV goes through assemble_dst() and so targets the real destination.
+ *
+ * NOTE(review): the first step calls neg_PVSSRC() where assemble_ADD/ABS use
+ * flipneg_PVSSRC(); if neg_PVSSRC() forces rather than toggles negation, an
+ * already-negated src2 would be handled differently - confirm.
+ */
+GLboolean assemble_LRP(r700_AssemblerBase *pAsm)
+{
+    BITS helper;
+
+    if( !checkop3(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    helper = gethelpr(pAsm);
+
+    /* Step 1: ADD helper, src1, -src2 */
+    pAsm->D.dst.opcode = SQ_OP2_INST_ADD;
+
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg   = helper;
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    nomask_PVSDST(&(pAsm->D.dst));
+
+    if( !assemble_src(pAsm, 1, 0) || !assemble_src(pAsm, 2, 1) )
+    {
+        return GL_FALSE;
+    }
+
+    neg_PVSSRC(&(pAsm->S[1].src));
+
+    if( !next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    /* Step 2: MULADD helper, helper, src0, src2 */
+    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    pAsm->D.dst.op3    = 1;
+
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg = helper;
+    nomask_PVSDST(&(pAsm->D.dst));
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg = helper;
+    noswizzle_PVSSRC(&(pAsm->S[0].src));
+
+    if( !assemble_src(pAsm, 0, 1) || !assemble_src(pAsm, 2, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( !next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    /* Step 3: MOV dst, helper */
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    if( !assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg = helper;
+    noswizzle_PVSSRC(&(pAsm->S[0].src));
+
+    if( !next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+    return GL_TRUE;
+}
+
+/*
+ * Assemble MAD with the OP3 instruction MULADD.
+ *
+ * The result is routed through a helper register, followed by a MOV to the
+ * real destination, in two situations:
+ *   - the destination is a temporary that is also one of the three source
+ *     temporaries (same Index), or
+ *   - the IL write mask is not 0xF (OP3 has no write mask).
+ * Otherwise MULADD writes the destination directly.
+ */
+GLboolean assemble_MAD(struct r700_AssemblerBase *pAsm)
+{
+    int helper = (-1);
+    int srcIdx;
+    GLboolean bViaHelper = GL_FALSE;
+    struct prog_instruction *pCurInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+
+    if( !checkop3(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    pAsm->D.dst.op3     = 1;
+
+    if(PROGRAM_TEMPORARY == pCurInst->DstReg.File)
+    {   /* TODO : more investigation on MAD src and dst using same register */
+        for(srcIdx = 0; (srcIdx < 3) && (GL_FALSE == bViaHelper); srcIdx++)
+        {
+            if(   (PROGRAM_TEMPORARY == pCurInst->SrcReg[srcIdx].File)
+               && (pCurInst->DstReg.Index == pCurInst->SrcReg[srcIdx].Index) )
+            {
+                bViaHelper = GL_TRUE;
+            }
+        }
+    }
+    if(0xF != pCurInst->DstReg.WriteMask)
+    {   /* OP3 has no support for write mask */
+        bViaHelper = GL_TRUE;
+    }
+
+    if(GL_TRUE != bViaHelper)
+    {
+        if( !assemble_dst(pAsm) )
+        {
+            return GL_FALSE;
+        }
+    }
+    else
+    {
+        helper = gethelpr(pAsm);
+
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+        pAsm->D.dst.reg   = helper;
+
+        nomask_PVSDST(&(pAsm->D.dst));
+    }
+
+    if(   !assemble_src(pAsm, 0, -1)
+       || !assemble_src(pAsm, 1, -1)
+       || !assemble_src(pAsm, 2, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( !next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    if(GL_TRUE != bViaHelper)
+    {
+        return GL_TRUE;
+    }
+
+    /* Copy the helper register into the real (possibly masked) destination. */
+    if( !assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = helper;
+
+    noneg_PVSSRC(&(pAsm->S[0].src));
+    noswizzle_PVSSRC(&(pAsm->S[0].src));
+
+    if( !next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+    return GL_TRUE;
+}
+
+/**
+ * Assemble the IL instruction "LIT dst, src" as a fixed sequence of ALU ops.
+ *
+ * Emitted sequence:
+ *   MOV          dst.xw = 1.0
+ *   MAX          dst.y  = max(src.x, 0.0)
+ *   LOG_CLAMPED  dst.z  = log(src.y)              (scratch value)
+ *   MUL_LIT      tmp.x  = MUL_LIT(src.w, dst.z, src.x)
+ *   EXP_IEEE     dst.z  = exp(tmp.x)
+ *
+ * The log intermediate is parked in dst.z because z is the only destination
+ * channel that a later step rewrites.  Parking it in x or w would clobber the
+ * 1.0 stored by the first MOV and leave a wrong value in the final result.
+ *
+ * NOTE(review): the return value of checkop1() is not examined here, unlike
+ * checkop2() in the two-operand assemblers; confirm whether it can fail.
+ *
+ * Returns GL_FALSE as soon as any step fails, GL_TRUE otherwise.
+ */
+GLboolean assemble_LIT(r700_AssemblerBase *pAsm)
+{
+    unsigned int dstReg;
+    unsigned int dstType;
+    unsigned int srcReg;
+    unsigned int srcType;
+    int tmp;
+
+    checkop1(pAsm);
+    tmp = gethelpr(pAsm);
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    /* Remember the decoded operands; D and S[] are rewritten for every step. */
+    dstReg  = pAsm->D.dst.reg;
+    dstType = pAsm->D.dst.rtype;
+    srcReg  = pAsm->S[0].src.reg;
+    srcType = pAsm->S[0].src.rtype;
+
+    /* dst.xw <- 1.0 (source register is irrelevant, every swizzle selects 1) */
+    pAsm->D.dst.opcode   = SQ_OP2_INST_MOV;
+    pAsm->D.dst.rtype    = dstType;
+    pAsm->D.dst.reg      = dstReg;
+    pAsm->D.dst.writex   = 1;
+    pAsm->D.dst.writey   = 0;
+    pAsm->D.dst.writez   = 0;
+    pAsm->D.dst.writew   = 1;
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+    pAsm->S[0].src.swizzlex = SQ_SEL_1;
+    pAsm->S[0].src.swizzley = SQ_SEL_1;
+    pAsm->S[0].src.swizzlez = SQ_SEL_1;
+    pAsm->S[0].src.swizzlew = SQ_SEL_1;
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    /* dst.y = max(src.x, 0.0) (second operand selects constant 0) */
+    pAsm->D.dst.opcode   = SQ_OP2_INST_MAX;
+    pAsm->D.dst.rtype    = dstType;
+    pAsm->D.dst.reg      = dstReg;
+    pAsm->D.dst.writex   = 0;
+    pAsm->D.dst.writey   = 1;
+    pAsm->D.dst.writez   = 0;
+    pAsm->D.dst.writew   = 0;
+    pAsm->S[0].src.rtype = srcType;
+    pAsm->S[0].src.reg   = srcReg;
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+    pAsm->S[0].src.swizzlex = SQ_SEL_X;
+    pAsm->S[0].src.swizzley = SQ_SEL_X;
+    pAsm->S[0].src.swizzlez = SQ_SEL_X;
+    pAsm->S[0].src.swizzlew = SQ_SEL_X;
+    pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[1].src.reg   = tmp;
+    setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[1].src));
+    pAsm->S[1].src.swizzlex = SQ_SEL_0;
+    pAsm->S[1].src.swizzley = SQ_SEL_0;
+    pAsm->S[1].src.swizzlez = SQ_SEL_0;
+    pAsm->S[1].src.swizzlew = SQ_SEL_0;
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    /* dst.z = log(src.y)
+     * Scratch value only: z is rewritten by the final EXP, so x and w keep
+     * the 1.0 stored above.
+     */
+    pAsm->D.dst.opcode   = SQ_OP2_INST_LOG_CLAMPED;
+    pAsm->D.dst.math     = 1;
+    pAsm->D.dst.rtype    = dstType;
+    pAsm->D.dst.reg      = dstReg;
+    pAsm->D.dst.writex   = 0;
+    pAsm->D.dst.writey   = 0;
+    pAsm->D.dst.writez   = 1;
+    pAsm->D.dst.writew   = 0;
+    pAsm->S[0].src.rtype = srcType;
+    pAsm->S[0].src.reg   = srcReg;
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+    pAsm->S[0].src.swizzlex = SQ_SEL_Y;
+    pAsm->S[0].src.swizzley = SQ_SEL_Y;
+    pAsm->S[0].src.swizzlez = SQ_SEL_Y;
+    pAsm->S[0].src.swizzlew = SQ_SEL_Y;
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    /* tmp.x = amd MUL_LIT(src.w, dst.z, src.x) */
+    pAsm->D.dst.opcode   = SQ_OP3_INST_MUL_LIT;
+    pAsm->D.dst.op3      = 1;
+    pAsm->D.dst.rtype    = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg      = tmp;
+    pAsm->D.dst.writex   = 1;
+    pAsm->D.dst.writey   = 0;
+    pAsm->D.dst.writez   = 0;
+    pAsm->D.dst.writew   = 0;
+
+    pAsm->S[0].src.rtype = srcType;
+    pAsm->S[0].src.reg   = srcReg;
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+    pAsm->S[0].src.swizzlex = SQ_SEL_W;
+    pAsm->S[0].src.swizzley = SQ_SEL_W;
+    pAsm->S[0].src.swizzlez = SQ_SEL_W;
+    pAsm->S[0].src.swizzlew = SQ_SEL_W;
+
+    /* Read the log back from the destination register's z channel. */
+    pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[1].src.reg   = dstReg;
+    setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[1].src));
+    pAsm->S[1].src.swizzlex = SQ_SEL_Z;
+    pAsm->S[1].src.swizzley = SQ_SEL_Z;
+    pAsm->S[1].src.swizzlez = SQ_SEL_Z;
+    pAsm->S[1].src.swizzlew = SQ_SEL_Z;
+
+    pAsm->S[2].src.rtype = srcType;
+    pAsm->S[2].src.reg   = srcReg;
+    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[2].src));
+    pAsm->S[2].src.swizzlex = SQ_SEL_X;
+    pAsm->S[2].src.swizzley = SQ_SEL_X;
+    pAsm->S[2].src.swizzlez = SQ_SEL_X;
+    pAsm->S[2].src.swizzlew = SQ_SEL_X;
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    /* dst.z = exp(tmp.x) */
+    pAsm->D.dst.opcode   = SQ_OP2_INST_EXP_IEEE;
+    pAsm->D.dst.math     = 1;
+    pAsm->D.dst.rtype    = dstType;
+    pAsm->D.dst.reg      = dstReg;
+    pAsm->D.dst.writex   = 0;
+    pAsm->D.dst.writey   = 0;
+    pAsm->D.dst.writez   = 1;
+    pAsm->D.dst.writew   = 0;
+
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+    pAsm->S[0].src.swizzlex = SQ_SEL_X;
+    pAsm->S[0].src.swizzley = SQ_SEL_X;
+    pAsm->S[0].src.swizzlez = SQ_SEL_X;
+    pAsm->S[0].src.swizzlew = SQ_SEL_X;
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+/*
+ * MAX dst, src0, src1
+ * Selects SQ_OP2_INST_MAX, decodes the destination and both sources of the
+ * current IL instruction, then calls next_ins().  Any failing step yields
+ * GL_FALSE.
+ */
+GLboolean assemble_MAX(r700_AssemblerBase *pAsm)
+{
+    if (GL_FALSE == checkop2(pAsm))
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MAX;
+
+    /* Short-circuit evaluation keeps the order: dst, src0, src1, next_ins. */
+    if (GL_FALSE == assemble_dst(pAsm)
+        || GL_FALSE == assemble_src(pAsm, 0, -1)
+        || GL_FALSE == assemble_src(pAsm, 1, -1)
+        || GL_FALSE == next_ins(pAsm))
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+/*
+ * MIN dst, src0, src1
+ * Selects SQ_OP2_INST_MIN, decodes the destination and both sources of the
+ * current IL instruction, then calls next_ins().  Any failing step yields
+ * GL_FALSE.
+ */
+GLboolean assemble_MIN(r700_AssemblerBase *pAsm)
+{
+    if (GL_FALSE == checkop2(pAsm))
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MIN;
+
+    /* Short-circuit evaluation keeps the order: dst, src0, src1, next_ins. */
+    if (GL_FALSE == assemble_dst(pAsm)
+        || GL_FALSE == assemble_src(pAsm, 0, -1)
+        || GL_FALSE == assemble_src(pAsm, 1, -1)
+        || GL_FALSE == next_ins(pAsm))
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+/*
+ * MOV dst, src
+ * Selects SQ_OP2_INST_MOV, decodes the destination and source 0 of the
+ * current IL instruction, then calls next_ins().  The result of checkop1()
+ * is not examined.
+ */
+GLboolean assemble_MOV(r700_AssemblerBase *pAsm)
+{
+    checkop1(pAsm);
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    /* Short-circuit evaluation keeps the order: dst, src0, next_ins. */
+    if (GL_FALSE == assemble_dst(pAsm)
+        || GL_FALSE == assemble_src(pAsm, 0, -1)
+        || GL_FALSE == next_ins(pAsm))
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+/*
+ * MUL dst, src0, src1
+ * Selects SQ_OP2_INST_MUL, decodes the destination and both sources of the
+ * current IL instruction, then calls next_ins().  Any failing step yields
+ * GL_FALSE.
+ */
+GLboolean assemble_MUL(r700_AssemblerBase *pAsm)
+{
+    if (GL_FALSE == checkop2(pAsm))
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
+
+    /* Short-circuit evaluation keeps the order: dst, src0, src1, next_ins. */
+    if (GL_FALSE == assemble_dst(pAsm)
+        || GL_FALSE == assemble_src(pAsm, 0, -1)
+        || GL_FALSE == assemble_src(pAsm, 1, -1)
+        || GL_FALSE == next_ins(pAsm))
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+/**
+ * Assemble "POW dst, a, b" as exp2(log2(a) * b) through a helper register.
+ *
+ * Emitted sequence:
+ *   LOG_IEEE  tmp = log(a)
+ *   MUL       tmp = tmp.x * b
+ *   EXP_IEEE  tmp = exp(tmp.x)
+ *   MOV       dst = tmp.x      (replicated to the channels dst selects)
+ *
+ * NOTE(review): checkop1() is called although two IL sources are read, and
+ * its result is not examined; confirm against checkop2().
+ *
+ * Returns GL_FALSE as soon as any step fails, GL_TRUE otherwise.
+ */
+GLboolean assemble_POW(r700_AssemblerBase *pAsm)
+{
+    BITS tmp;
+
+    checkop1(pAsm);
+
+    tmp = gethelpr(pAsm);
+
+    // LG2 tmp.x,     a.swizzle
+    pAsm->D.dst.opcode = SQ_OP2_INST_LOG_IEEE;
+    pAsm->D.dst.math = 1;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg   = tmp;
+    nomask_PVSDST(&(pAsm->D.dst));
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    // MUL tmp.x,     tmp.x, b.swizzle
+    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg = tmp;
+    nomask_PVSDST(&(pAsm->D.dst));
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg = tmp;
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == assemble_src(pAsm, 1, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    // EX2 tmp.x,             tmp.x
+    // (the result stays in the helper register; dst is written by the MOV below)
+    pAsm->D.dst.opcode = SQ_OP2_INST_EXP_IEEE;
+    pAsm->D.dst.math = 1;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg = tmp;
+    nomask_PVSDST(&(pAsm->D.dst));
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg = tmp;
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    // Now replicate result to all necessary channels in destination
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    // Source operand: use the source-register enum, as the steps above do.
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+/* RCP: delegates to assemble_math_function() with SQ_OP2_INST_RECIP_IEEE. */
+GLboolean assemble_RCP(r700_AssemblerBase *pAsm) 
+{
+    return assemble_math_function(pAsm, SQ_OP2_INST_RECIP_IEEE);
+}
+ 
+/* RSQ: delegates to assemble_math_function() with SQ_OP2_INST_RECIPSQRT_IEEE. */
+GLboolean assemble_RSQ(r700_AssemblerBase *pAsm) 
+{
+    return assemble_math_function(pAsm, SQ_OP2_INST_RECIPSQRT_IEEE);
+}
+ 
+/* SIN: delegates to assemble_math_function() with SQ_OP2_INST_SIN. */
+GLboolean assemble_SIN(r700_AssemblerBase *pAsm) 
+{
+    return assemble_math_function(pAsm, SQ_OP2_INST_SIN);
+}
+ 
+/**
+ * Assemble "SCS dst, a": cosine into x and sine into y, via a helper register.
+ *
+ * Emitted sequence:
+ *   COS  tmp.x = cos(a)
+ *   SIN  tmp.y = sin(a)
+ *   MOV  dst   = (tmp.x, tmp.y, 0, 0)   under the IL write mask
+ *
+ * NOTE(review): only writex / writey are set before each math op; whether the
+ * other write-mask bits are clear at that point depends on next_ins() and
+ * checkop1(), which are defined elsewhere.  The result of checkop1() is not
+ * examined.
+ *
+ * Returns GL_FALSE as soon as any step fails, GL_TRUE otherwise.
+ */
+GLboolean assemble_SCS(r700_AssemblerBase *pAsm)
+{
+    BITS tmp;
+
+    checkop1(pAsm);
+
+    tmp = gethelpr(pAsm);
+
+    // COS tmp.x,    a.x
+    pAsm->D.dst.opcode = SQ_OP2_INST_COS;
+    pAsm->D.dst.math = 1;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg = tmp;
+    pAsm->D.dst.writex = 1;
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    // SIN tmp.y,    a.x
+    pAsm->D.dst.opcode = SQ_OP2_INST_SIN;
+    pAsm->D.dst.math = 1;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg = tmp;
+    pAsm->D.dst.writey = 1;
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    // MOV dst.mask,     tmp
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    // Source operand: use the source-register enum rather than the dst one.
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg = tmp;
+
+    // x and y pass through; z and w are forced to constant 0.
+    noswizzle_PVSSRC(&(pAsm->S[0].src));
+    pAsm->S[0].src.swizzlez = SQ_SEL_0;
+    pAsm->S[0].src.swizzlew = SQ_SEL_0;
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+/*
+ * SGE dst, src0, src1
+ * Selects SQ_OP2_INST_SETGE, decodes the destination and both sources of the
+ * current IL instruction, then calls next_ins().  Any failing step yields
+ * GL_FALSE.
+ */
+GLboolean assemble_SGE(r700_AssemblerBase *pAsm)
+{
+    if (GL_FALSE == checkop2(pAsm))
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_SETGE;
+
+    /* Short-circuit evaluation keeps the order: dst, src0, src1, next_ins. */
+    if (GL_FALSE == assemble_dst(pAsm)
+        || GL_FALSE == assemble_src(pAsm, 0, -1)
+        || GL_FALSE == assemble_src(pAsm, 1, -1)
+        || GL_FALSE == next_ins(pAsm))
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+/*
+ * SLT dst, src0, src1
+ * Built on SQ_OP2_INST_SETGT.  The third argument of assemble_src() is 1 for
+ * IL source 0 and 0 for IL source 1 -- presumably this swaps the operand
+ * slots so that "a < b" becomes "b > a"; confirm against assemble_src().
+ */
+GLboolean assemble_SLT(r700_AssemblerBase *pAsm)
+{
+    if (GL_FALSE == checkop2(pAsm))
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_SETGT;
+
+    /* Short-circuit evaluation keeps the order: dst, src0, src1, next_ins. */
+    if (GL_FALSE == assemble_dst(pAsm)
+        || GL_FALSE == assemble_src(pAsm, 0, 1)
+        || GL_FALSE == assemble_src(pAsm, 1, 0)
+        || GL_FALSE == next_ins(pAsm))
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+/* STP: stub.  Touches nothing in pAsm and always reports success. */
+GLboolean assemble_STP(r700_AssemblerBase *pAsm) 
+{
+    return GL_TRUE;
+}
+ 
+/**
+ * Assemble a texture fetch (TEX / TXP; TXB is rejected).
+ *
+ * If the coordinate source lives in a constant-like register file it is first
+ * copied through mov_temp().  TXP is currently emitted as a plain SAMPLE
+ * without the projective divide.
+ *
+ * Returns GL_FALSE on an unsupported opcode or when any helper fails,
+ * GL_TRUE otherwise.
+ */
+GLboolean assemble_TEX(r700_AssemblerBase *pAsm)
+{
+    /* Initialised so that register files not listed below take the
+     * "no extra copy" path instead of reading an indeterminate value. */
+    GLboolean src_const = GL_FALSE;
+
+    switch (pAsm->pILInst[pAsm->uiCurInst].SrcReg[0].File)
+    {
+    case PROGRAM_CONSTANT:
+    case PROGRAM_LOCAL_PARAM:
+    case PROGRAM_ENV_PARAM:
+    case PROGRAM_STATE_VAR:
+        src_const = GL_TRUE;
+        break;
+    case PROGRAM_TEMPORARY:
+    case PROGRAM_INPUT:
+    default:
+        src_const = GL_FALSE;
+        break;
+    }
+
+    if (GL_TRUE == src_const)
+    {
+        if ( GL_FALSE == mov_temp(pAsm, 0) )
+        {
+            return GL_FALSE;
+        }
+    }
+
+    switch (pAsm->pILInst[pAsm->uiCurInst].Opcode)
+    {
+        case OPCODE_TEX:
+            pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE;
+            break;
+        case OPCODE_TXB:
+            radeon_error("do not support TXB yet\n");
+            return GL_FALSE;
+        case OPCODE_TXP:
+            /* TODO: projective version: divide first 3 components by the 4th */
+            pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE;
+            break;
+        default:
+            radeon_error("Internal error: bad texture op (not TEX)\n");
+            return GL_FALSE;
+    }
+
+    // Set src1 to tex unit id
+    pAsm->S[1].src.reg   = pAsm->pILInst[pAsm->uiCurInst].TexSrcUnit;
+    pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
+
+    // No swizzle info from the mesa compiler, so hard code identity here.
+    pAsm->S[1].src.swizzlex = SQ_SEL_X;
+    pAsm->S[1].src.swizzley = SQ_SEL_Y;
+    pAsm->S[1].src.swizzlez = SQ_SEL_Z;
+    pAsm->S[1].src.swizzlew = SQ_SEL_W;
+
+    if( GL_FALSE == tex_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == tex_src(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+
+/**
+ * Assemble "XPD dst, a, b" (cross product).
+ *
+ * Emitted sequence:
+ *   MUL     mulTmp = a.zxy0 * b.yzx0
+ *   MULADD  out    = a.yzx0 * b.zxy0 + (-mulTmp)
+ *   MOV     dst    = resultTmp          (only when the write mask is partial)
+ *
+ * With a full write mask (0xF) the MULADD writes dst directly.  Otherwise it
+ * writes a second helper register, which the final MOV copies under the IL
+ * write mask.  The two temporaries are kept in separate variables so the
+ * MULADD addend is always the first product, even when a second helper
+ * register is allocated for the result.
+ *
+ * Returns GL_FALSE as soon as any step fails, GL_TRUE otherwise.
+ */
+GLboolean assemble_XPD(r700_AssemblerBase *pAsm)
+{
+    BITS mulTmp;          /* holds the first product                      */
+    BITS resultTmp = 0;   /* MULADD result when the write mask is partial */
+
+    if( GL_FALSE == checkop2(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    mulTmp = gethelpr(pAsm);
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg   = mulTmp;
+    nomask_PVSDST(&(pAsm->D.dst));
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 1, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    swizzleagain_PVSSRC(&(pAsm->S[0].src), SQ_SEL_Z, SQ_SEL_X, SQ_SEL_Y, SQ_SEL_0);
+    swizzleagain_PVSSRC(&(pAsm->S[1].src), SQ_SEL_Y, SQ_SEL_Z, SQ_SEL_X, SQ_SEL_0);
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    pAsm->D.dst.op3    = 1;
+
+    if(0xF != pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask)
+    {
+        /* Partial mask: compute all channels into a scratch register. */
+        resultTmp = gethelpr(pAsm);
+
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+        pAsm->D.dst.reg   = resultTmp;
+
+        nomask_PVSDST(&(pAsm->D.dst));
+    }
+    else
+    {
+        if( GL_FALSE == assemble_dst(pAsm) )
+        {
+            return GL_FALSE;
+        }
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 1, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    swizzleagain_PVSSRC(&(pAsm->S[0].src), SQ_SEL_Y, SQ_SEL_Z, SQ_SEL_X, SQ_SEL_0);
+    swizzleagain_PVSSRC(&(pAsm->S[1].src), SQ_SEL_Z, SQ_SEL_X, SQ_SEL_Y, SQ_SEL_0);
+
+    // result1 + (neg) result0 -- result0 is the first product in mulTmp
+    setaddrmode_PVSSRC(&(pAsm->S[2].src),ADDR_ABSOLUTE);
+    pAsm->S[2].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[2].src.reg   = mulTmp;
+
+    neg_PVSSRC(&(pAsm->S[2].src));
+    noswizzle_PVSSRC(&(pAsm->S[2].src));
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    if(0xF != pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask)
+    {
+        if( GL_FALSE == assemble_dst(pAsm) )
+        {
+            return GL_FALSE;
+        }
+
+        pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+        // Use the MULADD scratch result as source
+        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+        pAsm->S[0].src.reg   = resultTmp;
+
+        noneg_PVSSRC(&(pAsm->S[0].src));
+        noswizzle_PVSSRC(&(pAsm->S[0].src));
+
+        if( GL_FALSE == next_ins(pAsm) )
+        {
+            return GL_FALSE;
+        }
+    }
+
+    return GL_TRUE;
+}
+
+/* EXPORT: stub.  Touches nothing in pAsm and always reports success. */
+GLboolean assemble_EXPORT(r700_AssemblerBase *pAsm)
+{
+    return GL_TRUE;
+}
+
+/*
+ * IF: stub.  Emits nothing and always reports success.
+ * NOTE(review): no control flow is generated here, so an IL "IF" is accepted
+ * without any effect -- presumably a placeholder; confirm.
+ */
+GLboolean assemble_IF(r700_AssemblerBase *pAsm)
+{
+    return GL_TRUE;
+}
+
+/*
+ * ENDIF: stub.  Emits nothing and always reports success.
+ * NOTE(review): no control flow is generated here -- presumably a
+ * placeholder; confirm.
+ */
+GLboolean assemble_ENDIF(r700_AssemblerBase *pAsm)
+{
+    return GL_TRUE;
+}
+
+/**
+ * Walk an IL instruction array and hand each instruction to its assembler.
+ *
+ * pILInst is stored in the assembler state and uiCurInst is updated before
+ * every dispatch.  Processing stops with GL_TRUE at OPCODE_END (or when the
+ * array is exhausted) and with GL_FALSE at the first unsupported opcode or
+ * failing assembler.
+ *
+ * ARL, ARR, EXP, LOG and ELSE are reported as not implemented.  STP, EXPORT
+ * and an integer FLR variant are not dispatched at all.
+ */
+GLboolean AssembleInstr(GLuint uiNumberInsts,
+                        struct prog_instruction *pILInst,
+                        r700_AssemblerBase *pR700AsmCode)
+{
+    GLuint i;
+
+    pR700AsmCode->pILInst = pILInst;
+
+    for (i = 0; i < uiNumberInsts; i++)
+    {
+        /* Outcome of the selected assembler; checked once after the switch. */
+        GLboolean ok = GL_TRUE;
+
+        pR700AsmCode->uiCurInst = i;
+
+        switch (pILInst[i].Opcode)
+        {
+        case OPCODE_ABS:
+            ok = assemble_ABS(pR700AsmCode);
+            break;
+        case OPCODE_ADD:
+        case OPCODE_SUB:
+            ok = assemble_ADD(pR700AsmCode);
+            break;
+
+        case OPCODE_ARL:
+            radeon_error("Not yet implemented instruction OPCODE_ARL \n");
+            return GL_FALSE;
+        case OPCODE_ARR:
+            radeon_error("Not yet implemented instruction OPCODE_ARR \n");
+            return GL_FALSE;
+
+        case OPCODE_CMP:
+            ok = assemble_CMP(pR700AsmCode);
+            break;
+        case OPCODE_COS:
+            ok = assemble_COS(pR700AsmCode);
+            break;
+
+        case OPCODE_DP3:
+        case OPCODE_DP4:
+        case OPCODE_DPH:
+            ok = assemble_DOT(pR700AsmCode);
+            break;
+
+        case OPCODE_DST:
+            ok = assemble_DST(pR700AsmCode);
+            break;
+
+        case OPCODE_EX2:
+            ok = assemble_EX2(pR700AsmCode);
+            break;
+        case OPCODE_EXP: /* approximation of EX2 */
+            radeon_error("Not yet implemented instruction OPCODE_EXP \n");
+            return GL_FALSE;
+
+        case OPCODE_FLR:
+            ok = assemble_FLR(pR700AsmCode);
+            break;
+        case OPCODE_FRC:
+            ok = assemble_FRC(pR700AsmCode);
+            break;
+
+        case OPCODE_KIL:
+            ok = assemble_KIL(pR700AsmCode);
+            break;
+        case OPCODE_LG2:
+            ok = assemble_LG2(pR700AsmCode);
+            break;
+        case OPCODE_LIT:
+            ok = assemble_LIT(pR700AsmCode);
+            break;
+        case OPCODE_LRP:
+            ok = assemble_LRP(pR700AsmCode);
+            break;
+        case OPCODE_LOG: /* approximation of LG2 */
+            radeon_error("Not yet implemented instruction OPCODE_LOG \n");
+            return GL_FALSE;
+
+        case OPCODE_MAD:
+            ok = assemble_MAD(pR700AsmCode);
+            break;
+        case OPCODE_MAX:
+            ok = assemble_MAX(pR700AsmCode);
+            break;
+        case OPCODE_MIN:
+            ok = assemble_MIN(pR700AsmCode);
+            break;
+
+        case OPCODE_MOV:
+            ok = assemble_MOV(pR700AsmCode);
+            break;
+        case OPCODE_MUL:
+            ok = assemble_MUL(pR700AsmCode);
+            break;
+
+        case OPCODE_POW:
+            ok = assemble_POW(pR700AsmCode);
+            break;
+        case OPCODE_RCP:
+            ok = assemble_RCP(pR700AsmCode);
+            break;
+        case OPCODE_RSQ:
+            ok = assemble_RSQ(pR700AsmCode);
+            break;
+        case OPCODE_SIN:
+            ok = assemble_SIN(pR700AsmCode);
+            break;
+        case OPCODE_SCS:
+            ok = assemble_SCS(pR700AsmCode);
+            break;
+
+        case OPCODE_SGE:
+            ok = assemble_SGE(pR700AsmCode);
+            break;
+        case OPCODE_SLT:
+            ok = assemble_SLT(pR700AsmCode);
+            break;
+
+        case OPCODE_SWZ:
+            /* SWZ is assembled as a MOV.  On success, if the following
+             * instruction exists, is not END and is a texture op, record a
+             * destination dependency for it. */
+            ok = assemble_MOV(pR700AsmCode);
+            if (GL_FALSE != ok
+                && (i + 1) < uiNumberInsts
+                && OPCODE_END != pILInst[i + 1].Opcode
+                && GL_TRUE == IsTex(pILInst[i + 1].Opcode))
+            {
+                pR700AsmCode->pInstDeps[i + 1].nDstDep = i + 1; //=1?
+            }
+            break;
+
+        case OPCODE_TEX:
+        case OPCODE_TXB:
+        case OPCODE_TXP:
+            ok = assemble_TEX(pR700AsmCode);
+            break;
+
+        case OPCODE_XPD:
+            ok = assemble_XPD(pR700AsmCode);
+            break;
+
+        case OPCODE_IF:
+            ok = assemble_IF(pR700AsmCode);
+            break;
+        case OPCODE_ELSE:
+            radeon_error("Not yet implemented instruction OPCODE_ELSE \n");
+            return GL_FALSE;
+        case OPCODE_ENDIF:
+            ok = assemble_ENDIF(pR700AsmCode);
+            break;
+
+        case OPCODE_END:
+            /* Reminder: if a later export includes depth/stencil, a MOV is
+             * needed to rearrange the destination channels; this END
+             * instruction is reused as the pseudo instruction for it. */
+            return GL_TRUE;
+
+        default:
+            radeon_error("internal: unknown instruction\n");
+            return GL_FALSE;
+        }
+
+        if (GL_FALSE == ok)
+        {
+            return GL_FALSE;
+        }
+    }
+
+    return GL_TRUE;
+}
+
+/**
+ * Fill in the current export control-flow clause.
+ *
+ * type                      SQ_EXPORT_PIXEL, SQ_EXPORT_POS or SQ_EXPORT_PARAM;
+ *                           any other value is reported and rejected.
+ * export_starting_index     offset added to the array base of the export
+ *                           (unused for a depth export).
+ * export_count              stored as burst_count = export_count - 1.
+ * starting_register_number  GPR the export reads from.
+ * is_depth_export           pixel exports only: target SQ_CF_PIXEL_Z and
+ *                           export just the X channel.
+ *
+ * The clause is emitted as a plain EXPORT without end_of_program; the caller
+ * upgrades the last one through cf_last_export_ptr, which is set here.
+ */
+GLboolean Process_Export(r700_AssemblerBase* pAsm,
+                         GLuint type,
+                         GLuint export_starting_index,
+                         GLuint export_count,
+                         GLuint starting_register_number,
+                         GLboolean is_depth_export)
+{
+    unsigned char ucWriteMask;
+
+    check_current_clause(pAsm, CF_EMPTY_CLAUSE);
+    check_current_clause(pAsm, CF_EXPORT_CLAUSE); //alloc the cf_current_export_clause_ptr
+
+    pAsm->cf_current_export_clause_ptr->m_Word0.f.type = type;
+
+    switch (type)
+    {
+        case SQ_EXPORT_PIXEL:
+            pAsm->cf_current_export_clause_ptr->m_Word0.f.array_base =
+                (GL_TRUE == is_depth_export) ? SQ_CF_PIXEL_Z
+                                             : SQ_CF_PIXEL_MRT0 + export_starting_index;
+            break;
+
+        case SQ_EXPORT_POS:
+            pAsm->cf_current_export_clause_ptr->m_Word0.f.array_base = SQ_CF_POS_0 + export_starting_index;
+            break;
+
+        case SQ_EXPORT_PARAM:
+            pAsm->cf_current_export_clause_ptr->m_Word0.f.array_base = 0x0 + export_starting_index;
+            break;
+
+        default:
+            radeon_error("Unknown export type: %d\n", type);
+            return GL_FALSE;
+    }
+
+    /* Word 0: source register, absolute addressing. */
+    pAsm->cf_current_export_clause_ptr->m_Word0.f.rw_gpr    = starting_register_number;
+    pAsm->cf_current_export_clause_ptr->m_Word0.f.rw_rel    = SQ_ABSOLUTE;
+    pAsm->cf_current_export_clause_ptr->m_Word0.f.index_gpr = 0x0;
+    pAsm->cf_current_export_clause_ptr->m_Word0.f.elem_size = 0x3;
+
+    /* Word 1: plain EXPORT, not the end of the program. */
+    pAsm->cf_current_export_clause_ptr->m_Word1.f.burst_count      = (export_count - 1);
+    pAsm->cf_current_export_clause_ptr->m_Word1.f.end_of_program   = 0x0;
+    pAsm->cf_current_export_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0;
+    pAsm->cf_current_export_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_EXPORT;  // _DONE
+    pAsm->cf_current_export_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
+    pAsm->cf_current_export_clause_ptr->m_Word1.f.barrier          = 0x1;
+
+    if (export_count != 1)
+    {
+        // This should only be used if all components for all registers have been written
+        pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_x = SQ_SEL_X;
+        pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_y = SQ_SEL_Y;
+        pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_z = SQ_SEL_Z;
+        pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_w = SQ_SEL_W;
+    }
+    else
+    {
+        ucWriteMask = pAsm->pucOutMask[starting_register_number - pAsm->starting_export_register_number];
+
+        /* exports Z as a float into Red channel */
+        if (GL_TRUE == is_depth_export)
+        {
+            ucWriteMask = 0x1;
+        }
+
+        /* Channels missing from the write mask are masked out of the export. */
+        pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_x =
+            ((ucWriteMask & 0x1) != 0) ? SQ_SEL_X : SQ_SEL_MASK;
+        pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_y =
+            (((ucWriteMask >> 1) & 0x1) != 0) ? SQ_SEL_Y : SQ_SEL_MASK;
+        pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_z =
+            (((ucWriteMask >> 2) & 0x1) != 0) ? SQ_SEL_Z : SQ_SEL_MASK;
+        pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_w =
+            (((ucWriteMask >> 3) & 0x1) != 0) ? SQ_SEL_W : SQ_SEL_MASK;
+    }
+
+    pAsm->cf_last_export_ptr = pAsm->cf_current_export_clause_ptr;
+
+    return GL_TRUE;
+}
+
+/**
+ * Emit "MOV depth_reg.x, depth_reg.<depth_channel_select>" so the depth value
+ * ends up in the channel that is exported.
+ *
+ * The opcode of the current IL instruction (expected to be OPCODE_END) is
+ * temporarily replaced by OPCODE_MOV while the instruction is emitted.  It is
+ * restored on every path, including when next_ins() fails, so a failed
+ * assembly does not leave the caller's IL stream modified.
+ *
+ * Returns GL_FALSE if next_ins() fails, GL_TRUE otherwise.
+ */
+GLboolean Move_Depth_Exports_To_Correct_Channels(r700_AssemblerBase *pAsm, BITS depth_channel_select)
+{
+    gl_inst_opcode Opcode_save = pAsm->pILInst[pAsm->uiCurInst].Opcode; //Should be OPCODE_END
+    GLboolean emitted;
+
+    pAsm->pILInst[pAsm->uiCurInst].Opcode = OPCODE_MOV;
+
+    // MOV depth_export_register.hw_depth_channel, depth_export_register.depth_channel_select
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg   = pAsm->depth_export_register_number;
+
+    pAsm->D.dst.writex = 1;   // depth goes in R channel for HW
+
+    // Source operand: use the source-register enum rather than the dst one.
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = pAsm->depth_export_register_number;
+
+    setswizzle_PVSSRC(&(pAsm->S[0].src), depth_channel_select);
+
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    emitted = next_ins(pAsm);
+
+    // Put the original opcode back whether or not the MOV was emitted.
+    pAsm->pILInst[pAsm->uiCurInst].Opcode = Opcode_save;
+
+    if( GL_FALSE == emitted )
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+/**
+ * Emit the export clauses of a fragment program.
+ *
+ * If a depth export register is set (>= 0), the depth value is first moved
+ * from its Z channel into place.  A pixel export is then issued for colour
+ * and for depth, each only if its bit is set in OutputsWritten.  The last
+ * export emitted, if any, becomes EXPORT_DONE with end_of_program set.
+ *
+ * Returns GL_FALSE as soon as a helper fails, GL_TRUE otherwise.
+ */
+GLboolean Process_Fragment_Exports(r700_AssemblerBase *pR700AsmCode,
+                                   GLbitfield          OutputsWritten)
+{
+    const unsigned int colorBit = 1 << FRAG_RESULT_COLOR;
+    const unsigned int depthBit = 1 << FRAG_RESULT_DEPTH;
+
+    if (pR700AsmCode->depth_export_register_number >= 0
+        && GL_FALSE == Move_Depth_Exports_To_Correct_Channels(pR700AsmCode, SQ_SEL_Z))  // depth
+    {
+        return GL_FALSE;
+    }
+
+    if ((OutputsWritten & colorBit)
+        && GL_FALSE == Process_Export(pR700AsmCode,
+                                      SQ_EXPORT_PIXEL,
+                                      0,
+                                      1,
+                                      pR700AsmCode->uiFP_OutputMap[FRAG_RESULT_COLOR],
+                                      GL_FALSE))
+    {
+        return GL_FALSE;
+    }
+
+    if ((OutputsWritten & depthBit)
+        && GL_FALSE == Process_Export(pR700AsmCode,
+                                      SQ_EXPORT_PIXEL,
+                                      0,
+                                      1,
+                                      pR700AsmCode->uiFP_OutputMap[FRAG_RESULT_DEPTH],
+                                      GL_TRUE))
+    {
+        return GL_FALSE;
+    }
+
+    /* Close the program on whichever export came last. */
+    if (NULL != pR700AsmCode->cf_last_export_ptr)
+    {
+        pR700AsmCode->cf_last_export_ptr->m_Word1.f.cf_inst        = SQ_CF_INST_EXPORT_DONE;
+        pR700AsmCode->cf_last_export_ptr->m_Word1.f.end_of_program = 0x1;
+    }
+
+    return GL_TRUE;
+}
+
+/**
+ * Emit the export clauses of a vertex program.
+ *
+ * Order of exports: position (HPOS) as SQ_EXPORT_POS, then COL0, COL1, FOGC
+ * and TEX0..TEX7 as SQ_EXPORT_PARAM with consecutive parameter indices, each
+ * only if its bit is set in OutputsWritten.  number_of_exports is reduced by
+ * one when the position is exported.  If the remaining count is zero, a dummy
+ * parameter export with swizzle (0, 0, 0, 1) is emitted.  The last export is
+ * marked EXPORT_DONE with end_of_program set.
+ *
+ * NOTE(review): cf_last_export_ptr is dereferenced without a NULL check,
+ * unlike in Process_Fragment_Exports; confirm it cannot be NULL here.
+ *
+ * Returns GL_FALSE as soon as Process_Export() fails, GL_TRUE otherwise.
+ */
+GLboolean Process_Vertex_Exports(r700_AssemblerBase *pR700AsmCode,
+                                 GLbitfield          OutputsWritten)
+{
+    /* Non-texture parameter outputs, in export order. */
+    static const unsigned int aParamResults[] =
+    {
+        VERT_RESULT_COL0,
+        VERT_RESULT_COL1,
+        VERT_RESULT_FOGC
+    };
+    const unsigned int nParamResults = sizeof(aParamResults) / sizeof(aParamResults[0]);
+
+    unsigned int unBit;
+    unsigned int i;
+
+    GLuint paramIndex = 0;
+    GLuint remaining  = pR700AsmCode->number_of_exports;
+
+    unBit = 1 << VERT_RESULT_HPOS;
+    if (OutputsWritten & unBit)
+    {
+        if (GL_FALSE == Process_Export(pR700AsmCode,
+                                       SQ_EXPORT_POS,
+                                       paramIndex,
+                                       1,
+                                       pR700AsmCode->ucVP_OutputMap[VERT_RESULT_HPOS],
+                                       GL_FALSE))
+        {
+            return GL_FALSE;
+        }
+
+        remaining--;
+
+        pR700AsmCode->cf_last_export_ptr->m_Word1.f.cf_inst = SQ_CF_INST_EXPORT_DONE;
+    }
+
+    pR700AsmCode->number_of_exports = remaining;
+
+    for (i = 0; i < nParamResults; i++)
+    {
+        unBit = 1 << aParamResults[i];
+        if (OutputsWritten & unBit)
+        {
+            if (GL_FALSE == Process_Export(pR700AsmCode,
+                                           SQ_EXPORT_PARAM,
+                                           paramIndex,
+                                           1,
+                                           pR700AsmCode->ucVP_OutputMap[aParamResults[i]],
+                                           GL_FALSE))
+            {
+                return GL_FALSE;
+            }
+
+            paramIndex++;
+        }
+    }
+
+    for (i = 0; i < 8; i++)
+    {
+        unBit = 1 << (VERT_RESULT_TEX0 + i);
+        if (OutputsWritten & unBit)
+        {
+            if (GL_FALSE == Process_Export(pR700AsmCode,
+                                           SQ_EXPORT_PARAM,
+                                           paramIndex,
+                                           1,
+                                           pR700AsmCode->ucVP_OutputMap[VERT_RESULT_TEX0 + i],
+                                           GL_FALSE))
+            {
+                return GL_FALSE;
+            }
+
+            paramIndex++;
+        }
+    }
+
+    // At least one param should be exported
+    if (0 == remaining)
+    {
+        if (GL_FALSE == Process_Export(pR700AsmCode,
+                                       SQ_EXPORT_PARAM,
+                                       0,
+                                       1,
+                                       pR700AsmCode->starting_export_register_number,
+                                       GL_FALSE))
+        {
+            return GL_FALSE;
+        }
+
+        /* Dummy parameter: constant (0, 0, 0, 1). */
+        pR700AsmCode->cf_last_export_ptr->m_Word1_SWIZ.f.sel_x = SQ_SEL_0;
+        pR700AsmCode->cf_last_export_ptr->m_Word1_SWIZ.f.sel_y = SQ_SEL_0;
+        pR700AsmCode->cf_last_export_ptr->m_Word1_SWIZ.f.sel_z = SQ_SEL_0;
+        pR700AsmCode->cf_last_export_ptr->m_Word1_SWIZ.f.sel_w = SQ_SEL_1;
+        pR700AsmCode->cf_last_export_ptr->m_Word1.f.cf_inst = SQ_CF_INST_EXPORT_DONE;
+    }
+    else
+    {
+        pR700AsmCode->cf_last_export_ptr->m_Word1.f.cf_inst = SQ_CF_INST_EXPORT_DONE;
+    }
+
+    pR700AsmCode->cf_last_export_ptr->m_Word1.f.end_of_program = 0x1;
+
+    return GL_TRUE;
+}
+
+/**
+ * Release the buffers owned by the assembler (output mask table and
+ * per-instruction dependency table).
+ *
+ * Both pointers are reset to NULL after being freed so that a repeated
+ * cleanup, or a stale read through the assembler state, cannot touch freed
+ * memory.  Always returns GL_TRUE.
+ */
+GLboolean Clean_Up_Assembler(r700_AssemblerBase *pR700AsmCode)
+{
+    FREE(pR700AsmCode->pucOutMask);
+    pR700AsmCode->pucOutMask = NULL;
+
+    FREE(pR700AsmCode->pInstDeps);
+    pR700AsmCode->pInstDeps = NULL;
+
+    return GL_TRUE;
+}
+
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.h b/src/mesa/drivers/dri/r600/r700_assembler.h
new file mode 100644
index 0000000000..f9c4d849c6
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_assembler.h
@@ -0,0 +1,512 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+#ifndef _R700_ASSEMBLER_H_
+#define _R700_ASSEMBLER_H_
+
+#include "main/mtypes.h"
+#include "shader/prog_instruction.h"
+
+#include "r700_chip.h"
+#include "r700_shaderinst.h"
+#include "r700_shader.h"
+
+typedef enum SHADER_PIPE_TYPE 
+{   /* Which kind of program is being assembled; stored in r700_AssemblerBase::currentShaderType. */
+    SPT_VP = 0, /* vertex program */
+    SPT_FP = 1  /* fragment program */
+} SHADER_PIPE_TYPE;
+
+typedef enum ConstantCycles 
+{   /* Array dimensions used by the hw_gpr / hw_cfile_* tables in r700_AssemblerBase. */
+    NUMBER_OF_CYCLES     = 3, /* first dimension of hw_gpr[][] */
+    NUMBER_OF_COMPONENTS = 4  /* second dimension of hw_gpr[][]; length of hw_cfile_addr[] and hw_cfile_chan[] */
+} ConstantCycles;
+
+typedef enum  HARDWARE_LIMIT_VALUES  
+{   /* Register-file bounds (aliases of SQ_ALU_SRC_* constants defined elsewhere) plus fixed table sizes. */
+   TEMPORARY_REGISTER_OFFSET = SQ_ALU_SRC_GPR_BASE,
+   MAX_TEMPORARY_REGISTERS   = SQ_ALU_SRC_GPR_SIZE,
+   MAX_CONSTANT_REGISTERS    = SQ_ALU_SRC_CFILE_SIZE,
+   CFILE_REGISTER_OFFSET     = SQ_ALU_SRC_CFILE_BASE,
+   NUMBER_OF_INPUT_COLORS    = 2,  /* length of r700_AssemblerBase::input_color_is_used[] */
+   NUMBER_OF_OUTPUT_COLORS   = 8,  /* length of r700_AssemblerBase::color_export_register_number[] */
+   NUMBER_OF_TEXTURE_UNITS   = 16, /* length of r700_AssemblerBase::input_texture_unit_is_used[] */
+   MEGA_FETCH_BYTES          = 32
+} HARDWARE_LIMIT_VALUES;
+
+typedef enum AddressMode 
+{   /* Operand addressing modes; the last enumerator is the count of modes, not a mode itself. */
+    ADDR_ABSOLUTE          = 0,
+    ADDR_RELATIVE_A0       = 1,
+    ADDR_RELATIVE_FLI_0    = 2,
+    NUMBER_OF_ADDR_MOD     = 3  /* number of modes above */
+} AddressMode;
+
+typedef enum SrcRegisterType 
+{   /* Register files a source operand can name (presumably the value kept in PVSSRC::rtype -- confirm). */
+    SRC_REG_TEMPORARY      = 0,
+    SRC_REG_INPUT          = 1,
+    SRC_REG_CONSTANT       = 2,
+    SRC_REG_ALT_TEMPORARY  = 3,
+    NUMBER_OF_SRC_REG_TYPE = 4  /* count of the types above, not a type itself */
+} SrcRegisterType;
+
+typedef enum DstRegisterType 
+{   /* Register files a destination operand can name (presumably the value kept in PVSDST::rtype -- confirm). */
+    DST_REG_TEMPORARY      = 0,
+    DST_REG_A0             = 1,
+    DST_REG_OUT            = 2,
+    DST_REG_OUT_X_REPL     = 3,
+    DST_REG_ALT_TEMPORARY  = 4,
+    DST_REG_INPUT          = 5,
+    NUMBER_OF_DST_REG_TYPE = 6  /* count of the types above, not a type itself */
+} DstRegisterType;
+
+typedef unsigned int BITS; /* base type of every bit-field in the descriptor structs below */
+
+typedef struct PVSDSTtag 
+{   /* Destination-operand descriptor. Field widths sum to 32; the first number in each trailing comment is the running bit total. */
+	BITS opcode:8;     //(:6)  //@@@ really should be 10 bits for OP2
+	BITS math:1;
+	BITS predicated:1; //10   //8
+	BITS pred_inv  :1; //11   //8
+
+	BITS rtype:3;      // register type; 3 bits, enough for the DstRegisterType values 0..5
+	BITS reg:10;       //24   //20
+
+	BITS writex:1;     // per-component write enables x/y/z/w
+	BITS writey:1;
+	BITS writez:1;
+	BITS writew:1;     //28
+
+	BITS op3:1;       // 29  Represents *_OP3_* ALU opcode
+
+	BITS dualop:1;    // 30  //26
+
+	BITS addrmode0:1; //31   //29
+	BITS addrmode1:1; //32
+} PVSDST;
+
+typedef struct PVSSRCtag 
+{   /* Source-operand descriptor. Field widths sum to 32; trailing numbers are the running bit total. */
+	BITS rtype:4;            
+	BITS addrmode0:1;        
+	BITS reg:10;      //15     (8)
+	BITS swizzlex:3;  // one 3-bit selector per component
+	BITS swizzley:3;
+	BITS swizzlez:3;
+	BITS swizzlew:3;  //27        
+
+	BITS negx:1;      // one negate flag per component
+	BITS negy:1;
+	BITS negz:1;
+	BITS negw:1;      //31
+	//BITS addrsel:2;
+	BITS addrmode1:1; //32
+} PVSSRC;
+
+typedef struct PVSMATHtag 
+{   /* Math-operand view of a PVSDWORD; field widths sum to 32. */
+	BITS rtype:4;
+	BITS spare:1;
+	BITS reg:8;
+	BITS swizzlex:3;
+	BITS swizzley:3;
+	BITS dstoff:2; // 2 bits of dest offset into alt ram
+	BITS opcode:4;
+	BITS negx:1;
+	BITS negy:1;
+	BITS dstcomp:2; // select dest component
+	BITS spare2:3;
+} PVSMATH;
+
+typedef union PVSDWORDtag 
+{   /* One operand word seen through several overlapping views; r700_AssemblerBase keeps one as D and three as S[]. */
+	BITS    bits; /* the whole word as a plain unsigned value */
+	PVSDST  dst;  /* destination-descriptor view */
+	PVSSRC  src;  /* source-descriptor view */
+	PVSMATH math; /* math-descriptor view */
+	float   f;    /* the same storage read as a float */
+} PVSDWORD;
+
+typedef struct VAP_OUT_VTX_FMT_0tag 
+{   /* One-bit flags for vertex outputs; 20 flags + 12 reserved bits = 32. Trailing numbers are field positions in declaration order. */
+	BITS pos:1;      // 0
+	BITS misc:1;
+	BITS clip_dist0:1;
+	BITS clip_dist1:1;
+	BITS pos_param:1; // 4
+
+	BITS color0:1;    // 5
+	BITS color1:1;
+	BITS color2:1;
+	BITS color3:1;
+	BITS color4:1;
+	BITS color5:1;
+	BITS color6:1;
+	BITS color7:1;
+
+	BITS normal:1;    
+
+	BITS depth:1;          // 14
+
+	BITS point_size:1;     // 15   
+	BITS edge_flag:1;      
+	BITS rta_index:1;      //     shares same channel as kill_flag
+	BITS kill_flag:1;
+	BITS viewport_index:1; // 19   
+
+	BITS resvd1:12;        // 20
+} VAP_OUT_VTX_FMT_0;
+
+typedef struct VAP_OUT_VTX_FMT_1tag 
+{   /* A 3-bit field per texture coordinate set 0..7 (8 * 3 = 24 bits) plus 8 reserved bits. */
+	BITS tex0comp:3;
+	BITS tex1comp:3;
+	BITS tex2comp:3;
+	BITS tex3comp:3;
+	BITS tex4comp:3;
+	BITS tex5comp:3;
+	BITS tex6comp:3;
+	BITS tex7comp:3;
+
+	BITS resvd:8;
+} VAP_OUT_VTX_FMT_1;
+
+typedef struct VAP_OUT_VTX_FMT_2tag 
+{   /* Same layout as VAP_OUT_VTX_FMT_1, for texture coordinate sets 8..15. */
+	BITS tex8comp :3;
+	BITS tex9comp :3;
+	BITS tex10comp:3;
+	BITS tex11comp:3;
+	BITS tex12comp:3;
+	BITS tex13comp:3;
+	BITS tex14comp:3;
+	BITS tex15comp:3;
+
+	BITS resvd:8;
+} VAP_OUT_VTX_FMT_2;
+
+typedef struct OUT_FRAGMENT_FMT_0tag 
+{   /* One-bit flags for fragment outputs (12 flags + 20 reserved = 32); kept in r700_AssemblerBase::fp_stOutFmt0. */
+	BITS color0:1;
+	BITS color1:1;
+	BITS color2:1;
+	BITS color3:1;
+	BITS color4:1;
+	BITS color5:1;
+	BITS color6:1;
+	BITS color7:1;
+
+	BITS depth:1;
+	BITS stencil_ref:1;
+	BITS coverage_to_mask:1;
+	BITS mask:1;
+
+	BITS resvd1:20;
+} OUT_FRAGMENT_FMT_0;
+
+typedef enum  CF_CLAUSE_TYPE 
+{   /* Kind of control-flow clause; type of r700_AssemblerBase::cf_current_clause_type and of check_current_clause()'s argument. */
+   CF_EXPORT_CLAUSE,
+   CF_ALU_CLAUSE,
+   CF_TEX_CLAUSE,
+   CF_VTX_CLAUSE,
+   CF_OTHER_CLAUSE,
+   CF_EMPTY_CLAUSE,
+   NUMBER_CF_CLAUSE_TYPES /* count of the clause types above */
+} CF_CLAUSE_TYPE;
+
+enum 
+{   /* Unnamed constant pool: four independent groups. Values repeat across groups (FC_NONE == COND_NONE == 0), so compare only within a group. */
+    MAX_BOOL_CONSTANTS   = 32,
+    MAX_INT_CONSTANTS    = 32,
+    MAX_FLOAT_CONSTANTS  = 256,
+
+    FC_NONE = 0, /* FC_*: flow-control kinds (presumably the values of FC_LEVEL::type -- confirm) */
+    FC_IF = 1,
+    FC_LOOP = 2,
+    FC_REP = 3,
+
+    COND_NONE = 0, /* COND_*: condition kinds (presumably the values of FC_LEVEL::cond -- confirm) */
+    COND_BOOL = 1,
+    COND_PRED = 2,
+    COND_ALU = 3,
+
+    SAFEDIST_TEX = 6, ///< safe distance for using result of texture lookup in alu or another tex lookup
+    SAFEDIST_ALU = 6 ///< the same for alu->fc
+};
+
+typedef struct FC_LEVEL 
+{   /* One entry of the flow-control stack r700_AssemblerBase::fc_stack[]. */
+	unsigned int           first; ///< first fc instruction on level (if, rep, loop)
+	unsigned int*          mid; ///< middle instructions - else or all breaks on this level
+	unsigned int           midLen; ///< presumably the number of entries in mid -- confirm
+	unsigned int           type;
+	unsigned int           cond;
+	unsigned int           inv;
+	unsigned int           bpush; ///< 1 if first instruction does branch stack push
+			 int           id; ///< id of bool or int variable
+} FC_LEVEL;
+
+typedef struct VTX_FETCH_METHOD 
+{   /* Fetch-mode state handed to assemble_vfetch_instruction() through its pFetchMethod argument. */
+	GLboolean bEnableMini;
+	GLuint mega_fetch_remainder;
+} VTX_FETCH_METHOD;
+
+typedef struct r700_AssemblerBase 
+{   /* Working state of the assembler: current clause pointers, operand scratch words, register maps and the shader being built. */
+	R700ControlFlowSXClause*      cf_last_export_ptr; /* Process_Vertex_Exports sets EXPORT_DONE and end_of_program on this clause */
+	R700ControlFlowSXClause*      cf_current_export_clause_ptr;
+	R700ControlFlowALUClause*     cf_current_alu_clause_ptr;
+	R700ControlFlowGenericClause* cf_current_tex_clause_ptr;
+	R700ControlFlowGenericClause* cf_current_vtx_clause_ptr;
+	R700ControlFlowGenericClause* cf_current_cf_clause_ptr;
+
+    //Result shader
+    R700_Shader * pR700Shader;
+
+	// Type of the current clause (NOTE(review): the original note "No clause has been created yet" presumably describes the initial state -- confirm)
+	CF_CLAUSE_TYPE cf_current_clause_type;
+
+	GLuint number_of_exports;
+	GLuint number_of_colorandz_exports;
+	GLuint number_of_export_opcodes;
+
+	PVSDWORD D;    /* presumably the destination operand of the instruction being assembled -- confirm */
+	PVSDWORD S[3]; /* presumably up to three source operands -- confirm */
+
+	unsigned int uLastPosUpdate;
+
+	OUT_FRAGMENT_FMT_0     fp_stOutFmt0;
+
+	unsigned int uIIns;
+	unsigned int uOIns;
+	unsigned int number_used_registers;
+	unsigned int uUsedConsts; 
+
+	// Fragment programs
+	unsigned int uiFP_AttributeMap[FRAG_ATTRIB_MAX];
+	unsigned int uiFP_OutputMap[FRAG_RESULT_MAX];
+	unsigned int uBoolConsts;
+	unsigned int uIntConsts;
+	unsigned int uInsts;
+	unsigned int uConsts;
+
+	// Vertex programs
+	unsigned char ucVP_AttributeMap[VERT_ATTRIB_MAX];
+	unsigned char ucVP_OutputMap[VERT_RESULT_MAX]; /* indexed by VERT_RESULT_*; read as the export register in Process_Vertex_Exports */
+
+    unsigned char * pucOutMask; /* released by Clean_Up_Assembler() */
+
+	//-----------------------------------------------------------------------------------
+	// flow control members
+	//-----------------------------------------------------------------------------------
+	unsigned int FCSP;
+	FC_LEVEL fc_stack[32];
+
+	unsigned int branch_depth;
+	unsigned int max_branch_depth;
+
+	//-----------------------------------------------------------------------------------
+	// ArgSubst used in Assemble_Source() function
+	//-----------------------------------------------------------------------------------
+	int aArgSubst[4];
+
+    GLint hw_gpr[ NUMBER_OF_CYCLES ][ NUMBER_OF_COMPONENTS ];
+    GLint hw_cfile_addr[ NUMBER_OF_COMPONENTS ];
+    GLint hw_cfile_chan[ NUMBER_OF_COMPONENTS ];
+
+    GLuint uOutputs;
+  
+    GLint color_export_register_number[NUMBER_OF_OUTPUT_COLORS];
+	GLint depth_export_register_number;
+
+	GLint stencil_export_register_number;
+	GLint coverage_to_mask_export_register_number;
+	GLint mask_export_register_number;
+
+	GLuint starting_export_register_number; /* register used for the fallback param export when a vertex program exports no params */
+	GLuint starting_vfetch_register_number;
+	GLuint starting_temp_register_number;
+	GLuint uHelpReg;
+	GLuint uFirstHelpReg;
+
+	GLboolean input_position_is_used;
+	GLboolean input_normal_is_used;
+
+	GLboolean input_color_is_used[NUMBER_OF_INPUT_COLORS];
+  
+	GLboolean input_texture_unit_is_used[NUMBER_OF_TEXTURE_UNITS];
+  
+    R700VertexGenericFetch* vfetch_instruction_ptr_array[VERT_ATTRIB_MAX];
+  
+	GLuint number_of_inputs;
+
+    InstDeps *pInstDeps; /* released by Clean_Up_Assembler() */
+
+    SHADER_PIPE_TYPE currentShaderType;
+    struct prog_instruction * pILInst;
+    GLuint             uiCurInst;
+    GLboolean   bR6xx;
+} r700_AssemblerBase;
+
+//Internal use: helpers operating on the PVSDST / PVSSRC operand descriptors and output-format structs declared above
+BITS addrmode_PVSDST(PVSDST * pPVSDST);
+void setaddrmode_PVSDST(PVSDST * pPVSDST, BITS addrmode);
+void nomask_PVSDST(PVSDST * pPVSDST);
+BITS addrmode_PVSSRC(PVSSRC* pPVSSRC);
+void setaddrmode_PVSSRC(PVSSRC* pPVSSRC, BITS addrmode);
+void setswizzle_PVSSRC(PVSSRC* pPVSSRC, BITS swz);
+void noswizzle_PVSSRC(PVSSRC* pPVSSRC);
+void swizzleagain_PVSSRC(PVSSRC * pPVSSRC, BITS x, BITS y, BITS z, BITS w);
+void neg_PVSSRC(PVSSRC* pPVSSRC);
+void noneg_PVSSRC(PVSSRC* pPVSSRC);
+void flipneg_PVSSRC(PVSSRC* pPVSSRC);
+void zerocomp_PVSSRC(PVSSRC* pPVSSRC, int c);
+void onecomp_PVSSRC(PVSSRC* pPVSSRC, int c);
+BITS is_misc_component_exported(VAP_OUT_VTX_FMT_0* pOutVTXFmt0);
+BITS is_depth_component_exported(OUT_FRAGMENT_FMT_0* pFPOutFmt) ;
+GLboolean is_reduction_opcode(PVSDWORD * dest);
+GLuint GetSurfaceFormat(GLenum eType, GLuint nChannels, GLuint * pClient_size); /* r700SetupVTXConstants calls this with (GL_FLOAT, size, NULL) */
+
+unsigned int r700GetNumOperands(r700_AssemblerBase* pAsm);
+
+GLboolean IsTex(gl_inst_opcode Opcode); /* IsTex / IsAlu: predicates over a gl_inst_opcode value */
+GLboolean IsAlu(gl_inst_opcode Opcode);
+int check_current_clause(r700_AssemblerBase* pAsm,
+					     CF_CLAUSE_TYPE      new_clause_type);
+GLboolean add_vfetch_instruction(r700_AssemblerBase*     pAsm,
+								 R700VertexInstruction*  vertex_instruction_ptr);
+GLboolean add_tex_instruction(r700_AssemblerBase*     pAsm,
+                              R700TextureInstruction* tex_instruction_ptr);
+GLboolean assemble_vfetch_instruction(r700_AssemblerBase* pAsm,
+								GLuint gl_client_id,
+                                GLuint destination_register,
+								GLuint number_of_elements,
+                                GLenum dataElementType,
+								VTX_FETCH_METHOD* pFetchMethod);
+GLuint gethelpr(r700_AssemblerBase* pAsm);
+void resethelpr(r700_AssemblerBase* pAsm);
+void checkop_init(r700_AssemblerBase* pAsm);
+GLboolean mov_temp(r700_AssemblerBase* pAsm, int src);
+GLboolean checkop1(r700_AssemblerBase* pAsm);
+GLboolean checkop2(r700_AssemblerBase* pAsm);
+GLboolean checkop3(r700_AssemblerBase* pAsm);
+GLboolean assemble_src(r700_AssemblerBase *pAsm,
+                       int src, 
+                       int fld);
+GLboolean assemble_dst(r700_AssemblerBase *pAsm);
+GLboolean tex_dst(r700_AssemblerBase *pAsm);
+GLboolean tex_src(r700_AssemblerBase *pAsm);
+GLboolean assemble_tex_instruction(r700_AssemblerBase *pAsm, GLboolean normalized);
+void initialize(r700_AssemblerBase *pAsm);
+GLboolean assemble_alu_src(R700ALUInstruction*  alu_instruction_ptr,
+                           int                  source_index,
+                           PVSSRC*              pSource,
+                           BITS                 scalar_channel_index);
+GLboolean add_alu_instruction(r700_AssemblerBase* pAsm,
+                              R700ALUInstruction* alu_instruction_ptr,
+                              GLuint              contiguous_slots_needed);
+void get_src_properties(R700ALUInstruction*  alu_instruction_ptr,
+                        int                  source_index,
+                        BITS*                psrc_sel,
+                        BITS*                psrc_rel,
+                        BITS*                psrc_chan,
+                        BITS*                psrc_neg); /* the four psrc_* pointers are non-const, presumably outputs -- confirm */
+int is_cfile(BITS sel);
+int is_const(BITS sel);
+int is_gpr(BITS sel);
+GLboolean reserve_cfile(r700_AssemblerBase* pAsm, 
+                        GLuint sel, 
+                        GLuint chan);
+GLboolean reserve_gpr(r700_AssemblerBase* pAsm, GLuint sel, GLuint chan, GLuint cycle);
+GLboolean cycle_for_scalar_bank_swizzle(const int swiz, const int sel, GLuint* pCycle);
+GLboolean cycle_for_vector_bank_swizzle(const int swiz, const int sel, GLuint* pCycle);
+GLboolean check_scalar(r700_AssemblerBase* pAsm,
+                       R700ALUInstruction* alu_instruction_ptr);
+GLboolean check_vector(r700_AssemblerBase* pAsm,
+                       R700ALUInstruction* alu_instruction_ptr);
+GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm);
+GLboolean next_ins(r700_AssemblerBase *pAsm);
+GLboolean assemble_math_function(r700_AssemblerBase* pAsm, BITS opcode);
+GLboolean assemble_ABS(r700_AssemblerBase *pAsm); /* assemble_<NAME>: one declaration per mnemonic, each taking the assembler state */
+GLboolean assemble_ADD(r700_AssemblerBase *pAsm);
+GLboolean assemble_BAD(char *opcode_str); /* the exception: takes a string, not the assembler state */
+GLboolean assemble_CMP(r700_AssemblerBase *pAsm);
+GLboolean assemble_COS(r700_AssemblerBase *pAsm);
+GLboolean assemble_DOT(r700_AssemblerBase *pAsm);
+GLboolean assemble_DST(r700_AssemblerBase *pAsm);
+GLboolean assemble_EX2(r700_AssemblerBase *pAsm);
+GLboolean assemble_FLR(r700_AssemblerBase *pAsm);
+GLboolean assemble_FLR_INT(r700_AssemblerBase *pAsm);
+GLboolean assemble_FRC(r700_AssemblerBase *pAsm);
+GLboolean assemble_KIL(r700_AssemblerBase *pAsm);
+GLboolean assemble_LG2(r700_AssemblerBase *pAsm);
+GLboolean assemble_LRP(r700_AssemblerBase *pAsm);
+GLboolean assemble_MAD(r700_AssemblerBase *pAsm);
+GLboolean assemble_LIT(r700_AssemblerBase *pAsm);
+GLboolean assemble_MAX(r700_AssemblerBase *pAsm);
+GLboolean assemble_MIN(r700_AssemblerBase *pAsm);
+GLboolean assemble_MOV(r700_AssemblerBase *pAsm);
+GLboolean assemble_MUL(r700_AssemblerBase *pAsm);
+GLboolean assemble_POW(r700_AssemblerBase *pAsm);
+GLboolean assemble_RCP(r700_AssemblerBase *pAsm);
+GLboolean assemble_RSQ(r700_AssemblerBase *pAsm);
+GLboolean assemble_SIN(r700_AssemblerBase *pAsm);
+GLboolean assemble_SCS(r700_AssemblerBase *pAsm);
+GLboolean assemble_SGE(r700_AssemblerBase *pAsm);
+GLboolean assemble_SLT(r700_AssemblerBase *pAsm);
+GLboolean assemble_STP(r700_AssemblerBase *pAsm);
+GLboolean assemble_TEX(r700_AssemblerBase *pAsm);
+GLboolean assemble_XPD(r700_AssemblerBase *pAsm);
+GLboolean assemble_EXPORT(r700_AssemblerBase *pAsm);
+GLboolean assemble_IF(r700_AssemblerBase *pAsm);
+GLboolean assemble_ENDIF(r700_AssemblerBase *pAsm);
+
+GLboolean Process_Export(r700_AssemblerBase* pAsm,
+                         GLuint type, 
+                         GLuint export_starting_index,
+                         GLuint export_count, 
+                         GLuint starting_register_number,
+                         GLboolean is_depth_export); /* Process_Vertex_Exports calls this with type SQ_EXPORT_PARAM and treats GL_FALSE as failure */
+GLboolean Move_Depth_Exports_To_Correct_Channels(r700_AssemblerBase *pAsm, 
+                                                 BITS depth_channel_select);
+
+
+//Interface: entry points for code that uses the assembler
+GLboolean AssembleInstr(GLuint uiNumberInsts,
+                        struct prog_instruction *pILInst, 
+						r700_AssemblerBase *pR700AsmCode);
+GLboolean Process_Fragment_Exports(r700_AssemblerBase *pR700AsmCode, GLbitfield OutputsWritten);  
+GLboolean Process_Vertex_Exports(r700_AssemblerBase *pR700AsmCode, GLbitfield OutputsWritten); /* GL_FALSE as soon as a Process_Export call fails */
+
+int       Init_r700_AssemblerBase(SHADER_PIPE_TYPE spt, r700_AssemblerBase* pAsm, R700_Shader* pShader);
+GLboolean Clean_Up_Assembler(r700_AssemblerBase *pR700AsmCode); /* frees pucOutMask and pInstDeps; returns GL_TRUE */
+
+#endif //_R700_ASSEMBLER_H_
diff --git a/src/mesa/drivers/dri/r600/r700_chip.c b/src/mesa/drivers/dri/r600/r700_chip.c
new file mode 100644
index 0000000000..1b56059197
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_chip.c
@@ -0,0 +1,1274 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ *   CooperYuan <cooper.yuan@amd.com>, <cooperyuan@gmail.com>
+ */
+
+#include "main/imports.h"
+#include "main/glheader.h"
+#include "main/simple_list.h"
+
+#include "r600_context.h"
+#include "r600_cmdbuf.h"
+
+#include "r700_state.h"
+#include "r600_tex.h"
+#include "r700_oglprog.h"
+#include "r700_fragprog.h"
+#include "r700_vertprog.h"
+#include "r700_ioctl.h"
+
+#include "radeon_mipmap_tree.h"
+
+static void r700SendTexState(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+	/* Emits one SET_RESOURCE packet for every enabled texture unit whose
+	 * texture object exists and has a buffer object behind it. */
+	context_t         *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	struct radeon_bo *bo = NULL;
+	unsigned int unit;
+	BATCH_LOCALS(&context->radeon);
+
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	for (unit = 0; unit < R700_TEXTURE_NUMBERUNITS; unit++) {
+		radeonTexObj *tex;
+
+		if (!ctx->Texture.Unit[unit]._ReallyEnabled)
+			continue;
+
+		tex = r700->textures[unit];
+		if (!tex)
+			continue;
+
+		/* image_override selects the texture's own bo instead of its miptree's */
+		bo = tex->image_override ? tex->bo : tex->mt->bo;
+		if (!bo)
+			continue;
+
+		r700SyncSurf(context, bo,
+			     RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM,
+			     0, TC_ACTION_ENA_bit);
+
+		BEGIN_BATCH_NO_AUTOSTATE(9 + 4);
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_RESOURCE, 7));
+		R600_OUT_BATCH(unit * 7);
+		R600_OUT_BATCH(tex->SQ_TEX_RESOURCE0);
+		R600_OUT_BATCH(tex->SQ_TEX_RESOURCE1);
+		R600_OUT_BATCH(tex->SQ_TEX_RESOURCE2);
+		R600_OUT_BATCH(tex->SQ_TEX_RESOURCE3);
+		R600_OUT_BATCH(tex->SQ_TEX_RESOURCE4);
+		R600_OUT_BATCH(tex->SQ_TEX_RESOURCE5);
+		R600_OUT_BATCH(tex->SQ_TEX_RESOURCE6);
+		R600_OUT_BATCH_RELOC(tex->SQ_TEX_RESOURCE2, bo, 0,
+				     RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+		R600_OUT_BATCH_RELOC(tex->SQ_TEX_RESOURCE3, bo, tex->SQ_TEX_RESOURCE3,
+				     RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+		END_BATCH();
+		COMMIT_BATCH();
+	}
+}
+
+static void r700SendTexSamplerState(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+	/* One SET_SAMPLER packet (three sampler words) per enabled unit that has a texture object. */
+	context_t         *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	unsigned int unit;
+	BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	for (unit = 0; unit < R700_TEXTURE_NUMBERUNITS; unit++) {
+		radeonTexObj *tex = ctx->Texture.Unit[unit]._ReallyEnabled ? r700->textures[unit] : NULL;
+		if (!tex)
+			continue; /* unit disabled, or nothing bound to it */
+
+		BEGIN_BATCH_NO_AUTOSTATE(5);
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_SAMPLER, 3));
+		R600_OUT_BATCH(unit * 3);
+		R600_OUT_BATCH(tex->SQ_TEX_SAMPLER0);
+		R600_OUT_BATCH(tex->SQ_TEX_SAMPLER1);
+		R600_OUT_BATCH(tex->SQ_TEX_SAMPLER2);
+		END_BATCH();
+		COMMIT_BATCH();
+	}
+}
+
+static void r700SendTexBorderColorState(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+	/* Writes the four TD_PS_SAMPLER border-colour registers of each enabled unit that has a texture object. */
+	context_t         *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	unsigned int unit;
+	BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	for (unit = 0; unit < R700_TEXTURE_NUMBERUNITS; unit++) {
+		radeonTexObj *tex = ctx->Texture.Unit[unit]._ReallyEnabled ? r700->textures[unit] : NULL;
+		if (!tex)
+			continue; /* unit disabled, or nothing bound to it */
+
+		BEGIN_BATCH_NO_AUTOSTATE(2 + 4);
+		R600_OUT_BATCH_REGSEQ((TD_PS_SAMPLER0_BORDER_RED + (unit * 16)), 4);
+		R600_OUT_BATCH(tex->TD_PS_SAMPLER0_BORDER_RED);
+		R600_OUT_BATCH(tex->TD_PS_SAMPLER0_BORDER_GREEN);
+		R600_OUT_BATCH(tex->TD_PS_SAMPLER0_BORDER_BLUE);
+		R600_OUT_BATCH(tex->TD_PS_SAMPLER0_BORDER_ALPHA);
+		END_BATCH();
+		COMMIT_BATCH();
+	}
+}
+
+static void r700SetupVTXConstants(GLcontext  * ctx,
+				  unsigned int nStreamID,
+				  void *       pAos,
+				  unsigned int size,      /* number of elements in vector */
+				  unsigned int stride,
+				  unsigned int count)     /* number of vectors in stream */
+{   /* Builds the vertex-constant words for stream nStreamID from the radeon_aos at pAos and emits them in one SET_RESOURCE packet. */
+    context_t *context = R700_CONTEXT(ctx);
+    struct radeon_aos * paos = (struct radeon_aos *)pAos;
+    BATCH_LOCALS(&context->radeon);
+    radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+    unsigned int uSQ_VTX_CONSTANT_WORD0_0;
+    unsigned int uSQ_VTX_CONSTANT_WORD1_0;
+    unsigned int uSQ_VTX_CONSTANT_WORD2_0 = 0;
+    unsigned int uSQ_VTX_CONSTANT_WORD3_0 = 0;
+    unsigned int uSQ_VTX_CONSTANT_WORD6_0 = 0;
+
+    if (!paos->bo)
+	    return; /* no buffer object: nothing is emitted */
+
+    if ((context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV610) || /* these five families sync with TC_ACTION_ENA_bit ... */
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV620) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RS780) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RS880) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV710))
+	    r700SyncSurf(context, paos->bo, RADEON_GEM_DOMAIN_GTT, 0, TC_ACTION_ENA_bit);
+    else /* ... every other family with VC_ACTION_ENA_bit */
+	    r700SyncSurf(context, paos->bo, RADEON_GEM_DOMAIN_GTT, 0, VC_ACTION_ENA_bit);
+
+    uSQ_VTX_CONSTANT_WORD0_0 = paos->offset;
+    uSQ_VTX_CONSTANT_WORD1_0 = count * (size * 4) - 1; /* NOTE(review): presumably byte size - 1 for tightly packed 4-byte elements; stride is not used here -- confirm */
+
+    SETfield(uSQ_VTX_CONSTANT_WORD2_0, 0, BASE_ADDRESS_HI_shift, BASE_ADDRESS_HI_mask); /* TODO */
+    SETfield(uSQ_VTX_CONSTANT_WORD2_0, stride, SQ_VTX_CONSTANT_WORD2_0__STRIDE_shift,
+	     SQ_VTX_CONSTANT_WORD2_0__STRIDE_mask);
+    SETfield(uSQ_VTX_CONSTANT_WORD2_0, GetSurfaceFormat(GL_FLOAT, size, NULL),
+	     SQ_VTX_CONSTANT_WORD2_0__DATA_FORMAT_shift,
+	     SQ_VTX_CONSTANT_WORD2_0__DATA_FORMAT_mask); /* TODO : trace back api for initial data type, not only GL_FLOAT */
+    SETfield(uSQ_VTX_CONSTANT_WORD2_0, SQ_NUM_FORMAT_SCALED,
+	     SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_shift, SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_mask);
+    SETbit(uSQ_VTX_CONSTANT_WORD2_0, SQ_VTX_CONSTANT_WORD2_0__FORMAT_COMP_ALL_bit);
+
+    SETfield(uSQ_VTX_CONSTANT_WORD3_0, 1, MEM_REQUEST_SIZE_shift, MEM_REQUEST_SIZE_mask);
+    SETfield(uSQ_VTX_CONSTANT_WORD6_0, SQ_TEX_VTX_VALID_BUFFER,
+	     SQ_TEX_RESOURCE_WORD6_0__TYPE_shift, SQ_TEX_RESOURCE_WORD6_0__TYPE_mask);
+
+    BEGIN_BATCH_NO_AUTOSTATE(9 + 2);
+
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_RESOURCE, 7));
+    R600_OUT_BATCH((nStreamID + SQ_FETCH_RESOURCE_VS_OFFSET) * FETCH_RESOURCE_STRIDE);
+    R600_OUT_BATCH(uSQ_VTX_CONSTANT_WORD0_0);
+    R600_OUT_BATCH(uSQ_VTX_CONSTANT_WORD1_0);
+    R600_OUT_BATCH(uSQ_VTX_CONSTANT_WORD2_0);
+    R600_OUT_BATCH(uSQ_VTX_CONSTANT_WORD3_0);
+    R600_OUT_BATCH(0); /* words 4 and 5 are emitted as zero */
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(uSQ_VTX_CONSTANT_WORD6_0);
+    R600_OUT_BATCH_RELOC(uSQ_VTX_CONSTANT_WORD0_0,
+                         paos->bo,
+                         uSQ_VTX_CONSTANT_WORD0_0,
+                         RADEON_GEM_DOMAIN_GTT, 0, 0);
+    END_BATCH();
+    COMMIT_BATCH();
+
+}
+
+void r700SetupStreams(GLcontext *ctx)
+{   /* Passes every vertex attribute the selected vertex program reads to rcommon_emit_vector(), filling tcl.aos[] consecutively. */
+    context_t         *context = R700_CONTEXT(ctx);
+    struct r700_vertex_program *vp = context->selected_vp;
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+    struct vertex_buffer *vb = &tnl->vb;
+    unsigned int i, j = 0; /* i: attribute index; j: next free tcl.aos[] slot */
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+    R600_STATECHANGE(context, vtx);
+
+    for(i=0; i<VERT_ATTRIB_MAX; i++) {
+	    if(vp->mesa_program->Base.InputsRead & (1u << i)) { /* 1u: shifting a signed 1 into bit 31 is undefined behaviour */
+		    rcommon_emit_vector(ctx,
+					&context->radeon.tcl.aos[j],
+					vb->AttribPtr[i]->data,
+					vb->AttribPtr[i]->size,
+					vb->AttribPtr[i]->stride,
+					vb->Count);
+		    j++;
+	    }
+    }
+    context->radeon.tcl.aos_count = j; /* number of slots filled; r700SendVTXState returns early when this is 0 */
+}
+
+static void r700SendVTXState(GLcontext *ctx, struct radeon_state_atom *atom)
+{   /* Zeroes the two SQ_VTX_*_LOC control constants, then programs one fetch resource per vertex attribute the program reads. */
+    context_t         *context = R700_CONTEXT(ctx);
+    struct r700_vertex_program *vp = context->selected_vp;
+    unsigned int i, j = 0; /* i: attribute index (also the stream id); j: matching packed tcl.aos[] slot */
+    BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+    if (context->radeon.tcl.aos_count == 0)
+	    return; /* no arrays were set up, nothing to emit */
+
+    BEGIN_BATCH_NO_AUTOSTATE(6);
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 1));
+    R600_OUT_BATCH(mmSQ_VTX_BASE_VTX_LOC - ASIC_CTL_CONST_BASE_INDEX);
+    R600_OUT_BATCH(0);
+
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 1));
+    R600_OUT_BATCH(mmSQ_VTX_START_INST_LOC - ASIC_CTL_CONST_BASE_INDEX);
+    R600_OUT_BATCH(0);
+    END_BATCH();
+    COMMIT_BATCH();
+
+    for(i=0; i<VERT_ATTRIB_MAX; i++) {
+	    if(vp->mesa_program->Base.InputsRead & (1u << i)) { /* 1u: shifting a signed 1 into bit 31 is undefined behaviour */
+		    /* currently aos are packed */
+		    r700SetupVTXConstants(ctx,
+					  i,
+					  (void*)(&context->radeon.tcl.aos[j]),
+					  (unsigned int)context->radeon.tcl.aos[j].components,
+					  (unsigned int)context->radeon.tcl.aos[j].stride * 4,
+					  (unsigned int)context->radeon.tcl.aos[j].count);
+		    j++; /* walks attributes in the same order as r700SetupStreams, so j names the slot filled there */
+	    }
+    }
+}
+
+static void r700SetRenderTarget(context_t *context, int id)
+{   /* Fills r700->render_target[id] (size, info, base) from the current colour renderbuffer and marks the slot enabled. */
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+    struct radeon_renderbuffer *rrb;
+    unsigned int nPitchInPixel;
+
+    rrb = radeon_get_colorbuffer(&context->radeon);
+    if (!rrb || !rrb->bo) {
+	    return; /* no colour buffer or no backing bo: the slot is left untouched */
+    }
+
+    R600_STATECHANGE(context, cb_target);
+
+    /* color buffer */
+    r700->render_target[id].CB_COLOR0_BASE.u32All = context->radeon.state.color.draw_offset; /* NOTE(review): overwritten with 0 a few lines below -- confirm which value is intended */
+
+    nPitchInPixel = rrb->pitch/rrb->cpp;
+    SETfield(r700->render_target[id].CB_COLOR0_SIZE.u32All, (nPitchInPixel/8)-1,
+             PITCH_TILE_MAX_shift, PITCH_TILE_MAX_mask);
+    SETfield(r700->render_target[id].CB_COLOR0_SIZE.u32All, ( (nPitchInPixel * context->radeon.radeonScreen->driScreen->fbHeight)/64 )-1,
+             SLICE_TILE_MAX_shift, SLICE_TILE_MAX_mask);
+    r700->render_target[id].CB_COLOR0_BASE.u32All = 0; /* final value of the base word; the draw_offset store above is discarded */
+    SETfield(r700->render_target[id].CB_COLOR0_INFO.u32All, ENDIAN_NONE, ENDIAN_shift, ENDIAN_mask);
+    SETfield(r700->render_target[id].CB_COLOR0_INFO.u32All, ARRAY_LINEAR_GENERAL,
+             CB_COLOR0_INFO__ARRAY_MODE_shift, CB_COLOR0_INFO__ARRAY_MODE_mask);
+    if(4 == rrb->cpp) /* cpp of 4 selects COLOR_8_8_8_8; any other cpp falls through to COLOR_5_6_5 */
+    {
+        SETfield(r700->render_target[id].CB_COLOR0_INFO.u32All, COLOR_8_8_8_8,
+                 CB_COLOR0_INFO__FORMAT_shift, CB_COLOR0_INFO__FORMAT_mask);
+        SETfield(r700->render_target[id].CB_COLOR0_INFO.u32All, SWAP_ALT, COMP_SWAP_shift, COMP_SWAP_mask);
+    }
+    else
+    {
+        SETfield(r700->render_target[id].CB_COLOR0_INFO.u32All, COLOR_5_6_5,
+                 CB_COLOR0_INFO__FORMAT_shift, CB_COLOR0_INFO__FORMAT_mask);
+        SETfield(r700->render_target[id].CB_COLOR0_INFO.u32All, SWAP_ALT_REV,
+                 COMP_SWAP_shift, COMP_SWAP_mask);
+    }
+    SETbit(r700->render_target[id].CB_COLOR0_INFO.u32All, SOURCE_FORMAT_bit);
+    SETbit(r700->render_target[id].CB_COLOR0_INFO.u32All, BLEND_CLAMP_bit);
+    SETfield(r700->render_target[id].CB_COLOR0_INFO.u32All, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+
+    r700->render_target[id].enabled = GL_TRUE; /* r700SendRenderTargetState emits only slots with this flag set */
+}
+
+static void r700SetDepthTarget(context_t *context)
+{   /* Rebuilds the DB_DEPTH_SIZE / BASE / INFO / VIEW register values from the current depth renderbuffer. */
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+    struct radeon_renderbuffer *rrb;
+    unsigned int nPitchInPixel;
+
+    rrb = radeon_get_depthbuffer(&context->radeon);
+    if (!rrb)
+	    return; /* no depth buffer: registers are left as they were */
+
+    R600_STATECHANGE(context, db_target);
+
+    /* depth buf */
+    r700->DB_DEPTH_SIZE.u32All = 0; /* all four words start from zero; BASE and VIEW are not modified again in this function */
+    r700->DB_DEPTH_BASE.u32All = 0;
+    r700->DB_DEPTH_INFO.u32All = 0;
+    r700->DB_DEPTH_VIEW.u32All = 0;
+
+    nPitchInPixel = rrb->pitch/rrb->cpp;
+
+    SETfield(r700->DB_DEPTH_SIZE.u32All, (nPitchInPixel/8)-1,
+             PITCH_TILE_MAX_shift, PITCH_TILE_MAX_mask);
+    SETfield(r700->DB_DEPTH_SIZE.u32All, ( (nPitchInPixel * context->radeon.radeonScreen->driScreen->fbHeight)/64 )-1,
+             SLICE_TILE_MAX_shift, SLICE_TILE_MAX_mask); /* size in pixel / 64 - 1 */
+
+    if(4 == rrb->cpp) /* cpp of 4 selects DEPTH_8_24; any other cpp falls through to DEPTH_16 */
+    {
+        SETfield(r700->DB_DEPTH_INFO.u32All, DEPTH_8_24,
+                 DB_DEPTH_INFO__FORMAT_shift, DB_DEPTH_INFO__FORMAT_mask);
+    }
+    else
+    {
+        SETfield(r700->DB_DEPTH_INFO.u32All, DEPTH_16,
+                     DB_DEPTH_INFO__FORMAT_shift, DB_DEPTH_INFO__FORMAT_mask);
+    }
+    SETfield(r700->DB_DEPTH_INFO.u32All, ARRAY_2D_TILED_THIN1,
+             DB_DEPTH_INFO__ARRAY_MODE_shift, DB_DEPTH_INFO__ARRAY_MODE_mask);
+    /* r700->DB_PREFETCH_LIMIT.bits.DEPTH_HEIGHT_TILE_MAX = (context->currentDraw->h >> 3) - 1; */ /* the z buffer size may be much bigger than what is needed, so use the actually used h. */
+}
+
+static void r700SendDepthTargetState(GLcontext *ctx, struct radeon_state_atom *atom)
+{   /* Recomputes the depth-target registers via r700SetDepthTarget() and emits them with a relocation against the depth bo. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	struct radeon_renderbuffer *rrb;
+	BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	rrb = radeon_get_depthbuffer(&context->radeon);
+	if (!rrb || !rrb->bo) {
+		fprintf(stderr, "no rrb\n");
+		return; /* nothing is emitted without a depth buffer and its bo */
+	}
+
+	r700SetDepthTarget(context);
+
+        BEGIN_BATCH_NO_AUTOSTATE(8 + 2);
+	R600_OUT_BATCH_REGSEQ(DB_DEPTH_SIZE, 2); /* first pair: SIZE then VIEW */
+	R600_OUT_BATCH(r700->DB_DEPTH_SIZE.u32All);
+	R600_OUT_BATCH(r700->DB_DEPTH_VIEW.u32All);
+	R600_OUT_BATCH_REGSEQ(DB_DEPTH_BASE, 2); /* second pair: BASE then INFO */
+	R600_OUT_BATCH(r700->DB_DEPTH_BASE.u32All);
+	R600_OUT_BATCH(r700->DB_DEPTH_INFO.u32All);
+	R600_OUT_BATCH_RELOC(r700->DB_DEPTH_BASE.u32All,
+			     rrb->bo,
+			     r700->DB_DEPTH_BASE.u32All,
+			     0, RADEON_GEM_DOMAIN_VRAM, 0);
+        END_BATCH();
+
+	if ((context->radeon.radeonScreen->chip_family > CHIP_FAMILY_R600) && /* only families strictly between R600 and RV770 get the extra packet */
+	    (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)) {
+		BEGIN_BATCH_NO_AUTOSTATE(2);
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SURFACE_BASE_UPDATE, 0));
+		R600_OUT_BATCH(1 << 0); /* bit 0; r700SendRenderTargetState sends (2 << id) here -- presumably bit 0 is the depth base, confirm */
+		END_BATCH();
+	}
+
+	COMMIT_BATCH();
+
+}
+
+static void r700SendRenderTargetState(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+	/* Recomputes colour target `id` (always 0 here) via r700SetRenderTarget() and emits its CB_COLOR0_* registers.
+	 * id is range-checked before anything indexes render_target[id] (assumes R700_MAX_RENDER_TARGETS entries -- TODO confirm). */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	struct radeon_renderbuffer *rrb;
+	BATCH_LOCALS(&context->radeon);
+	int id = 0;
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	rrb = radeon_get_colorbuffer(&context->radeon);
+	if (!rrb || !rrb->bo) {
+		fprintf(stderr, "no rrb\n");
+		return;
+	}
+
+	if (id < 0 || id >= R700_MAX_RENDER_TARGETS)
+		return;
+
+	r700SetRenderTarget(context, id);
+
+	if (!r700->render_target[id].enabled)
+		return;
+
+        BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
+	R600_OUT_BATCH_REGSEQ(CB_COLOR0_BASE + (4 * id), 1);
+	R600_OUT_BATCH(r700->render_target[id].CB_COLOR0_BASE.u32All);
+	R600_OUT_BATCH_RELOC(r700->render_target[id].CB_COLOR0_BASE.u32All,
+			     rrb->bo,
+			     r700->render_target[id].CB_COLOR0_BASE.u32All,
+			     0, RADEON_GEM_DOMAIN_VRAM, 0);
+        END_BATCH();
+
+	if ((context->radeon.radeonScreen->chip_family > CHIP_FAMILY_R600) && /* only families strictly between R600 and RV770 */
+	    (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)) {
+		BEGIN_BATCH_NO_AUTOSTATE(2);
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SURFACE_BASE_UPDATE, 0));
+		R600_OUT_BATCH((2 << id));
+		END_BATCH();
+	}
+
+        BEGIN_BATCH_NO_AUTOSTATE(18);
+	R600_OUT_BATCH_REGVAL(CB_COLOR0_SIZE + (4 * id), r700->render_target[id].CB_COLOR0_SIZE.u32All);
+	R600_OUT_BATCH_REGVAL(CB_COLOR0_VIEW + (4 * id), r700->render_target[id].CB_COLOR0_VIEW.u32All);
+	R600_OUT_BATCH_REGVAL(CB_COLOR0_INFO + (4 * id), r700->render_target[id].CB_COLOR0_INFO.u32All);
+	R600_OUT_BATCH_REGVAL(CB_COLOR0_TILE + (4 * id), r700->render_target[id].CB_COLOR0_TILE.u32All);
+	R600_OUT_BATCH_REGVAL(CB_COLOR0_FRAG + (4 * id), r700->render_target[id].CB_COLOR0_FRAG.u32All);
+	R600_OUT_BATCH_REGVAL(CB_COLOR0_MASK + (4 * id), r700->render_target[id].CB_COLOR0_MASK.u32All);
+        END_BATCH();
+	COMMIT_BATCH();
+}
+
+static void r700SendPSState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "ps" atom: pixel shader start address, resources, exports and CF offset. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	struct radeon_bo * pbo;
+	BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	pbo = (struct radeon_bo *)r700GetActiveFpShaderBo(GL_CONTEXT(context));
+
+	if (!pbo)	/* no buffer object for the active fragment program: emit nothing */
+		return;
+
+	r700SyncSurf(context, pbo, RADEON_GEM_DOMAIN_GTT, 0, SH_ACTION_ENA_bit);	/* NOTE(review): emits outside the batches below; its dwords are presumably part of the 21 reserved for this atom */
+
+        BEGIN_BATCH_NO_AUTOSTATE(3 + 2);	/* one register write (3 dwords) + relocation (2 dwords) */
+	R600_OUT_BATCH_REGSEQ(SQ_PGM_START_PS, 1);
+	R600_OUT_BATCH(r700->ps.SQ_PGM_START_PS.u32All);
+	R600_OUT_BATCH_RELOC(r700->ps.SQ_PGM_START_PS.u32All,
+			     pbo,
+			     r700->ps.SQ_PGM_START_PS.u32All,
+			     RADEON_GEM_DOMAIN_GTT, 0, 0);
+	END_BATCH();
+
+        BEGIN_BATCH_NO_AUTOSTATE(9);	/* three single-register writes, 3 dwords each */
+	R600_OUT_BATCH_REGVAL(SQ_PGM_RESOURCES_PS, r700->ps.SQ_PGM_RESOURCES_PS.u32All);
+	R600_OUT_BATCH_REGVAL(SQ_PGM_EXPORTS_PS, r700->ps.SQ_PGM_EXPORTS_PS.u32All);
+	R600_OUT_BATCH_REGVAL(SQ_PGM_CF_OFFSET_PS, r700->ps.SQ_PGM_CF_OFFSET_PS.u32All);
+        END_BATCH();
+
+	COMMIT_BATCH();
+
+}
+
+static void r700SendVSState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "vs" atom: vertex shader start address, resources and CF offset. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	struct radeon_bo * pbo;
+	BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	pbo = (struct radeon_bo *)r700GetActiveVpShaderBo(GL_CONTEXT(context));
+
+	if (!pbo)	/* no buffer object for the active vertex program: emit nothing */
+		return;
+
+	r700SyncSurf(context, pbo, RADEON_GEM_DOMAIN_GTT, 0, SH_ACTION_ENA_bit);	/* NOTE(review): emits outside the batches below; its dwords are presumably part of the 18 reserved for this atom */
+
+        BEGIN_BATCH_NO_AUTOSTATE(3 + 2);	/* one register write (3 dwords) + relocation (2 dwords) */
+	R600_OUT_BATCH_REGSEQ(SQ_PGM_START_VS, 1);
+	R600_OUT_BATCH(r700->vs.SQ_PGM_START_VS.u32All);
+	R600_OUT_BATCH_RELOC(r700->vs.SQ_PGM_START_VS.u32All,
+			     pbo,
+			     r700->vs.SQ_PGM_START_VS.u32All,
+			     RADEON_GEM_DOMAIN_GTT, 0, 0);
+	END_BATCH();
+
+        BEGIN_BATCH_NO_AUTOSTATE(6);	/* two single-register writes, 3 dwords each */
+	R600_OUT_BATCH_REGVAL(SQ_PGM_RESOURCES_VS, r700->vs.SQ_PGM_RESOURCES_VS.u32All);
+	R600_OUT_BATCH_REGVAL(SQ_PGM_CF_OFFSET_VS, r700->vs.SQ_PGM_CF_OFFSET_VS.u32All);
+        END_BATCH();
+
+	COMMIT_BATCH();
+}
+
+static void r700SendFSState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "fs" atom: fetch shader registers, currently aliased to the vertex shader. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	struct radeon_bo * pbo;
+	BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	/* XXX fixme
+	 * R6xx chips require a FS be emitted, even if it's not used.
+	 * since we aren't using FS yet, just send the VS address to make
+	 * the kernel command checker happy
+	 */
+	pbo = (struct radeon_bo *)r700GetActiveVpShaderBo(GL_CONTEXT(context));
+	r700->fs.SQ_PGM_START_FS.u32All = r700->vs.SQ_PGM_START_VS.u32All;	/* these three stores happen even when pbo is NULL */
+	r700->fs.SQ_PGM_RESOURCES_FS.u32All = 0;
+	r700->fs.SQ_PGM_CF_OFFSET_FS.u32All = 0;
+	/* XXX */
+
+	if (!pbo)	/* no vertex program buffer object: emit nothing */
+		return;
+
+	r700SyncSurf(context, pbo, RADEON_GEM_DOMAIN_GTT, 0, SH_ACTION_ENA_bit);
+
+        BEGIN_BATCH_NO_AUTOSTATE(3 + 2);	/* one register write (3 dwords) + relocation (2 dwords) */
+	R600_OUT_BATCH_REGSEQ(SQ_PGM_START_FS, 1);
+	R600_OUT_BATCH(r700->fs.SQ_PGM_START_FS.u32All);
+	R600_OUT_BATCH_RELOC(r700->fs.SQ_PGM_START_FS.u32All,
+			     pbo,
+			     r700->fs.SQ_PGM_START_FS.u32All,
+			     RADEON_GEM_DOMAIN_GTT, 0, 0);
+	END_BATCH();
+
+        BEGIN_BATCH_NO_AUTOSTATE(6);	/* two single-register writes, 3 dwords each */
+	R600_OUT_BATCH_REGVAL(SQ_PGM_RESOURCES_FS, r700->fs.SQ_PGM_RESOURCES_FS.u32All);
+	R600_OUT_BATCH_REGVAL(SQ_PGM_CF_OFFSET_FS, r700->fs.SQ_PGM_CF_OFFSET_FS.u32All);
+        END_BATCH();
+
+	COMMIT_BATCH();
+
+}
+
+static void r700SendViewportState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "vpt" atom: scissor, Z range and transform registers of viewport `id`. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+	int id = 0;	/* only viewport 0 is emitted by this function */
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	if (id >= R700_MAX_VIEWPORTS)	/* valid viewport indices are 0 .. R700_MAX_VIEWPORTS - 1 */
+		return;
+
+	if (!r700->viewport[id].enabled)
+		return;
+
+        BEGIN_BATCH_NO_AUTOSTATE(16);	/* (2 + 2) + (2 + 2) + (2 + 6) dwords */
+	R600_OUT_BATCH_REGSEQ(PA_SC_VPORT_SCISSOR_0_TL + (8 * id), 2);
+	R600_OUT_BATCH(r700->viewport[id].PA_SC_VPORT_SCISSOR_0_TL.u32All);
+	R600_OUT_BATCH(r700->viewport[id].PA_SC_VPORT_SCISSOR_0_BR.u32All);
+	R600_OUT_BATCH_REGSEQ(PA_SC_VPORT_ZMIN_0 + (8 * id), 2);
+	R600_OUT_BATCH(r700->viewport[id].PA_SC_VPORT_ZMIN_0.u32All);
+	R600_OUT_BATCH(r700->viewport[id].PA_SC_VPORT_ZMAX_0.u32All);
+	R600_OUT_BATCH_REGSEQ(PA_CL_VPORT_XSCALE_0 + (24 * id), 6);
+	R600_OUT_BATCH(r700->viewport[id].PA_CL_VPORT_XSCALE.u32All);
+	R600_OUT_BATCH(r700->viewport[id].PA_CL_VPORT_XOFFSET.u32All);
+	R600_OUT_BATCH(r700->viewport[id].PA_CL_VPORT_YSCALE.u32All);
+	R600_OUT_BATCH(r700->viewport[id].PA_CL_VPORT_YOFFSET.u32All);
+	R600_OUT_BATCH(r700->viewport[id].PA_CL_VPORT_ZSCALE.u32All);
+	R600_OUT_BATCH(r700->viewport[id].PA_CL_VPORT_ZOFFSET.u32All);
+        END_BATCH();
+
+	COMMIT_BATCH();
+
+}
+
+static void r700SendSQConfig(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "sq" atom: SQ configuration, misc TA/VC/DB registers and ring item sizes. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+        BEGIN_BATCH_NO_AUTOSTATE(34);	/* (2 + 6) + 5 * 3 + (2 + 9) dwords */
+	R600_OUT_BATCH_REGSEQ(SQ_CONFIG, 6);
+	R600_OUT_BATCH(r700->sq_config.SQ_CONFIG.u32All);
+	R600_OUT_BATCH(r700->sq_config.SQ_GPR_RESOURCE_MGMT_1.u32All);
+	R600_OUT_BATCH(r700->sq_config.SQ_GPR_RESOURCE_MGMT_2.u32All);
+	R600_OUT_BATCH(r700->sq_config.SQ_THREAD_RESOURCE_MGMT.u32All);
+	R600_OUT_BATCH(r700->sq_config.SQ_STACK_RESOURCE_MGMT_1.u32All);
+	R600_OUT_BATCH(r700->sq_config.SQ_STACK_RESOURCE_MGMT_2.u32All);
+
+	R600_OUT_BATCH_REGVAL(TA_CNTL_AUX, r700->TA_CNTL_AUX.u32All);
+	R600_OUT_BATCH_REGVAL(VC_ENHANCE, r700->VC_ENHANCE.u32All);
+	R600_OUT_BATCH_REGVAL(R7xx_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, r700->SQ_DYN_GPR_CNTL_PS_FLUSH_REQ.u32All);	/* NOTE(review): R7xx-named register written with no chip-family check - confirm */
+	R600_OUT_BATCH_REGVAL(DB_DEBUG, r700->DB_DEBUG.u32All);
+	R600_OUT_BATCH_REGVAL(DB_WATERMARKS, r700->DB_WATERMARKS.u32All);
+
+	R600_OUT_BATCH_REGSEQ(SQ_ESGS_RING_ITEMSIZE, 9);
+	R600_OUT_BATCH(r700->SQ_ESGS_RING_ITEMSIZE.u32All);
+	R600_OUT_BATCH(r700->SQ_GSVS_RING_ITEMSIZE.u32All);
+	R600_OUT_BATCH(r700->SQ_ESTMP_RING_ITEMSIZE.u32All);
+	R600_OUT_BATCH(r700->SQ_GSTMP_RING_ITEMSIZE.u32All);
+	R600_OUT_BATCH(r700->SQ_VSTMP_RING_ITEMSIZE.u32All);
+	R600_OUT_BATCH(r700->SQ_PSTMP_RING_ITEMSIZE.u32All);
+	R600_OUT_BATCH(r700->SQ_FBUF_RING_ITEMSIZE.u32All);
+	R600_OUT_BATCH(r700->SQ_REDUC_RING_ITEMSIZE.u32All);
+	R600_OUT_BATCH(r700->SQ_GS_VERT_ITEMSIZE.u32All);
+        END_BATCH();
+
+	COMMIT_BATCH();
+}
+
+static void r700SendUCPState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "ucp" atom: writes X/Y/Z/W of every enabled user clip plane. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+	int i;
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	for (i = 0; i < R700_MAX_UCP; i++) {
+		if (r700->ucp[i].enabled) {	/* disabled planes emit nothing; check_ucp counts the same way */
+			BEGIN_BATCH_NO_AUTOSTATE(6);	/* 2 + 4 dwords per plane */
+			R600_OUT_BATCH_REGSEQ(PA_CL_UCP_0_X + (16 * i), 4);	/* plane i's registers start 16 bytes after plane i-1's */
+			R600_OUT_BATCH(r700->ucp[i].PA_CL_UCP_0_X.u32All);
+			R600_OUT_BATCH(r700->ucp[i].PA_CL_UCP_0_Y.u32All);
+			R600_OUT_BATCH(r700->ucp[i].PA_CL_UCP_0_Z.u32All);
+			R600_OUT_BATCH(r700->ucp[i].PA_CL_UCP_0_W.u32All);
+			END_BATCH();
+			COMMIT_BATCH();
+		}
+	}
+}
+
+static void r700SendSPIState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "spi" atom: vertex semantics, VS output ids, SPI config and PS input controls. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+	unsigned int ui;
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	BEGIN_BATCH_NO_AUTOSTATE(59 + R700_MAX_SHADER_EXPORTS);	/* 59 = (2 + 32) + (2 + 10) + (2 + 9) + 2 */
+
+	R600_OUT_BATCH_REGSEQ(SQ_VTX_SEMANTIC_0, 32);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_0.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_1.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_2.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_3.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_4.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_5.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_6.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_7.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_8.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_9.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_10.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_11.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_12.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_13.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_14.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_15.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_16.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_17.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_18.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_19.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_20.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_21.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_22.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_23.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_24.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_25.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_26.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_27.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_28.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_29.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_30.u32All);
+	R600_OUT_BATCH(r700->SQ_VTX_SEMANTIC_31.u32All);
+
+	R600_OUT_BATCH_REGSEQ(SPI_VS_OUT_ID_0, 10);
+	R600_OUT_BATCH(r700->SPI_VS_OUT_ID_0.u32All);
+	R600_OUT_BATCH(r700->SPI_VS_OUT_ID_1.u32All);
+	R600_OUT_BATCH(r700->SPI_VS_OUT_ID_2.u32All);
+	R600_OUT_BATCH(r700->SPI_VS_OUT_ID_3.u32All);
+	R600_OUT_BATCH(r700->SPI_VS_OUT_ID_4.u32All);
+	R600_OUT_BATCH(r700->SPI_VS_OUT_ID_5.u32All);
+	R600_OUT_BATCH(r700->SPI_VS_OUT_ID_6.u32All);
+	R600_OUT_BATCH(r700->SPI_VS_OUT_ID_7.u32All);
+	R600_OUT_BATCH(r700->SPI_VS_OUT_ID_8.u32All);
+	R600_OUT_BATCH(r700->SPI_VS_OUT_ID_9.u32All);
+
+	R600_OUT_BATCH_REGSEQ(SPI_VS_OUT_CONFIG, 9);
+	R600_OUT_BATCH(r700->SPI_VS_OUT_CONFIG.u32All);
+	R600_OUT_BATCH(r700->SPI_THREAD_GROUPING.u32All);
+	R600_OUT_BATCH(r700->SPI_PS_IN_CONTROL_0.u32All);
+	R600_OUT_BATCH(r700->SPI_PS_IN_CONTROL_1.u32All);
+	R600_OUT_BATCH(r700->SPI_INTERP_CONTROL_0.u32All);
+	R600_OUT_BATCH(r700->SPI_INPUT_Z.u32All);
+	R600_OUT_BATCH(r700->SPI_FOG_CNTL.u32All);
+	R600_OUT_BATCH(r700->SPI_FOG_FUNC_SCALE.u32All);
+	R600_OUT_BATCH(r700->SPI_FOG_FUNC_BIAS.u32All);
+
+	R600_OUT_BATCH_REGSEQ(SPI_PS_INPUT_CNTL_0, R700_MAX_SHADER_EXPORTS);	/* all entries are written, used or not */
+	for(ui = 0; ui < R700_MAX_SHADER_EXPORTS; ui++)
+		R600_OUT_BATCH(r700->SPI_PS_INPUT_CNTL[ui].u32All);
+
+	END_BATCH();
+	COMMIT_BATCH();
+}
+
+static void r700SendVGTState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "vgt" atom: writes the VGT_* registers cached in the chip context. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+        BEGIN_BATCH_NO_AUTOSTATE(41);	/* (2 + 4) + (2 + 13) + 4 * 3 + (2 + 3) + 3 dwords */
+
+	R600_OUT_BATCH_REGSEQ(VGT_MAX_VTX_INDX, 4);
+	R600_OUT_BATCH(r700->VGT_MAX_VTX_INDX.u32All);
+	R600_OUT_BATCH(r700->VGT_MIN_VTX_INDX.u32All);
+	R600_OUT_BATCH(r700->VGT_INDX_OFFSET.u32All);
+	R600_OUT_BATCH(r700->VGT_MULTI_PRIM_IB_RESET_INDX.u32All);
+
+	R600_OUT_BATCH_REGSEQ(VGT_OUTPUT_PATH_CNTL, 13);
+	R600_OUT_BATCH(r700->VGT_OUTPUT_PATH_CNTL.u32All);
+	R600_OUT_BATCH(r700->VGT_HOS_CNTL.u32All);
+	R600_OUT_BATCH(r700->VGT_HOS_MAX_TESS_LEVEL.u32All);
+	R600_OUT_BATCH(r700->VGT_HOS_MIN_TESS_LEVEL.u32All);
+	R600_OUT_BATCH(r700->VGT_HOS_REUSE_DEPTH.u32All);
+	R600_OUT_BATCH(r700->VGT_GROUP_PRIM_TYPE.u32All);
+	R600_OUT_BATCH(r700->VGT_GROUP_FIRST_DECR.u32All);
+	R600_OUT_BATCH(r700->VGT_GROUP_DECR.u32All);
+	R600_OUT_BATCH(r700->VGT_GROUP_VECT_0_CNTL.u32All);
+	R600_OUT_BATCH(r700->VGT_GROUP_VECT_1_CNTL.u32All);
+	R600_OUT_BATCH(r700->VGT_GROUP_VECT_0_FMT_CNTL.u32All);
+	R600_OUT_BATCH(r700->VGT_GROUP_VECT_1_FMT_CNTL.u32All);
+	R600_OUT_BATCH(r700->VGT_GS_MODE.u32All);
+
+	R600_OUT_BATCH_REGVAL(VGT_PRIMITIVEID_EN, r700->VGT_PRIMITIVEID_EN.u32All);
+	R600_OUT_BATCH_REGVAL(VGT_MULTI_PRIM_IB_RESET_EN, r700->VGT_MULTI_PRIM_IB_RESET_EN.u32All);
+	R600_OUT_BATCH_REGVAL(VGT_INSTANCE_STEP_RATE_0, r700->VGT_INSTANCE_STEP_RATE_0.u32All);
+	R600_OUT_BATCH_REGVAL(VGT_INSTANCE_STEP_RATE_1, r700->VGT_INSTANCE_STEP_RATE_1.u32All);
+
+	R600_OUT_BATCH_REGSEQ(VGT_STRMOUT_EN, 3);
+	R600_OUT_BATCH(r700->VGT_STRMOUT_EN.u32All);
+	R600_OUT_BATCH(r700->VGT_REUSE_OFF.u32All);
+	R600_OUT_BATCH(r700->VGT_VTX_CNT_EN.u32All);
+
+	R600_OUT_BATCH_REGVAL(VGT_STRMOUT_BUFFER_EN, r700->VGT_STRMOUT_BUFFER_EN.u32All);
+
+	END_BATCH();
+	COMMIT_BATCH();
+}
+
+static void r700SendSXState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "sx" atom: SX_MISC plus the alpha test control and reference registers. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+        BEGIN_BATCH_NO_AUTOSTATE(9);	/* three single-register writes, 3 dwords each */
+	R600_OUT_BATCH_REGVAL(SX_MISC, r700->SX_MISC.u32All);
+	R600_OUT_BATCH_REGVAL(SX_ALPHA_TEST_CONTROL, r700->SX_ALPHA_TEST_CONTROL.u32All);
+	R600_OUT_BATCH_REGVAL(SX_ALPHA_REF, r700->SX_ALPHA_REF.u32All);
+	END_BATCH();
+	COMMIT_BATCH();
+}
+
+static void r700SendDBState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "db" atom: depth/stencil clear values, control, HTILE and alpha-to-mask registers. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+        BEGIN_BATCH_NO_AUTOSTATE(23);	/* 3 + (2 + 2) + 3 + 3 + (2 + 2) + 3 + 3 dwords */
+	R600_OUT_BATCH_REGVAL(DB_HTILE_DATA_BASE, r700->DB_HTILE_DATA_BASE.u32All);
+
+	R600_OUT_BATCH_REGSEQ(DB_STENCIL_CLEAR, 2);
+	R600_OUT_BATCH(r700->DB_STENCIL_CLEAR.u32All);
+	R600_OUT_BATCH(r700->DB_DEPTH_CLEAR.u32All);
+
+	R600_OUT_BATCH_REGVAL(DB_DEPTH_CONTROL, r700->DB_DEPTH_CONTROL.u32All);
+	R600_OUT_BATCH_REGVAL(DB_SHADER_CONTROL, r700->DB_SHADER_CONTROL.u32All);
+
+	R600_OUT_BATCH_REGSEQ(DB_RENDER_CONTROL, 2);
+	R600_OUT_BATCH(r700->DB_RENDER_CONTROL.u32All);
+	R600_OUT_BATCH(r700->DB_RENDER_OVERRIDE.u32All);
+
+	R600_OUT_BATCH_REGVAL(DB_HTILE_SURFACE, r700->DB_HTILE_SURFACE.u32All);
+	R600_OUT_BATCH_REGVAL(DB_ALPHA_TO_MASK, r700->DB_ALPHA_TO_MASK.u32All);
+
+	END_BATCH();
+	COMMIT_BATCH();
+}
+
+static void r700SendStencilState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "stencil" atom: DB_STENCILREFMASK and its _BF counterpart. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+
+        BEGIN_BATCH_NO_AUTOSTATE(4);	/* 2-dword sequence header + 2 register values */
+	R600_OUT_BATCH_REGSEQ(DB_STENCILREFMASK, 2);
+	R600_OUT_BATCH(r700->DB_STENCILREFMASK.u32All);
+	R600_OUT_BATCH(r700->DB_STENCILREFMASK_BF.u32All);
+	END_BATCH();
+	COMMIT_BATCH();
+}
+
+static void r700SendCBState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "cb" atom: target/shader masks, plus clear and fog colors on pre-RV770 chips. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770) {	/* the *_R6XX values are only written before RV770 */
+		BEGIN_BATCH_NO_AUTOSTATE(11);	/* (2 + 4) + (2 + 3) dwords; check_cb adds the same 11 */
+		R600_OUT_BATCH_REGSEQ(CB_CLEAR_RED, 4);
+		R600_OUT_BATCH(r700->CB_CLEAR_RED_R6XX.u32All);
+		R600_OUT_BATCH(r700->CB_CLEAR_GREEN_R6XX.u32All);
+		R600_OUT_BATCH(r700->CB_CLEAR_BLUE_R6XX.u32All);
+		R600_OUT_BATCH(r700->CB_CLEAR_ALPHA_R6XX.u32All);
+		R600_OUT_BATCH_REGSEQ(CB_FOG_RED, 3);
+		R600_OUT_BATCH(r700->CB_FOG_RED_R6XX.u32All);
+		R600_OUT_BATCH(r700->CB_FOG_GREEN_R6XX.u32All);
+		R600_OUT_BATCH(r700->CB_FOG_BLUE_R6XX.u32All);
+		END_BATCH();
+	}
+
+	BEGIN_BATCH_NO_AUTOSTATE(7);	/* (2 + 2) + 3 dwords, emitted on every chip family */
+	R600_OUT_BATCH_REGSEQ(CB_TARGET_MASK, 2);
+	R600_OUT_BATCH(r700->CB_TARGET_MASK.u32All);
+	R600_OUT_BATCH(r700->CB_SHADER_MASK.u32All);
+	R600_OUT_BATCH_REGVAL(R7xx_CB_SHADER_CONTROL, r700->CB_SHADER_CONTROL.u32All);	/* NOTE(review): R7xx-named register written with no chip-family check - confirm */
+	END_BATCH();
+	COMMIT_BATCH();
+}
+
+static void r700SendCBCLRCMPState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "clrcmp" atom: the four CB_CLRCMP_* registers. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+
+	BEGIN_BATCH_NO_AUTOSTATE(6);	/* 2-dword sequence header + 4 register values */
+	R600_OUT_BATCH_REGSEQ(CB_CLRCMP_CONTROL, 4);
+	R600_OUT_BATCH(r700->CB_CLRCMP_CONTROL.u32All);
+	R600_OUT_BATCH(r700->CB_CLRCMP_SRC.u32All);
+	R600_OUT_BATCH(r700->CB_CLRCMP_DST.u32All);
+	R600_OUT_BATCH(r700->CB_CLRCMP_MSK.u32All);
+	END_BATCH();
+	COMMIT_BATCH();
+}
+
+static void r700SendCBBlendState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "blnd" atom: color control plus chip-dependent blend control registers. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+	unsigned int ui;
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770) {	/* the single CB_BLEND_CONTROL is only written before RV770 */
+		BEGIN_BATCH_NO_AUTOSTATE(3);
+		R600_OUT_BATCH_REGVAL(CB_BLEND_CONTROL, r700->CB_BLEND_CONTROL.u32All);
+		END_BATCH();
+	}
+
+	BEGIN_BATCH_NO_AUTOSTATE(3);	/* written on every chip family */
+	R600_OUT_BATCH_REGVAL(CB_COLOR_CONTROL, r700->CB_COLOR_CONTROL.u32All);
+	END_BATCH();
+
+	if (context->radeon.radeonScreen->chip_family > CHIP_FAMILY_R600) {	/* per-target blend control on everything newer than R600 */
+		for (ui = 0; ui < R700_MAX_RENDER_TARGETS; ui++) {
+			if (r700->render_target[ui].enabled) {	/* check_blnd counts 3 dwords for each of these */
+				BEGIN_BATCH_NO_AUTOSTATE(3);
+				R600_OUT_BATCH_REGVAL(CB_BLEND0_CONTROL + (4 * ui),
+						      r700->render_target[ui].CB_BLEND0_CONTROL.u32All);
+				END_BATCH();
+			}
+		}
+	}
+
+	COMMIT_BATCH();
+}
+
+static void r700SendCBBlendColorState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "blnd_clr" atom: the red/green/blue/alpha blend color registers. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	BEGIN_BATCH_NO_AUTOSTATE(6);	/* 2-dword sequence header + 4 register values */
+	R600_OUT_BATCH_REGSEQ(CB_BLEND_RED, 4);
+	R600_OUT_BATCH(r700->CB_BLEND_RED.u32All);
+	R600_OUT_BATCH(r700->CB_BLEND_GREEN.u32All);
+	R600_OUT_BATCH(r700->CB_BLEND_BLUE.u32All);
+	R600_OUT_BATCH(r700->CB_BLEND_ALPHA.u32All);
+	END_BATCH();
+	COMMIT_BATCH();
+}
+
+static void r700SendSUState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "su" atom: PA_SU mode control, point size/min-max, line and vertex control. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+
+	BEGIN_BATCH_NO_AUTOSTATE(9);	/* 3 + (2 + 4) dwords */
+	R600_OUT_BATCH_REGVAL(PA_SU_SC_MODE_CNTL, r700->PA_SU_SC_MODE_CNTL.u32All);
+	R600_OUT_BATCH_REGSEQ(PA_SU_POINT_SIZE, 4);
+	R600_OUT_BATCH(r700->PA_SU_POINT_SIZE.u32All);
+	R600_OUT_BATCH(r700->PA_SU_POINT_MINMAX.u32All);
+	R600_OUT_BATCH(r700->PA_SU_LINE_CNTL.u32All);
+	R600_OUT_BATCH(r700->PA_SU_VTX_CNTL.u32All);
+	END_BATCH();
+	COMMIT_BATCH();
+
+}
+
+static void r700SendPolyState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "poly" atom: polygon offset format, clamp, and front/back scale and offset. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+
+	BEGIN_BATCH_NO_AUTOSTATE(10);	/* (2 + 2) + (2 + 4) dwords */
+	R600_OUT_BATCH_REGSEQ(PA_SU_POLY_OFFSET_DB_FMT_CNTL, 2);
+	R600_OUT_BATCH(r700->PA_SU_POLY_OFFSET_DB_FMT_CNTL.u32All);
+	R600_OUT_BATCH(r700->PA_SU_POLY_OFFSET_CLAMP.u32All);
+	R600_OUT_BATCH_REGSEQ(PA_SU_POLY_OFFSET_FRONT_SCALE, 4);
+	R600_OUT_BATCH(r700->PA_SU_POLY_OFFSET_FRONT_SCALE.u32All);
+	R600_OUT_BATCH(r700->PA_SU_POLY_OFFSET_FRONT_OFFSET.u32All);
+	R600_OUT_BATCH(r700->PA_SU_POLY_OFFSET_BACK_SCALE.u32All);
+	R600_OUT_BATCH(r700->PA_SU_POLY_OFFSET_BACK_OFFSET.u32All);
+	END_BATCH();
+	COMMIT_BATCH();
+
+}
+
+static void r700SendCLState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "cl" atom: the PA_CL clip, VTE, VS-out and NaN/Inf control registers. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	BEGIN_BATCH_NO_AUTOSTATE(12);	/* four single-register writes, 3 dwords each */
+	R600_OUT_BATCH_REGVAL(PA_CL_CLIP_CNTL, r700->PA_CL_CLIP_CNTL.u32All);
+	R600_OUT_BATCH_REGVAL(PA_CL_VTE_CNTL, r700->PA_CL_VTE_CNTL.u32All);
+	R600_OUT_BATCH_REGVAL(PA_CL_VS_OUT_CNTL, r700->PA_CL_VS_OUT_CNTL.u32All);
+	R600_OUT_BATCH_REGVAL(PA_CL_NANINF_CNTL, r700->PA_CL_NANINF_CNTL.u32All);
+	END_BATCH();
+	COMMIT_BATCH();
+}
+
+static void r700SendGBState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "gb" atom: the four PA_CL_GB_* clip/discard adjust registers. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+
+	BEGIN_BATCH_NO_AUTOSTATE(6);	/* 2-dword sequence header + 4 register values */
+	R600_OUT_BATCH_REGSEQ(PA_CL_GB_VERT_CLIP_ADJ, 4);
+	R600_OUT_BATCH(r700->PA_CL_GB_VERT_CLIP_ADJ.u32All);
+	R600_OUT_BATCH(r700->PA_CL_GB_VERT_DISC_ADJ.u32All);
+	R600_OUT_BATCH(r700->PA_CL_GB_HORZ_CLIP_ADJ.u32All);
+	R600_OUT_BATCH(r700->PA_CL_GB_HORZ_DISC_ADJ.u32All);
+	END_BATCH();
+	COMMIT_BATCH();
+}
+
+static void r700SendScissorState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "scissor" atom: screen, window, clip-rect and generic scissor registers. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	BEGIN_BATCH_NO_AUTOSTATE(22);	/* (2 + 2) + (2 + 12) + (2 + 2) dwords */
+	R600_OUT_BATCH_REGSEQ(PA_SC_SCREEN_SCISSOR_TL, 2);
+	R600_OUT_BATCH(r700->PA_SC_SCREEN_SCISSOR_TL.u32All);
+	R600_OUT_BATCH(r700->PA_SC_SCREEN_SCISSOR_BR.u32All);
+
+	R600_OUT_BATCH_REGSEQ(PA_SC_WINDOW_OFFSET, 12);
+	R600_OUT_BATCH(r700->PA_SC_WINDOW_OFFSET.u32All);
+	R600_OUT_BATCH(r700->PA_SC_WINDOW_SCISSOR_TL.u32All);
+	R600_OUT_BATCH(r700->PA_SC_WINDOW_SCISSOR_BR.u32All);
+	R600_OUT_BATCH(r700->PA_SC_CLIPRECT_RULE.u32All);
+	R600_OUT_BATCH(r700->PA_SC_CLIPRECT_0_TL.u32All);
+	R600_OUT_BATCH(r700->PA_SC_CLIPRECT_0_BR.u32All);
+	R600_OUT_BATCH(r700->PA_SC_CLIPRECT_1_TL.u32All);
+	R600_OUT_BATCH(r700->PA_SC_CLIPRECT_1_BR.u32All);
+	R600_OUT_BATCH(r700->PA_SC_CLIPRECT_2_TL.u32All);
+	R600_OUT_BATCH(r700->PA_SC_CLIPRECT_2_BR.u32All);
+	R600_OUT_BATCH(r700->PA_SC_CLIPRECT_3_TL.u32All);
+	R600_OUT_BATCH(r700->PA_SC_CLIPRECT_3_BR.u32All);
+
+	R600_OUT_BATCH_REGSEQ(PA_SC_GENERIC_SCISSOR_TL, 2);
+	R600_OUT_BATCH(r700->PA_SC_GENERIC_SCISSOR_TL.u32All);
+	R600_OUT_BATCH(r700->PA_SC_GENERIC_SCISSOR_BR.u32All);
+	END_BATCH();
+	COMMIT_BATCH();
+}
+
+static void r700SendSCState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "sc" atom: edge rule, line stipple, multipass, mode and line control. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	BEGIN_BATCH_NO_AUTOSTATE(15);	/* five single-register writes, 3 dwords each */
+	R600_OUT_BATCH_REGVAL(R7xx_PA_SC_EDGERULE, r700->PA_SC_EDGERULE.u32All);	/* NOTE(review): R7xx-named register written with no chip-family check - confirm */
+	R600_OUT_BATCH_REGVAL(PA_SC_LINE_STIPPLE, r700->PA_SC_LINE_STIPPLE.u32All);
+	R600_OUT_BATCH_REGVAL(PA_SC_MPASS_PS_CNTL, r700->PA_SC_MPASS_PS_CNTL.u32All);
+	R600_OUT_BATCH_REGVAL(PA_SC_MODE_CNTL, r700->PA_SC_MODE_CNTL.u32All);
+	R600_OUT_BATCH_REGVAL(PA_SC_LINE_CNTL, r700->PA_SC_LINE_CNTL.u32All);
+	END_BATCH();
+	COMMIT_BATCH();
+}
+
+static void r700SendAAState(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "aa" atom: AA config, the two sample-location registers and the AA mask. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+
+	BEGIN_BATCH_NO_AUTOSTATE(12);	/* four single-register writes, 3 dwords each */
+	R600_OUT_BATCH_REGVAL(PA_SC_AA_CONFIG, r700->PA_SC_AA_CONFIG.u32All);
+	R600_OUT_BATCH_REGVAL(PA_SC_AA_SAMPLE_LOCS_MCTX, r700->PA_SC_AA_SAMPLE_LOCS_MCTX.u32All);
+	R600_OUT_BATCH_REGVAL(PA_SC_AA_SAMPLE_LOCS_8S_WD1_MCTX, r700->PA_SC_AA_SAMPLE_LOCS_8S_WD1_MCTX.u32All);
+	R600_OUT_BATCH_REGVAL(PA_SC_AA_MASK, r700->PA_SC_AA_MASK.u32All);
+	END_BATCH();
+	COMMIT_BATCH();
+}
+
+static void r700SendPSConsts(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "ps_consts" atom: uploads r700->ps.consts in one SET_ALU_CONST packet. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	int i;
+	BATCH_LOCALS(&context->radeon);
+
+	if (r700->ps.num_consts == 0)	/* nothing to upload; check_ps_consts reports 0 dwords in this case too */
+		return;
+
+	BEGIN_BATCH_NO_AUTOSTATE(2 + (r700->ps.num_consts * 4));	/* packet header + offset dword, then 4 dwords per constant */
+	R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_ALU_CONST, (r700->ps.num_consts * 4)));
+	/* assembler map const from very beginning. */
+	R600_OUT_BATCH(SQ_ALU_CONSTANT_PS_OFFSET * 4);
+	for (i = 0; i < r700->ps.num_consts; i++) {	/* each constant has four 32-bit components */
+		R600_OUT_BATCH(r700->ps.consts[i][0].u32All);
+		R600_OUT_BATCH(r700->ps.consts[i][1].u32All);
+		R600_OUT_BATCH(r700->ps.consts[i][2].u32All);
+		R600_OUT_BATCH(r700->ps.consts[i][3].u32All);
+	}
+	END_BATCH();
+	COMMIT_BATCH();
+}
+
+static void r700SendVSConsts(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Emit callback for the "vs_consts" atom: uploads r700->vs.consts in one SET_ALU_CONST packet. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	int i;
+	BATCH_LOCALS(&context->radeon);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s\n", __func__);
+
+	if (r700->vs.num_consts == 0)	/* nothing to upload; check_vs_consts reports 0 dwords in this case too */
+		return;
+
+	BEGIN_BATCH_NO_AUTOSTATE(2 + (r700->vs.num_consts * 4));	/* packet header + offset dword, then 4 dwords per constant */
+	R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_ALU_CONST, (r700->vs.num_consts * 4)));
+	/* assembler map const from very beginning. */
+	R600_OUT_BATCH(SQ_ALU_CONSTANT_VS_OFFSET * 4);
+	for (i = 0; i < r700->vs.num_consts; i++) {	/* each constant has four 32-bit components */
+		R600_OUT_BATCH(r700->vs.consts[i][0].u32All);
+		R600_OUT_BATCH(r700->vs.consts[i][1].u32All);
+		R600_OUT_BATCH(r700->vs.consts[i][2].u32All);
+		R600_OUT_BATCH(r700->vs.consts[i][3].u32All);
+	}
+	END_BATCH();
+	COMMIT_BATCH();
+}
+
+static int check_always(GLcontext *ctx, struct radeon_state_atom *atom)
+{	/* Check callback for fixed-size atoms: always reports the size stored by ALLOC_STATE; ctx is unused. */
+	return atom->cmd_size;
+}
+
+static int check_cb(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+	/* Dword count for the "cb" atom: 7 on every chip, plus 11 more before RV770. */
+	context_t *context = R700_CONTEXT(ctx);
+	const int pre_rv770 =
+		context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770;
+	const int count = pre_rv770 ? 7 + 11 : 7;
+
+	radeon_print(RADEON_STATE, RADEON_TRACE, "%s %d\n", __func__, count);
+	return count;
+}
+
+static int check_blnd(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+	/* Dword count for the "blnd" atom; every register write costs 3 dwords. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	unsigned int rt = 0;
+	int count;
+
+	/* CB_COLOR_CONTROL is always written; CB_BLEND_CONTROL only before RV770. */
+	count = (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770) ? 6 : 3;
+
+	/* Chips newer than R600 add one blend control per enabled render target. */
+	if (context->radeon.radeonScreen->chip_family > CHIP_FAMILY_R600) {
+		for (; rt < R700_MAX_RENDER_TARGETS; rt++)
+			count += r700->render_target[rt].enabled ? 3 : 0;
+	}
+
+	radeon_print(RADEON_STATE, RADEON_TRACE, "%s %d\n", __func__, count);
+	return count;
+}
+
+static int check_ucp(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+	/* Dword count for the "ucp" atom: 6 for each enabled user clip plane. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	int enabled_planes = 0;
+	int plane = R700_MAX_UCP;
+
+	while (plane-- > 0)
+		if (r700->ucp[plane].enabled)
+			enabled_planes++;
+	radeon_print(RADEON_STATE, RADEON_TRACE, "%s %d\n", __func__, enabled_planes * 6);
+	return enabled_planes * 6;
+}
+
+static int check_vtx(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+	/* Dword count for the "vtx" atom: 18 per tcl.aos_count entry plus a fixed 6, or 0 if there are none. */
+	context_t *context = R700_CONTEXT(ctx);
+	const int per_array = context->radeon.tcl.aos_count * 18;
+	const int count = (per_array != 0) ? per_array + 6 : 0;
+
+	radeon_print(RADEON_STATE, RADEON_TRACE, "%s %d\n", __func__, count);
+
+	return count;
+}
+
+static int check_tx(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+	/* Dword count for the texture atoms: 31 per enabled unit that has a texture object bound. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	unsigned int unit, active = 0;
+
+	for (unit = 0; unit < R700_TEXTURE_NUMBERUNITS; unit++) {
+		if (!ctx->Texture.Unit[unit]._ReallyEnabled)
+			continue;
+		if (r700->textures[unit])
+			active++;
+	}
+	radeon_print(RADEON_STATE, RADEON_TRACE, "%s %d\n", __func__, active);
+	return active * 31;
+}
+
+static int check_ps_consts(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+	/* Dword count for the "ps_consts" atom: 4 per constant plus 2 header dwords, or 0 if there are none. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	const int payload = r700->ps.num_consts * 4;
+	const int count = payload ? payload + 2 : 0;
+
+	radeon_print(RADEON_STATE, RADEON_TRACE, "%s %d\n", __func__, count);
+
+	return count;
+}
+
+static int check_vs_consts(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+	/* Dword count for the "vs_consts" atom: 4 per constant plus 2 header dwords, or 0 if there are none. */
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	const int payload = r700->vs.num_consts * 4;
+	const int count = payload ? payload + 2 : 0;
+
+	radeon_print(RADEON_STATE, RADEON_TRACE, "%s %d\n", __func__, count);
+
+	return count;
+}
+
+#define ALLOC_STATE( ATOM, CHK, SZ, EMIT )	/* set up context->atoms.ATOM and append it to the atom list; needs `context` in scope */	\
+do {									\
+	context->atoms.ATOM.cmd_size = (SZ);	/* value check_always returns */	\
+	context->atoms.ATOM.cmd = NULL;					\
+	context->atoms.ATOM.name = #ATOM;				\
+	context->atoms.ATOM.idx = 0;					\
+	context->atoms.ATOM.check = check_##CHK;	/* CHK picks one of the check_* functions */	\
+	context->atoms.ATOM.dirty = GL_FALSE;				\
+	context->atoms.ATOM.emit = (EMIT);				\
+	context->radeon.hw.max_state_size += (SZ);	/* running total over all atoms */	\
+	insert_at_tail(&context->radeon.hw.atomlist, &context->atoms.ATOM); \
+} while (0)
+
+void r600InitAtoms(context_t *context)
+{	/* Builds the state-atom list: one ALLOC_STATE per atom, giving its check function, dword size and emit callback. */
+	radeon_print(RADEON_STATE, RADEON_NORMAL, "%s %p\n", __func__, context);
+	context->radeon.hw.max_state_size = 10 + 5 + 14; /* start 3d, idle, cb/db flush */
+
+	/* Setup the atom linked list */
+	make_empty_list(&context->radeon.hw.atomlist);
+	context->radeon.hw.atomlist.name = "atom-list";
+
+	ALLOC_STATE(sq, always, 34, r700SendSQConfig);	/* sizes for "always" atoms match the BEGIN_BATCH totals of the emit function */
+	ALLOC_STATE(db, always, 23, r700SendDBState);
+	ALLOC_STATE(stencil, always, 4, r700SendStencilState);
+	ALLOC_STATE(db_target, always, 12, r700SendDepthTargetState);	/* (8 + 2) + 2 for the optional SURFACE_BASE_UPDATE */
+	ALLOC_STATE(sc, always, 15, r700SendSCState);
+	ALLOC_STATE(scissor, always, 22, r700SendScissorState);
+	ALLOC_STATE(aa, always, 12, r700SendAAState);
+	ALLOC_STATE(cl, always, 12, r700SendCLState);
+	ALLOC_STATE(gb, always, 6, r700SendGBState);
+	ALLOC_STATE(ucp, ucp, (R700_MAX_UCP * 6), r700SendUCPState);
+	ALLOC_STATE(su, always, 9, r700SendSUState);
+	ALLOC_STATE(poly, always, 10, r700SendPolyState);
+	ALLOC_STATE(cb, cb, 18, r700SendCBState);	/* 7 + 11, the larger value check_cb can return */
+	ALLOC_STATE(clrcmp, always, 6, r700SendCBCLRCMPState);
+	ALLOC_STATE(blnd, blnd, (6 + (R700_MAX_RENDER_TARGETS * 3)), r700SendCBBlendState);
+	ALLOC_STATE(blnd_clr, always, 6, r700SendCBBlendColorState);
+	ALLOC_STATE(cb_target, always, 25, r700SendRenderTargetState);	/* (3 + 2) + 2 + 18 */
+	ALLOC_STATE(sx, always, 9, r700SendSXState);
+	ALLOC_STATE(vgt, always, 41, r700SendVGTState);
+	ALLOC_STATE(spi, always, (59 + R700_MAX_SHADER_EXPORTS), r700SendSPIState);
+	ALLOC_STATE(vpt, always, 16, r700SendViewportState);
+	ALLOC_STATE(fs, always, 18, r700SendFSState);	/* batches total 11; the other 7 are presumably r700SyncSurf - TODO confirm */
+	ALLOC_STATE(vs, always, 18, r700SendVSState);	/* same 11 + 7 split as fs */
+	ALLOC_STATE(ps, always, 21, r700SendPSState);	/* batches total 14; the other 7 are presumably r700SyncSurf - TODO confirm */
+	ALLOC_STATE(vs_consts, vs_consts, (2 + (R700_MAX_DX9_CONSTS * 4)), r700SendVSConsts);
+	ALLOC_STATE(ps_consts, ps_consts, (2 + (R700_MAX_DX9_CONSTS * 4)), r700SendPSConsts);
+	ALLOC_STATE(vtx, vtx, (6 + (VERT_ATTRIB_MAX * 18)), r700SendVTXState);
+	ALLOC_STATE(tx, tx, (R700_TEXTURE_NUMBERUNITS * 20), r700SendTexState);	/* NOTE(review): check_tx returns 31 per unit for all three tx atoms (20 + 5 + 6) - confirm intended */
+	ALLOC_STATE(tx_smplr, tx, (R700_TEXTURE_NUMBERUNITS * 5), r700SendTexSamplerState);
+	ALLOC_STATE(tx_brdr_clr, tx, (R700_TEXTURE_NUMBERUNITS * 6), r700SendTexBorderColorState);
+
+	context->radeon.hw.is_dirty = GL_TRUE;	/* both flags set here, after every atom is registered */
+	context->radeon.hw.all_dirty = GL_TRUE;
+}
diff --git a/src/mesa/drivers/dri/r600/r700_chip.h b/src/mesa/drivers/dri/r600/r700_chip.h
new file mode 100644
index 0000000000..ae249e15fd
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_chip.h
@@ -0,0 +1,503 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+#ifndef _R700_CHIP_H_
+#define _R700_CHIP_H_
+
+#include "r600_context.h"
+
+#include "r600_reg.h"
+#include "r600_reg_auto_r6xx.h"
+#include "r600_reg_r6xx.h"
+#include "r600_reg_r7xx.h"
+
+#include "r700_chipoffset.h"
+
+#define SETfield(x, val, shift, mask)  ( (x) = ((x) & ~(mask)) | (((val) << (shift)) & (mask)) ) /* x: u32All lvalue (evaluated twice); mask is the in-place field mask; val is clamped to mask so it cannot spill into neighbouring fields */
+#define CLEARfield(x, mask)            ( (x) &= ~(mask) )   /* zero every bit of x selected by mask */
+#define SETbit(x, bit)                 ( (x) |= (bit) )     /* bit is OR-ed in as given: pass a mask, not a bit index */
+#define CLEARbit(x, bit)               ( (x) &= ~(bit) )    /* bit is a mask, not a bit index */
+
+#define R700_TEXTURE_NUMBERUNITS 16    /* length of R700_CHIP_CONTEXT.textures[] */
+#define R700_MAX_RENDER_TARGETS  8     /* length of R700_CHIP_CONTEXT.render_target[] */
+#define R700_MAX_VIEWPORTS       16    /* length of R700_CHIP_CONTEXT.viewport[] */
+#define R700_MAX_SHADER_EXPORTS  32    /* length of R700_CHIP_CONTEXT.SPI_PS_INPUT_CNTL[] */
+#define R700_MAX_UCP             6     /* length of R700_CHIP_CONTEXT.ucp[] */
+#define R700_MAX_DX9_CONSTS      256   /* rows in the consts[][4] tables of PS_STATE_STRUCT and VS_STATE_STRUCT */
+
+/* Enums that are not defined in r600_*.h */
+
+#define FETCH_RESOURCE_STRIDE 7 /* presumably dwords per fetch-resource slot (cf. SQ_TEX_RESOURCE0..6) -- TODO confirm */
+
+#define ASIC_CONFIG_BASE_INDEX    0x2000 /* presumably the base of the 0x2xxx register offsets quoted in this header (e.g. SQ_CONFIG 0x2300) -- confirm */
+#define ASIC_CONTEXT_BASE_INDEX   0xA000 /* presumably the base of the 0xAxxx register offsets quoted in this header (e.g. DB_DEPTH_SIZE 0xA000) -- confirm */
+#define ASIC_CTL_CONST_BASE_INDEX 0xF3FC
+
+
+/* Selector values: absolute (0) versus relative (1). */
+enum {
+    SQ_ABSOLUTE = 0x00000000,
+    SQ_RELATIVE = 0x00000001,
+};
+
+/* SQ_ALU_SCL_* encodings 0..3 (presumably scalar-ALU bank swizzles -- TODO confirm). */
+enum {
+    SQ_ALU_SCL_210 = 0x00000000,
+    SQ_ALU_SCL_122 = 0x00000001,
+    SQ_ALU_SCL_212 = 0x00000002,
+    SQ_ALU_SCL_221 = 0x00000003,
+};
+
+/* Texture coordinate selector values: unnormalized (0) versus normalized (1). */
+enum {
+    SQ_TEX_UNNORMALIZED = 0x00000000,
+    SQ_TEX_NORMALIZED   = 0x00000001,
+};
+
+/* SQ_CF pixel export codes: MRT0..MRT7 are 0..7; Z is 0x3d. */
+enum {
+    SQ_CF_PIXEL_MRT0 = 0x00000000,
+    SQ_CF_PIXEL_MRT1 = 0x00000001,
+    SQ_CF_PIXEL_MRT2 = 0x00000002,
+    SQ_CF_PIXEL_MRT3 = 0x00000003,
+    SQ_CF_PIXEL_MRT4 = 0x00000004,
+    SQ_CF_PIXEL_MRT5 = 0x00000005,
+    SQ_CF_PIXEL_MRT6 = 0x00000006,
+    SQ_CF_PIXEL_MRT7 = 0x00000007,
+    SQ_CF_PIXEL_Z    = 0x0000003d,
+};
+
+typedef enum ENUM_SQ_CF_ARRAY_BASE_POS { /* SQ_CF position codes POS_0..POS_3 = 0x3c..0x3f */
+    SQ_CF_POS_0 = 0x0000003c,
+    SQ_CF_POS_1 = 0x0000003d,
+    SQ_CF_POS_2 = 0x0000003e,
+    SQ_CF_POS_3 = 0x0000003f,
+} ENUM_SQ_CF_ARRAY_BASE_POS;
+
+/* NOTE(review): 23 is a bit index, not a mask; SETbit()/CLEARbit() in this header apply their argument unshifted -- confirm callers pass (1 << ..._bit). */
+enum {
+    PGM_RESOURCES__PRIME_CACHE_ON_DRAW_bit = 23,
+};
+
+/* Texture filter codes: XY filters, then mip filters. Both groups number from 0, so a value alone does not identify its group. */
+enum {
+    TEX_XYFilter_Point     = 0x00000000,
+    TEX_XYFilter_Linear    = 0x00000001,
+    TEX_XYFilter_Cubic     = 0x00000002,
+    TEX_XYFilter_Cleartype = 0x00000003,
+
+    TEX_MipFilter_None     = 0x00000000,
+    TEX_MipFilter_Point    = 0x00000001,
+    TEX_MipFilter_Linear   = 0x00000002,
+};
+
+/* Export write codes: value bit 0 is set in the _IND variants, value bit 1 in the _ACK variants. */
+enum {
+    SQ_EXPORT_WRITE         = 0x00000000,
+    SQ_EXPORT_WRITE_IND     = 0x00000001,
+    SQ_EXPORT_WRITE_ACK     = 0x00000002,
+    SQ_EXPORT_WRITE_IND_ACK = 0x00000003,
+};
+
+/* --------------------------------- */
+
+/* PM4 packet header bases; the packet type (0..3) sits in bits 31:30. */
+/* Unsigned macros, not enumerators: 0x80000000 and 0xC0000000 do not fit */
+/* in int, which ISO C before C23 requires of every enumeration constant. */
+#define R700_PM4_PACKET0_NOP 0x00000000u
+#define R700_PM4_PACKET1_NOP 0x40000000u
+#define R700_PM4_PACKET2_NOP 0x80000000u
+#define R700_PM4_PACKET3_NOP 0xC0000000u
+
+#define  PM4_OPCODE_SET_INDEX_TYPE      (R700_PM4_PACKET3_NOP | (IT_INDEX_TYPE << 8)) /* each PM4_OPCODE_* is the packet-3 base OR-ed with an IT_* opcode shifted left by 8 */
+
+#define  PM4_OPCODE_DRAW_INDEX_AUTO     (R700_PM4_PACKET3_NOP | (IT_DRAW_INDEX_AUTO << 8))
+#define  PM4_OPCODE_DRAW_INDEX_IMMD     (R700_PM4_PACKET3_NOP | (IT_DRAW_INDEX_IMMD << 8))
+#define  PM4_OPCODE_WAIT_REG_MEM        (R700_PM4_PACKET3_NOP | (IT_WAIT_REG_MEM << 8))
+#define  PM4_OPCODE_SET_CONTEXT_REG     (R700_PM4_PACKET3_NOP | (IT_SET_CONTEXT_REG << 8))
+#define  PM4_OPCODE_SET_CONFIG_REG      (R700_PM4_PACKET3_NOP | (IT_SET_CONFIG_REG << 8))
+#define  PM4_OPCODE_SET_ALU_CONST       (R700_PM4_PACKET3_NOP | (IT_SET_ALU_CONST << 8))
+#define  PM4_OPCODE_SET_RESOURCE        (R700_PM4_PACKET3_NOP | (IT_SET_RESOURCE << 8))
+#define  PM4_OPCODE_SET_SAMPLER         (R700_PM4_PACKET3_NOP | (IT_SET_SAMPLER << 8))
+#define  PM4_OPCODE_CONTEXT_CONTROL     (R700_PM4_PACKET3_NOP | (IT_CONTEXT_CONTROL << 8)) /* IT_* values are not defined in this header */
+
+/* One register value, readable as raw unsigned bits (u32All) or as a float (f32All). */
+union UINT_FLOAT {
+	unsigned int u32All;
+	float        f32All;
+};
+
+#if 0 /* compiled out: nothing in this header uses these types (R700_CHIP_CONTEXT stores radeonTexObj pointers in textures[]) */
+typedef struct _TEXTURE_STATE_STRUCT
+{
+    union UINT_FLOAT     SQ_TEX_RESOURCE0;
+    union UINT_FLOAT     SQ_TEX_RESOURCE1;
+    union UINT_FLOAT     SQ_TEX_RESOURCE2;
+    union UINT_FLOAT     SQ_TEX_RESOURCE3;
+    union UINT_FLOAT     SQ_TEX_RESOURCE4;
+    union UINT_FLOAT     SQ_TEX_RESOURCE5;
+    union UINT_FLOAT     SQ_TEX_RESOURCE6;
+    GLboolean                         enabled;
+} TEXTURE_STATE_STRUCT;
+
+typedef struct _SAMPLER_STATE_STRUCT
+{
+    union UINT_FLOAT      SQ_TEX_SAMPLER0;
+    union UINT_FLOAT      SQ_TEX_SAMPLER1;
+    union UINT_FLOAT      SQ_TEX_SAMPLER2;
+    GLboolean                         enabled;
+} SAMPLER_STATE_STRUCT;
+
+typedef struct _R700_TEXTURE_STATES
+{
+    TEXTURE_STATE_STRUCT *textures[R700_TEXTURE_NUMBERUNITS];
+    SAMPLER_STATE_STRUCT *samplers[R700_TEXTURE_NUMBERUNITS];
+} R700_TEXTURE_STATES;
+#endif /* 0: disabled texture/sampler state structs */
+
+/* Register values for one colour render target plus enabled/dirty flags; the quoted offsets are the target-0 registers. */
+typedef struct _RENDER_TARGET_STATE_STRUCT {
+	union UINT_FLOAT CB_COLOR0_BASE;    /* 0xA010 */
+	union UINT_FLOAT CB_COLOR0_SIZE;    /* 0xA018 */
+	union UINT_FLOAT CB_COLOR0_VIEW;    /* 0xA020 */
+	union UINT_FLOAT CB_COLOR0_INFO;    /* 0xA028 */
+	union UINT_FLOAT CB_COLOR0_TILE;    /* 0xA030 */
+	union UINT_FLOAT CB_COLOR0_FRAG;    /* 0xA038 */
+	union UINT_FLOAT CB_COLOR0_MASK;    /* 0xA040 */
+	union UINT_FLOAT CB_BLEND0_CONTROL; /* 0xA1E0 */
+	GLboolean        enabled;
+	GLboolean        dirty;
+} RENDER_TARGET_STATE_STRUCT;
+
+/* Register values for one viewport (scissor, Z range, XYZ scale/offset) plus enabled/dirty flags. */
+typedef struct _VIEWPORT_STATE_STRUCT {
+	union UINT_FLOAT PA_SC_VPORT_SCISSOR_0_TL; /* 0xA094 */
+	union UINT_FLOAT PA_SC_VPORT_SCISSOR_0_BR; /* 0xA095 */
+	union UINT_FLOAT PA_SC_VPORT_ZMIN_0;       /* 0xA0B4 */
+	union UINT_FLOAT PA_SC_VPORT_ZMAX_0;       /* 0xA0B5 */
+	union UINT_FLOAT PA_CL_VPORT_XSCALE;       /* 0xA10F */
+	union UINT_FLOAT PA_CL_VPORT_XOFFSET;      /* 0xA110 */
+	union UINT_FLOAT PA_CL_VPORT_YSCALE;       /* 0xA111 */
+	union UINT_FLOAT PA_CL_VPORT_YOFFSET;      /* 0xA112 */
+	union UINT_FLOAT PA_CL_VPORT_ZSCALE;       /* 0xA113 */
+	union UINT_FLOAT PA_CL_VPORT_ZOFFSET;      /* 0xA114 */
+	GLboolean        enabled;
+	GLboolean        dirty;
+} VIEWPORT_STATE_STRUCT;
+
+/* One PA_CL_UCP entry: X/Y/Z/W register values plus enabled/dirty flags. */
+typedef struct _UCP_STATE_STRUCT {
+	union UINT_FLOAT PA_CL_UCP_0_X;
+	union UINT_FLOAT PA_CL_UCP_0_Y;
+	union UINT_FLOAT PA_CL_UCP_0_Z;
+	union UINT_FLOAT PA_CL_UCP_0_W;
+	GLboolean        enabled;
+	GLboolean        dirty;
+} UCP_STATE_STRUCT;
+
+/* SQ_PGM_*_PS register values, a dirty flag and a table of 4-component constants (num_consts presumably counts the rows in use). */
+typedef struct _PS_STATE_STRUCT {
+	union UINT_FLOAT SQ_PGM_START_PS;     /* 0xA210 */
+	union UINT_FLOAT SQ_PGM_RESOURCES_PS; /* 0xA214 */
+	union UINT_FLOAT SQ_PGM_EXPORTS_PS;   /* 0xA215 */
+	union UINT_FLOAT SQ_PGM_CF_OFFSET_PS; /* 0xA233 */
+	GLboolean        dirty;
+	int              num_consts;
+	union UINT_FLOAT consts[R700_MAX_DX9_CONSTS][4];
+} PS_STATE_STRUCT;
+
+/* SQ_PGM_*_VS register values, a dirty flag and a table of 4-component constants (num_consts presumably counts the rows in use). */
+typedef struct _VS_STATE_STRUCT {
+	union UINT_FLOAT SQ_PGM_START_VS;     /* 0xA216 */
+	union UINT_FLOAT SQ_PGM_RESOURCES_VS; /* 0xA21A */
+	union UINT_FLOAT SQ_PGM_CF_OFFSET_VS; /* 0xA234 */
+	GLboolean        dirty;
+	int              num_consts;
+	union UINT_FLOAT consts[R700_MAX_DX9_CONSTS][4];
+} VS_STATE_STRUCT;
+
+/* SQ_PGM_*_GS register values plus a dirty flag. */
+typedef struct _GS_STATE_STRUCT {
+	union UINT_FLOAT SQ_PGM_START_GS;     /* 0xA21B */
+	union UINT_FLOAT SQ_PGM_RESOURCES_GS; /* 0xA21F */
+	union UINT_FLOAT SQ_PGM_CF_OFFSET_GS; /* 0xA235 */
+	GLboolean        dirty;
+} GS_STATE_STRUCT;
+
+/* SQ_PGM_*_ES register values plus a dirty flag. */
+typedef struct _ES_STATE_STRUCT {
+	union UINT_FLOAT SQ_PGM_START_ES;     /* 0xA220 */
+	union UINT_FLOAT SQ_PGM_RESOURCES_ES; /* 0xA224 */
+	union UINT_FLOAT SQ_PGM_CF_OFFSET_ES; /* 0xA236 */
+	GLboolean        dirty;
+} ES_STATE_STRUCT;
+
+/* SQ_PGM_*_FS register values plus a dirty flag. */
+typedef struct _FS_STATE_STRUCT {
+	union UINT_FLOAT SQ_PGM_START_FS;     /* 0xA225 */
+	union UINT_FLOAT SQ_PGM_RESOURCES_FS; /* 0xA229 */
+	union UINT_FLOAT SQ_PGM_CF_OFFSET_FS; /* 0xA237 */
+	GLboolean        dirty;
+} FS_STATE_STRUCT;
+
+/* Values for the six consecutive SQ configuration registers at 0x2300..0x2305. */
+typedef struct _SQ_CONFIG_STRUCT {
+	union UINT_FLOAT SQ_CONFIG;                /* 0x2300 */
+	union UINT_FLOAT SQ_GPR_RESOURCE_MGMT_1;   /* 0x2301 */
+	union UINT_FLOAT SQ_GPR_RESOURCE_MGMT_2;   /* 0x2302 */
+	union UINT_FLOAT SQ_THREAD_RESOURCE_MGMT;  /* 0x2303 */
+	union UINT_FLOAT SQ_STACK_RESOURCE_MGMT_1; /* 0x2304 */
+	union UINT_FLOAT SQ_STACK_RESOURCE_MGMT_2; /* 0x2305 */
+} SQ_CONFIG_STRUCT;
+
+typedef struct _R700_CHIP_CONTEXT /* register values grouped by hardware block, with GLboolean *_dirty flags; trailing comments give each register's offset */
+{
+	// DB registers (depth, stencil, HTILE, alpha-to-mask, per the names below)
+	union UINT_FLOAT             	DB_DEPTH_SIZE             ;  /* 0xA000 */
+	union UINT_FLOAT             	DB_DEPTH_VIEW             ;  /* 0xA001 */
+	union UINT_FLOAT             	DB_DEPTH_BASE             ;  /* 0xA003 */
+	union UINT_FLOAT             	DB_DEPTH_INFO             ;  /* 0xA004 */
+	GLboolean                       db_target_dirty;
+	union UINT_FLOAT                DB_HTILE_DATA_BASE        ;  /* 0xA005 */
+	union UINT_FLOAT          	DB_STENCIL_CLEAR          ;  /* 0xA00A */
+	union UINT_FLOAT            	DB_DEPTH_CLEAR            ;  /* 0xA00B */
+	union UINT_FLOAT            	DB_STENCILREFMASK         ;  /* 0xA10C */
+	union UINT_FLOAT            	DB_STENCILREFMASK_BF      ;  /* 0xA10D */
+	union UINT_FLOAT         	DB_RENDER_CONTROL         ;  /* 0xA343 */
+	union UINT_FLOAT        	DB_RENDER_OVERRIDE        ;  /* 0xA344 */
+	union UINT_FLOAT          	DB_HTILE_SURFACE          ;  /* 0xA349 */
+	union UINT_FLOAT          	DB_ALPHA_TO_MASK          ;  /* 0xA351 */
+	union UINT_FLOAT          	DB_DEPTH_CONTROL          ;  /* 0xA200 */
+	union UINT_FLOAT         	DB_SHADER_CONTROL         ;  /* 0xA203 */
+	GLboolean                       db_dirty;
+
+	// PA_SC registers: screen/window/generic scissors, window offset, clip rectangles, edge rule
+	union UINT_FLOAT   	        PA_SC_SCREEN_SCISSOR_TL   ;  /* 0xA00C */
+	union UINT_FLOAT   	        PA_SC_SCREEN_SCISSOR_BR   ;  /* 0xA00D */
+	union UINT_FLOAT       	        PA_SC_WINDOW_OFFSET       ;  /* 0xA080 */
+	union UINT_FLOAT   	        PA_SC_WINDOW_SCISSOR_TL   ;  /* 0xA081 */
+	union UINT_FLOAT   	        PA_SC_WINDOW_SCISSOR_BR   ;  /* 0xA082 */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_RULE       ;  /* 0xA083 */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_0_TL       ;  /* 0xA084 */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_0_BR       ;  /* 0xA085 */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_1_TL       ;  /* 0xA086 */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_1_BR       ;  /* 0xA087 */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_2_TL       ;  /* 0xA088 */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_2_BR       ;  /* 0xA089 */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_3_TL       ;  /* 0xA08A */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_3_BR       ;  /* 0xA08B */
+	union UINT_FLOAT            	PA_SC_EDGERULE            ;  /* 0xA08C */
+	union UINT_FLOAT  	        PA_SC_GENERIC_SCISSOR_TL  ;  /* 0xA090 */
+	union UINT_FLOAT  	        PA_SC_GENERIC_SCISSOR_BR  ;  /* 0xA091 */
+	GLboolean                       scissor_dirty;
+
+	union UINT_FLOAT        	PA_SC_LINE_STIPPLE        ;  /* 0xA283 */
+	union UINT_FLOAT           	PA_SC_LINE_CNTL           ;  /* 0xA300 */
+	union UINT_FLOAT           	PA_SC_AA_CONFIG           ;  /* 0xA301 */
+	union UINT_FLOAT       	        PA_SC_MPASS_PS_CNTL       ;  /* 0xA292 */
+	union UINT_FLOAT           	PA_SC_MODE_CNTL           ;  /* 0xA293 */
+	union UINT_FLOAT 	        PA_SC_AA_SAMPLE_LOCS_MCTX ;  /* 0xA307 */
+	union UINT_FLOAT                PA_SC_AA_SAMPLE_LOCS_8S_WD1_MCTX; /* 0xA308 */
+	union UINT_FLOAT             	PA_SC_AA_MASK             ;  /* 0xA312 */
+	GLboolean                       sc_dirty;
+
+	// PA_CL registers: clip/VTE/VS-out/NaN-Inf control and the GB_* clip/discard adjust values
+	union UINT_FLOAT           	PA_CL_CLIP_CNTL           ;  /* 0xA204 */
+	union UINT_FLOAT            	PA_CL_VTE_CNTL            ;  /* 0xA206 */
+	union UINT_FLOAT         	PA_CL_VS_OUT_CNTL         ;  /* 0xA207 */
+	union UINT_FLOAT         	PA_CL_NANINF_CNTL         ;  /* 0xA208 */
+	union UINT_FLOAT    	        PA_CL_GB_VERT_CLIP_ADJ    ;  /* 0xA303 */
+	union UINT_FLOAT    	        PA_CL_GB_VERT_DISC_ADJ    ;  /* 0xA304 */
+	union UINT_FLOAT    	        PA_CL_GB_HORZ_CLIP_ADJ    ;  /* 0xA305 */
+	union UINT_FLOAT    	        PA_CL_GB_HORZ_DISC_ADJ    ;  /* 0xA306 */
+	GLboolean                       cl_dirty;
+
+	// PA_SU registers: point/line/vertex control and polygon offset
+	union UINT_FLOAT        	PA_SU_SC_MODE_CNTL        ;  /* 0xA205 */
+	union UINT_FLOAT          	PA_SU_POINT_SIZE          ;  /* 0xA280 */
+	union UINT_FLOAT        	PA_SU_POINT_MINMAX        ;  /* 0xA281 */
+	union UINT_FLOAT           	PA_SU_LINE_CNTL           ;  /* 0xA282 */
+	union UINT_FLOAT            	PA_SU_VTX_CNTL            ;  /* 0xA302 */
+	union UINT_FLOAT                PA_SU_POLY_OFFSET_DB_FMT_CNTL;   /* 0xA37E */
+	union UINT_FLOAT   	        PA_SU_POLY_OFFSET_CLAMP   ;      /* 0xA37F */
+	union UINT_FLOAT                PA_SU_POLY_OFFSET_FRONT_SCALE;   /* 0xA380 */
+	union UINT_FLOAT                PA_SU_POLY_OFFSET_FRONT_OFFSET; /* 0xA381 */
+	union UINT_FLOAT                PA_SU_POLY_OFFSET_BACK_SCALE;    /* 0xA382 */
+	union UINT_FLOAT                PA_SU_POLY_OFFSET_BACK_OFFSET;   /* 0xA383 */
+	GLboolean                       su_dirty;
+
+	VIEWPORT_STATE_STRUCT           viewport[R700_MAX_VIEWPORTS];
+	UCP_STATE_STRUCT                ucp[R700_MAX_UCP];
+
+	// CB registers: clear/blend/fog colours, target and shader masks, colour compare, blend control
+	union UINT_FLOAT              	CB_CLEAR_RED_R6XX         ;  /* 0xA048 */
+	union UINT_FLOAT            	CB_CLEAR_GREEN_R6XX       ;  /* 0xA049 */
+	union UINT_FLOAT             	CB_CLEAR_BLUE_R6XX        ;  /* 0xA04A */
+	union UINT_FLOAT            	CB_CLEAR_ALPHA_R6XX       ;  /* 0xA04B */
+	union UINT_FLOAT            	CB_TARGET_MASK            ;  /* 0xA08E */
+	union UINT_FLOAT            	CB_SHADER_MASK            ;  /* 0xA08F */
+	union UINT_FLOAT              	CB_BLEND_RED              ;  /* 0xA105 */
+	union UINT_FLOAT            	CB_BLEND_GREEN            ;  /* 0xA106 */
+	union UINT_FLOAT             	CB_BLEND_BLUE             ;  /* 0xA107 */
+	union UINT_FLOAT            	CB_BLEND_ALPHA            ;  /* 0xA108 */
+	union UINT_FLOAT              	CB_FOG_RED_R6XX           ;  /* 0xA109 */
+	union UINT_FLOAT            	CB_FOG_GREEN_R6XX         ;  /* 0xA10A */
+	union UINT_FLOAT             	CB_FOG_BLUE_R6XX          ;  /* 0xA10B */
+	union UINT_FLOAT         	CB_SHADER_CONTROL         ;  /* 0xA1E8 */
+	union UINT_FLOAT          	CB_COLOR_CONTROL          ;  /* 0xA202 */
+	union UINT_FLOAT         	CB_CLRCMP_CONTROL         ;  /* 0xA30C */
+	union UINT_FLOAT             	CB_CLRCMP_SRC             ;  /* 0xA30D */
+	union UINT_FLOAT             	CB_CLRCMP_DST             ;  /* 0xA30E */
+	union UINT_FLOAT             	CB_CLRCMP_MSK             ;  /* 0xA30F */
+	union UINT_FLOAT             	CB_BLEND_CONTROL          ;  /* 0xABD0 -- NOTE(review): far outside the 0xA0xx-0xA3xx range of every other offset here; verify */
+	GLboolean                       cb_dirty;
+	RENDER_TARGET_STATE_STRUCT      render_target[R700_MAX_RENDER_TARGETS];
+
+	// SX registers: misc and alpha test
+	union UINT_FLOAT                SX_MISC                   ;  /* 0xA0D4 */
+	union UINT_FLOAT     	        SX_ALPHA_TEST_CONTROL     ;  /* 0xA104 */
+	union UINT_FLOAT     	        SX_ALPHA_REF              ;  /* 0xA10E */
+	GLboolean                       sx_dirty;
+
+	// VGT registers: vertex index range, HOS, grouping, GS mode, instancing, stream-out
+	union UINT_FLOAT          	VGT_MAX_VTX_INDX          ;  /* 0xA100 */
+	union UINT_FLOAT          	VGT_MIN_VTX_INDX          ;  /* 0xA101 */
+	union UINT_FLOAT           	VGT_INDX_OFFSET           ;  /* 0xA102 */
+	union UINT_FLOAT                VGT_MULTI_PRIM_IB_RESET_INDX;  /* 0xA103 */
+	union UINT_FLOAT      	        VGT_OUTPUT_PATH_CNTL      ;  /* 0xA284 */
+	union UINT_FLOAT      	        VGT_HOS_CNTL              ;  /* 0xA285 */
+	union UINT_FLOAT      	        VGT_HOS_MAX_TESS_LEVEL    ;  /* 0xA286 */
+	union UINT_FLOAT      	        VGT_HOS_MIN_TESS_LEVEL    ;  /* 0xA287 */
+	union UINT_FLOAT      	        VGT_HOS_REUSE_DEPTH       ;  /* 0xA288 */
+	union UINT_FLOAT      	        VGT_GROUP_PRIM_TYPE       ;  /* 0xA289 */
+	union UINT_FLOAT      	        VGT_GROUP_FIRST_DECR      ;  /* 0xA28A */
+	union UINT_FLOAT      	        VGT_GROUP_DECR            ;  /* 0xA28B */
+	union UINT_FLOAT      	        VGT_GROUP_VECT_0_CNTL     ;  /* 0xA28C */
+	union UINT_FLOAT      	        VGT_GROUP_VECT_1_CNTL     ;  /* 0xA28D */
+	union UINT_FLOAT      	        VGT_GROUP_VECT_0_FMT_CNTL ;  /* 0xA28E */
+	union UINT_FLOAT      	        VGT_GROUP_VECT_1_FMT_CNTL ;  /* 0xA28F */
+	union UINT_FLOAT               	VGT_GS_MODE               ;  /* 0xA290 */
+	union UINT_FLOAT        	VGT_PRIMITIVEID_EN        ;  /* 0xA2A1 */
+	union UINT_FLOAT	        VGT_MULTI_PRIM_IB_RESET_EN;  /* 0xA2A5 */
+	union UINT_FLOAT  	        VGT_INSTANCE_STEP_RATE_0  ;  /* 0xA2A8 */
+	union UINT_FLOAT  	        VGT_INSTANCE_STEP_RATE_1  ;  /* 0xA2A9 */
+	union UINT_FLOAT            	VGT_STRMOUT_EN            ;  /* 0xA2AC */
+	union UINT_FLOAT             	VGT_REUSE_OFF             ;  /* 0xA2AD */
+	union UINT_FLOAT             	VGT_VTX_CNT_EN            ;  /* 0xA2AE */
+	union UINT_FLOAT            	VGT_STRMOUT_BUFFER_EN     ;  /* 0xA2C8 */
+	GLboolean                       vgt_dirty;
+
+	// SPI registers, followed by the SQ_VTX_SEMANTIC_0..31 registers and the SPI_PS_INPUT_CNTL array
+	union UINT_FLOAT           	SPI_VS_OUT_ID_0           ;  /* 0xA185 */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_1           ;  /* 0xA186 */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_2           ;  /* 0xA187 */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_3           ;  /* 0xA188 */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_4           ;  /* 0xA189 */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_5           ;  /* 0xA18A */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_6           ;  /* 0xA18B */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_7           ;  /* 0xA18C */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_8           ;  /* 0xA18D */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_9           ;  /* 0xA18E */
+	union UINT_FLOAT                SPI_VS_OUT_CONFIG         ;  /* 0xA1B1 */
+	union UINT_FLOAT       	        SPI_THREAD_GROUPING       ;  /* 0xA1B2 */
+	union UINT_FLOAT       	        SPI_PS_IN_CONTROL_0       ;  /* 0xA1B3 */
+	union UINT_FLOAT       	        SPI_PS_IN_CONTROL_1       ;  /* 0xA1B4 */
+	union UINT_FLOAT       	        SPI_INTERP_CONTROL_0      ;  /* 0xA1B5 */
+ 	union UINT_FLOAT               	SPI_INPUT_Z               ;  /* 0xA1B6 */
+	union UINT_FLOAT              	SPI_FOG_CNTL              ;  /* 0xA1B7 */
+	union UINT_FLOAT              	SPI_FOG_FUNC_SCALE        ;  /* 0xA1B8 */
+	union UINT_FLOAT              	SPI_FOG_FUNC_BIAS         ;  /* 0xA1B9 */
+
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_0         ;  /* 0xA0E0 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_1         ;  /* 0xA0E1 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_2         ;  /* 0xA0E2 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_3         ;  /* 0xA0E3 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_4         ;  /* 0xA0E4 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_5         ;  /* 0xA0E5 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_6         ;  /* 0xA0E6 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_7         ;  /* 0xA0E7 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_8         ;  /* 0xA0E8 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_9         ;  /* 0xA0E9 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_10        ;  /* 0xA0EA */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_11        ;  /* 0xA0EB */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_12        ;  /* 0xA0EC */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_13        ;  /* 0xA0ED */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_14        ;  /* 0xA0EE */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_15        ;  /* 0xA0EF */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_16        ;  /* 0xA0F0 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_17        ;  /* 0xA0F1 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_18        ;  /* 0xA0F2 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_19        ;  /* 0xA0F3 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_20        ;  /* 0xA0F4 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_21        ;  /* 0xA0F5 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_22        ;  /* 0xA0F6 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_23        ;  /* 0xA0F7 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_24        ;  /* 0xA0F8 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_25        ;  /* 0xA0F9 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_26        ;  /* 0xA0FA */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_27        ;  /* 0xA0FB */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_28        ;  /* 0xA0FC */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_29        ;  /* 0xA0FD */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_30        ;  /* 0xA0FE */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_31        ;  /* 0xA0FF */
+	union UINT_FLOAT       	        SPI_PS_INPUT_CNTL[R700_MAX_SHADER_EXPORTS];
+	GLboolean                       spi_dirty;
+
+	// per-stage shader program state (PS, VS, GS, ES, FS); each carries its own dirty flag
+	PS_STATE_STRUCT                 ps;
+	VS_STATE_STRUCT                 vs;
+	GS_STATE_STRUCT                 gs;
+	ES_STATE_STRUCT                 es;
+	FS_STATE_STRUCT                 fs;
+
+	// SQ CONFIG registers (0x2300..0x2305, see SQ_CONFIG_STRUCT)
+	SQ_CONFIG_STRUCT                sq_config;
+	// misc registers with 0x2xxx offsets; no dirty flag is declared for this group
+	union UINT_FLOAT             	TA_CNTL_AUX               ;  /* 0x2542 */
+	union UINT_FLOAT             	VC_ENHANCE                ;  /* 0x25C5 */
+	union UINT_FLOAT             	SQ_DYN_GPR_CNTL_PS_FLUSH_REQ;  /* 0x2363 */
+	union UINT_FLOAT             	DB_DEBUG                  ;  /* 0x260C */
+	union UINT_FLOAT             	DB_WATERMARKS             ;  /* 0x260E */
+	// SQ ring and GS vertex item-size registers
+	union UINT_FLOAT     	        SQ_ESGS_RING_ITEMSIZE     ;  /* 0xA22A */
+	union UINT_FLOAT     	        SQ_GSVS_RING_ITEMSIZE     ;  /* 0xA22B */
+	union UINT_FLOAT    	        SQ_ESTMP_RING_ITEMSIZE    ;  /* 0xA22C */
+	union UINT_FLOAT    	        SQ_GSTMP_RING_ITEMSIZE    ;  /* 0xA22D */
+	union UINT_FLOAT    	        SQ_VSTMP_RING_ITEMSIZE    ;  /* 0xA22E */
+	union UINT_FLOAT    	        SQ_PSTMP_RING_ITEMSIZE    ;  /* 0xA22F */
+	union UINT_FLOAT     	        SQ_FBUF_RING_ITEMSIZE     ;  /* 0xA230 */
+	union UINT_FLOAT    	        SQ_REDUC_RING_ITEMSIZE    ;  /* 0xA231 */
+	union UINT_FLOAT       	        SQ_GS_VERT_ITEMSIZE       ;  /* 0xA232 */
+	GLboolean                       sq_dirty;
+
+	radeonTexObj*                   textures[R700_TEXTURE_NUMBERUNITS]; /* presumably one texture object pointer per texture unit -- confirm */
+
+	GLboolean                       bEnablePerspective;
+
+} R700_CHIP_CONTEXT;
+
+#endif /* _R700_CHIP_H_ */
+
diff --git a/src/mesa/drivers/dri/r600/r700_chipoffset.h b/src/mesa/drivers/dri/r600/r700_chipoffset.h
new file mode 100644
index 0000000000..4d73fb99a7
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_chipoffset.h
@@ -0,0 +1,693 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ *   CooperYuan <cooper.yuan@amd.com>, <cooperyuan@gmail.com>
+ */
+
+#ifndef _R700_CHIPOFFSET_H_
+#define _R700_CHIPOFFSET_H_
+
+#define mmWAIT_UNTIL                                    0x2010  /* NOTE(review): values in this table look like dword register indices rather than byte addresses -- confirm against the register spec */
+#define mmSCRATCH_REG0                                  0x2140  /* mmSCRATCH_REGn = 0x2140 + n, n = 0..7 */
+#define mmGUI_SCRATCH_REG0                              0x2140  /* second name for the same value as mmSCRATCH_REG0; the same pairing repeats for 1..7 */
+#define mmSCRATCH_REG1                                  0x2141
+#define mmGUI_SCRATCH_REG1                              0x2141
+#define mmSCRATCH_REG2                                  0x2142
+#define mmGUI_SCRATCH_REG2                              0x2142
+#define mmSCRATCH_REG3                                  0x2143
+#define mmGUI_SCRATCH_REG3                              0x2143
+#define mmSCRATCH_REG4                                  0x2144
+#define mmGUI_SCRATCH_REG4                              0x2144
+#define mmSCRATCH_REG5                                  0x2145
+#define mmGUI_SCRATCH_REG5                              0x2145
+#define mmSCRATCH_REG6                                  0x2146
+#define mmGUI_SCRATCH_REG6                              0x2146
+#define mmSCRATCH_REG7                                  0x2147
+#define mmGUI_SCRATCH_REG7                              0x2147
+
+#define mmCP_COHER_CNTL                                 0x217C  /* CP_COHER_{CNTL,SIZE,BASE,STATUS} = 0x217C..0x217F, consecutive */
+#define mmCP_COHER_SIZE                                 0x217D
+#define mmCP_COHER_BASE                                 0x217E
+#define mmCP_COHER_STATUS                               0x217F
+
+#define mmTA_CNTL_AUX                                   0x2542  /* these five values equal the offsets annotated on the same-named R700_CHIP_CONTEXT fields */
+#define mmVC_ENHANCE                                    0x25C5
+#define mmSQ_DYN_GPR_CNTL_PS_FLUSH_REQ                  0x2363  /* listed out of numeric order (lower than the two entries above) */
+#define mmDB_DEBUG                                      0x260C
+#define mmDB_WATERMARKS                                 0x260E
+
+#define mmPA_CL_VPORT_XSCALE                            0xA10F  /* unsuffixed VPORT set: six values at 0xA10F..0xA114 (XSCALE, XOFFSET, YSCALE, YOFFSET, ZSCALE, ZOFFSET) */
+#define mmPA_CL_VPORT_XOFFSET                           0xA110
+#define mmPA_CL_VPORT_YSCALE                            0xA111
+#define mmPA_CL_VPORT_YOFFSET                           0xA112
+#define mmPA_CL_VPORT_ZSCALE                            0xA113
+#define mmPA_CL_VPORT_ZOFFSET                           0xA114
+#define mmPA_CL_VPORT_XSCALE_1                          0xA115  /* suffixed sets n = 1..15: each field = its unsuffixed value + 6*n; entries are grouped by field, not by n */
+#define mmPA_CL_VPORT_XSCALE_2                          0xA11B
+#define mmPA_CL_VPORT_XSCALE_3                          0xA121
+#define mmPA_CL_VPORT_XSCALE_4                          0xA127
+#define mmPA_CL_VPORT_XSCALE_5                          0xA12D
+#define mmPA_CL_VPORT_XSCALE_6                          0xA133
+#define mmPA_CL_VPORT_XSCALE_7                          0xA139
+#define mmPA_CL_VPORT_XSCALE_8                          0xA13F
+#define mmPA_CL_VPORT_XSCALE_9                          0xA145
+#define mmPA_CL_VPORT_XSCALE_10                         0xA14B
+#define mmPA_CL_VPORT_XSCALE_11                         0xA151
+#define mmPA_CL_VPORT_XSCALE_12                         0xA157
+#define mmPA_CL_VPORT_XSCALE_13                         0xA15D
+#define mmPA_CL_VPORT_XSCALE_14                         0xA163
+#define mmPA_CL_VPORT_XSCALE_15                         0xA169
+#define mmPA_CL_VPORT_XOFFSET_1                         0xA116
+#define mmPA_CL_VPORT_XOFFSET_2                         0xA11C
+#define mmPA_CL_VPORT_XOFFSET_3                         0xA122
+#define mmPA_CL_VPORT_XOFFSET_4                         0xA128
+#define mmPA_CL_VPORT_XOFFSET_5                         0xA12E
+#define mmPA_CL_VPORT_XOFFSET_6                         0xA134
+#define mmPA_CL_VPORT_XOFFSET_7                         0xA13A
+#define mmPA_CL_VPORT_XOFFSET_8                         0xA140
+#define mmPA_CL_VPORT_XOFFSET_9                         0xA146
+#define mmPA_CL_VPORT_XOFFSET_10                        0xA14C
+#define mmPA_CL_VPORT_XOFFSET_11                        0xA152
+#define mmPA_CL_VPORT_XOFFSET_12                        0xA158
+#define mmPA_CL_VPORT_XOFFSET_13                        0xA15E
+#define mmPA_CL_VPORT_XOFFSET_14                        0xA164
+#define mmPA_CL_VPORT_XOFFSET_15                        0xA16A
+#define mmPA_CL_VPORT_YSCALE_1                          0xA117
+#define mmPA_CL_VPORT_YSCALE_2                          0xA11D
+#define mmPA_CL_VPORT_YSCALE_3                          0xA123
+#define mmPA_CL_VPORT_YSCALE_4                          0xA129
+#define mmPA_CL_VPORT_YSCALE_5                          0xA12F
+#define mmPA_CL_VPORT_YSCALE_6                          0xA135
+#define mmPA_CL_VPORT_YSCALE_7                          0xA13B
+#define mmPA_CL_VPORT_YSCALE_8                          0xA141
+#define mmPA_CL_VPORT_YSCALE_9                          0xA147
+#define mmPA_CL_VPORT_YSCALE_10                         0xA14D
+#define mmPA_CL_VPORT_YSCALE_11                         0xA153
+#define mmPA_CL_VPORT_YSCALE_12                         0xA159
+#define mmPA_CL_VPORT_YSCALE_13                         0xA15F
+#define mmPA_CL_VPORT_YSCALE_14                         0xA165
+#define mmPA_CL_VPORT_YSCALE_15                         0xA16B
+#define mmPA_CL_VPORT_YOFFSET_1                         0xA118
+#define mmPA_CL_VPORT_YOFFSET_2                         0xA11E
+#define mmPA_CL_VPORT_YOFFSET_3                         0xA124
+#define mmPA_CL_VPORT_YOFFSET_4                         0xA12A
+#define mmPA_CL_VPORT_YOFFSET_5                         0xA130
+#define mmPA_CL_VPORT_YOFFSET_6                         0xA136
+#define mmPA_CL_VPORT_YOFFSET_7                         0xA13C
+#define mmPA_CL_VPORT_YOFFSET_8                         0xA142
+#define mmPA_CL_VPORT_YOFFSET_9                         0xA148
+#define mmPA_CL_VPORT_YOFFSET_10                        0xA14E
+#define mmPA_CL_VPORT_YOFFSET_11                        0xA154
+#define mmPA_CL_VPORT_YOFFSET_12                        0xA15A
+#define mmPA_CL_VPORT_YOFFSET_13                        0xA160
+#define mmPA_CL_VPORT_YOFFSET_14                        0xA166
+#define mmPA_CL_VPORT_YOFFSET_15                        0xA16C
+#define mmPA_CL_VPORT_ZSCALE_1                          0xA119
+#define mmPA_CL_VPORT_ZSCALE_2                          0xA11F
+#define mmPA_CL_VPORT_ZSCALE_3                          0xA125
+#define mmPA_CL_VPORT_ZSCALE_4                          0xA12B
+#define mmPA_CL_VPORT_ZSCALE_5                          0xA131
+#define mmPA_CL_VPORT_ZSCALE_6                          0xA137
+#define mmPA_CL_VPORT_ZSCALE_7                          0xA13D
+#define mmPA_CL_VPORT_ZSCALE_8                          0xA143
+#define mmPA_CL_VPORT_ZSCALE_9                          0xA149
+#define mmPA_CL_VPORT_ZSCALE_10                         0xA14F
+#define mmPA_CL_VPORT_ZSCALE_11                         0xA155
+#define mmPA_CL_VPORT_ZSCALE_12                         0xA15B
+#define mmPA_CL_VPORT_ZSCALE_13                         0xA161
+#define mmPA_CL_VPORT_ZSCALE_14                         0xA167
+#define mmPA_CL_VPORT_ZSCALE_15                         0xA16D
+#define mmPA_CL_VPORT_ZOFFSET_1                         0xA11A
+#define mmPA_CL_VPORT_ZOFFSET_2                         0xA120
+#define mmPA_CL_VPORT_ZOFFSET_3                         0xA126
+#define mmPA_CL_VPORT_ZOFFSET_4                         0xA12C
+#define mmPA_CL_VPORT_ZOFFSET_5                         0xA132
+#define mmPA_CL_VPORT_ZOFFSET_6                         0xA138
+#define mmPA_CL_VPORT_ZOFFSET_7                         0xA13E
+#define mmPA_CL_VPORT_ZOFFSET_8                         0xA144
+#define mmPA_CL_VPORT_ZOFFSET_9                         0xA14A
+#define mmPA_CL_VPORT_ZOFFSET_10                        0xA150
+#define mmPA_CL_VPORT_ZOFFSET_11                        0xA156
+#define mmPA_CL_VPORT_ZOFFSET_12                        0xA15C
+#define mmPA_CL_VPORT_ZOFFSET_13                        0xA162
+#define mmPA_CL_VPORT_ZOFFSET_14                        0xA168
+#define mmPA_CL_VPORT_ZOFFSET_15                        0xA16E
+#define mmPA_CL_VTE_CNTL                                0xA206
+#define mmPA_CL_VS_OUT_CNTL                             0xA207
+#define mmPA_CL_NANINF_CNTL                             0xA208
+#define mmPA_CL_CLIP_CNTL                               0xA204  /* listed out of numeric order: lower than the three entries above */
+#define mmPA_CL_GB_VERT_CLIP_ADJ                        0xA303
+#define mmPA_CL_GB_VERT_DISC_ADJ                        0xA304
+#define mmPA_CL_GB_HORZ_CLIP_ADJ                        0xA305
+#define mmPA_CL_GB_HORZ_DISC_ADJ                        0xA306
+#define mmPA_CL_UCP_0_X                                 0xA388  /* mmPA_CL_UCP_n_{X,Y,Z,W} = 0xA388 + 4*n + {0,1,2,3}, n = 0..5 */
+#define mmPA_CL_UCP_0_Y                                 0xA389
+#define mmPA_CL_UCP_0_Z                                 0xA38A
+#define mmPA_CL_UCP_0_W                                 0xA38B
+#define mmPA_CL_UCP_1_X                                 0xA38C
+#define mmPA_CL_UCP_1_Y                                 0xA38D
+#define mmPA_CL_UCP_1_Z                                 0xA38E
+#define mmPA_CL_UCP_1_W                                 0xA38F
+#define mmPA_CL_UCP_2_X                                 0xA390
+#define mmPA_CL_UCP_2_Y                                 0xA391
+#define mmPA_CL_UCP_2_Z                                 0xA392
+#define mmPA_CL_UCP_2_W                                 0xA393
+#define mmPA_CL_UCP_3_X                                 0xA394
+#define mmPA_CL_UCP_3_Y                                 0xA395
+#define mmPA_CL_UCP_3_Z                                 0xA396
+#define mmPA_CL_UCP_3_W                                 0xA397
+#define mmPA_CL_UCP_4_X                                 0xA398
+#define mmPA_CL_UCP_4_Y                                 0xA399
+#define mmPA_CL_UCP_4_Z                                 0xA39A
+#define mmPA_CL_UCP_4_W                                 0xA39B
+#define mmPA_CL_UCP_5_X                                 0xA39C
+#define mmPA_CL_UCP_5_Y                                 0xA39D
+#define mmPA_CL_UCP_5_Z                                 0xA39E
+#define mmPA_CL_UCP_5_W                                 0xA39F
+#define mmPA_CL_POINT_X_RAD                             0xA384  /* 0xA384..0xA387: the four values immediately below mmPA_CL_UCP_0_X */
+#define mmPA_CL_POINT_Y_RAD                             0xA385
+#define mmPA_CL_POINT_SIZE                              0xA386
+#define mmPA_CL_POINT_CULL_RAD                          0xA387
+
+#define mmPA_SU_VTX_CNTL                                0xA302
+#define mmPA_SU_POINT_SIZE                              0xA280  /* POINT_SIZE, POINT_MINMAX, LINE_CNTL = 0xA280..0xA282 */
+#define mmPA_SU_POINT_MINMAX                            0xA281
+#define mmPA_SU_LINE_CNTL                               0xA282
+#define mmPA_SU_SC_MODE_CNTL                            0xA205
+#define mmPA_SU_POLY_OFFSET_DB_FMT_CNTL                 0xA37E  /* the six POLY_OFFSET_* values occupy 0xA37E..0xA383 consecutively */
+#define mmPA_SU_POLY_OFFSET_CLAMP                       0xA37F
+#define mmPA_SU_POLY_OFFSET_FRONT_SCALE                 0xA380
+#define mmPA_SU_POLY_OFFSET_FRONT_OFFSET                0xA381
+#define mmPA_SU_POLY_OFFSET_BACK_SCALE                  0xA382
+#define mmPA_SU_POLY_OFFSET_BACK_OFFSET                 0xA383
+
+#define mmPA_SC_WINDOW_OFFSET                           0xA080
+#define mmPA_SC_AA_CONFIG                               0xA301
+#define mmPA_SC_AA_MASK                                 0xA312
+#define mmPA_SC_AA_SAMPLE_LOCS_MCTX                     0xA307
+#define mmPA_SC_AA_SAMPLE_LOCS_8S_WD1_MCTX              0xA308
+#define mmPA_SC_LINE_STIPPLE                            0xA283
+#define mmPA_SC_LINE_CNTL                               0xA300
+#define mmPA_SC_SCREEN_SCISSOR_TL                       0xA00C
+#define mmPA_SC_SCREEN_SCISSOR_BR                       0xA00D
+#define mmPA_SC_WINDOW_SCISSOR_TL                       0xA081
+#define mmPA_SC_WINDOW_SCISSOR_BR                       0xA082
+#define mmPA_SC_CLIPRECT_RULE                           0xA083
+#define mmPA_SC_CLIPRECT_0_TL                           0xA084  /* CLIPRECT_n_TL = 0xA084 + 2*n, CLIPRECT_n_BR = 0xA085 + 2*n, n = 0..3 */
+#define mmPA_SC_CLIPRECT_0_BR                           0xA085
+#define mmPA_SC_CLIPRECT_1_TL                           0xA086
+#define mmPA_SC_CLIPRECT_1_BR                           0xA087
+#define mmPA_SC_CLIPRECT_2_TL                           0xA088
+#define mmPA_SC_CLIPRECT_2_BR                           0xA089
+#define mmPA_SC_CLIPRECT_3_TL                           0xA08A
+#define mmPA_SC_CLIPRECT_3_BR                           0xA08B
+#define mmPA_SC_EDGERULE                                0xA08C
+#define mmPA_SC_GENERIC_SCISSOR_TL                      0xA090
+#define mmPA_SC_GENERIC_SCISSOR_BR                      0xA091
+#define mmPA_SC_VPORT_SCISSOR_0_TL                      0xA094  /* VPORT_SCISSOR_n_TL = 0xA094 + 2*n, n = 0..15; the _BR values are listed separately after _15_TL */
+#define mmPA_SC_VPORT_SCISSOR_1_TL                      0xA096
+#define mmPA_SC_VPORT_SCISSOR_2_TL                      0xA098
+#define mmPA_SC_VPORT_SCISSOR_3_TL                      0xA09A
+#define mmPA_SC_VPORT_SCISSOR_4_TL                      0xA09C
+#define mmPA_SC_VPORT_SCISSOR_5_TL                      0xA09E
+#define mmPA_SC_VPORT_SCISSOR_6_TL                      0xA0A0
+#define mmPA_SC_VPORT_SCISSOR_7_TL                      0xA0A2
+#define mmPA_SC_VPORT_SCISSOR_8_TL                      0xA0A4
+#define mmPA_SC_VPORT_SCISSOR_9_TL                      0xA0A6
+#define mmPA_SC_VPORT_SCISSOR_10_TL                     0xA0A8
+#define mmPA_SC_VPORT_SCISSOR_11_TL                     0xA0AA
+#define mmPA_SC_VPORT_SCISSOR_12_TL                     0xA0AC
+#define mmPA_SC_VPORT_SCISSOR_13_TL                     0xA0AE
+#define mmPA_SC_VPORT_SCISSOR_14_TL                     0xA0B0
+#define mmPA_SC_VPORT_SCISSOR_15_TL                     0xA0B2
+#define mmPA_SC_VPORT_SCISSOR_0_BR                      0xA095  /* VPORT_SCISSOR_n_BR = 0xA095 + 2*n, i.e. the value right after _n_TL */
+#define mmPA_SC_VPORT_SCISSOR_1_BR                      0xA097
+#define mmPA_SC_VPORT_SCISSOR_2_BR                      0xA099
+#define mmPA_SC_VPORT_SCISSOR_3_BR                      0xA09B
+#define mmPA_SC_VPORT_SCISSOR_4_BR                      0xA09D
+#define mmPA_SC_VPORT_SCISSOR_5_BR                      0xA09F
+#define mmPA_SC_VPORT_SCISSOR_6_BR                      0xA0A1
+#define mmPA_SC_VPORT_SCISSOR_7_BR                      0xA0A3
+#define mmPA_SC_VPORT_SCISSOR_8_BR                      0xA0A5
+#define mmPA_SC_VPORT_SCISSOR_9_BR                      0xA0A7
+#define mmPA_SC_VPORT_SCISSOR_10_BR                     0xA0A9
+#define mmPA_SC_VPORT_SCISSOR_11_BR                     0xA0AB
+#define mmPA_SC_VPORT_SCISSOR_12_BR                     0xA0AD
+#define mmPA_SC_VPORT_SCISSOR_13_BR                     0xA0AF
+#define mmPA_SC_VPORT_SCISSOR_14_BR                     0xA0B1
+#define mmPA_SC_VPORT_SCISSOR_15_BR                     0xA0B3
+#define mmPA_SC_VPORT_ZMIN_0                            0xA0B4  /* VPORT_ZMIN_n = 0xA0B4 + 2*n, n = 0..15 */
+#define mmPA_SC_VPORT_ZMIN_1                            0xA0B6
+#define mmPA_SC_VPORT_ZMIN_2                            0xA0B8
+#define mmPA_SC_VPORT_ZMIN_3                            0xA0BA
+#define mmPA_SC_VPORT_ZMIN_4                            0xA0BC
+#define mmPA_SC_VPORT_ZMIN_5                            0xA0BE
+#define mmPA_SC_VPORT_ZMIN_6                            0xA0C0
+#define mmPA_SC_VPORT_ZMIN_7                            0xA0C2
+#define mmPA_SC_VPORT_ZMIN_8                            0xA0C4
+#define mmPA_SC_VPORT_ZMIN_9                            0xA0C6
+#define mmPA_SC_VPORT_ZMIN_10                           0xA0C8
+#define mmPA_SC_VPORT_ZMIN_11                           0xA0CA
+#define mmPA_SC_VPORT_ZMIN_12                           0xA0CC
+#define mmPA_SC_VPORT_ZMIN_13                           0xA0CE
+#define mmPA_SC_VPORT_ZMIN_14                           0xA0D0
+#define mmPA_SC_VPORT_ZMIN_15                           0xA0D2
+#define mmPA_SC_VPORT_ZMAX_0                            0xA0B5  /* VPORT_ZMAX_n = 0xA0B5 + 2*n, interleaved with VPORT_ZMIN_n */
+#define mmPA_SC_VPORT_ZMAX_1                            0xA0B7
+#define mmPA_SC_VPORT_ZMAX_2                            0xA0B9
+#define mmPA_SC_VPORT_ZMAX_3                            0xA0BB
+#define mmPA_SC_VPORT_ZMAX_4                            0xA0BD
+#define mmPA_SC_VPORT_ZMAX_5                            0xA0BF
+#define mmPA_SC_VPORT_ZMAX_6                            0xA0C1
+#define mmPA_SC_VPORT_ZMAX_7                            0xA0C3
+#define mmPA_SC_VPORT_ZMAX_8                            0xA0C5
+#define mmPA_SC_VPORT_ZMAX_9                            0xA0C7
+#define mmPA_SC_VPORT_ZMAX_10                           0xA0C9
+#define mmPA_SC_VPORT_ZMAX_11                           0xA0CB
+#define mmPA_SC_VPORT_ZMAX_12                           0xA0CD
+#define mmPA_SC_VPORT_ZMAX_13                           0xA0CF
+#define mmPA_SC_VPORT_ZMAX_14                           0xA0D1
+#define mmPA_SC_VPORT_ZMAX_15                           0xA0D3
+#define mmPA_SC_MODE_CNTL                               0xA293
+#define mmPA_SC_MPASS_PS_CNTL                           0xA292
+
+#define mmVGT_DRAW_INITIATOR                            0xA1FC
+#define mmVGT_EVENT_INITIATOR                           0xA2A4
+#define mmVGT_EVENT_ADDRESS_REG                         0xA1FE
+#define mmVGT_DMA_BASE_HI                               0xA1F9  /* DMA_BASE_HI (0xA1F9) sits immediately before DMA_BASE (0xA1FA) */
+#define mmVGT_DMA_BASE                                  0xA1FA
+#define mmVGT_DMA_INDEX_TYPE                            0xA29F
+#define mmVGT_DMA_NUM_INSTANCES                         0xA2A2
+#define mmVGT_DMA_SIZE                                  0xA29D
+
+#define mmVGT_IMMED_DATA                                0xA1FD
+#define mmVGT_INDEX_TYPE                                0x2257  /* 0x2256, 0x2257, 0x225C, 0x225D are the only four values in this group below 0xA000 */
+#define mmVGT_NUM_INDICES                               0x225C
+#define mmVGT_NUM_INSTANCES                             0x225D
+#define mmVGT_PRIMITIVE_TYPE                            0x2256
+#define mmVGT_PRIMITIVEID_EN                            0xA2A1
+#define mmVGT_VTX_CNT_EN                                0xA2AE
+#define mmVGT_REUSE_OFF                                 0xA2AD
+#define mmVGT_INSTANCE_STEP_RATE_0                      0xA2A8
+#define mmVGT_INSTANCE_STEP_RATE_1                      0xA2A9
+#define mmVGT_MAX_VTX_INDX                              0xA100
+#define mmVGT_MIN_VTX_INDX                              0xA101
+#define mmVGT_INDX_OFFSET                               0xA102
+#define mmVGT_VERTEX_REUSE_BLOCK_CNTL                   0xA316
+#define mmVGT_OUT_DEALLOC_CNTL                          0xA317
+#define mmVGT_MULTI_PRIM_IB_RESET_INDX                  0xA103
+#define mmVGT_MULTI_PRIM_IB_RESET_EN                    0xA2A5
+#define mmVGT_ENHANCE                                   0xA294
+#define mmVGT_OUTPUT_PATH_CNTL                          0xA284  /* 0xA284..0xA290 are assigned consecutively from here through mmVGT_GS_MODE */
+#define mmVGT_HOS_CNTL                                  0xA285
+#define mmVGT_HOS_MAX_TESS_LEVEL                        0xA286
+#define mmVGT_HOS_MIN_TESS_LEVEL                        0xA287
+#define mmVGT_HOS_REUSE_DEPTH                           0xA288
+#define mmVGT_GROUP_PRIM_TYPE                           0xA289
+#define mmVGT_GROUP_FIRST_DECR                          0xA28A
+#define mmVGT_GROUP_DECR                                0xA28B
+#define mmVGT_GROUP_VECT_0_CNTL                         0xA28C
+#define mmVGT_GROUP_VECT_1_CNTL                         0xA28D
+#define mmVGT_GROUP_VECT_0_FMT_CNTL                     0xA28E
+#define mmVGT_GROUP_VECT_1_FMT_CNTL                     0xA28F
+#define mmVGT_GS_MODE                                   0xA290
+#define mmVGT_GS_OUT_PRIM_TYPE                          0xA29B
+
+#define mmVGT_STRMOUT_EN                                0xA2AC
+#define mmVGT_STRMOUT_BUFFER_SIZE_0                     0xA2B4  /* per buffer n = 0..3: BUFFER_SIZE, VTX_STRIDE, BUFFER_BASE, BUFFER_OFFSET = 0xA2B4 + 4*n + {0,1,2,3} */
+#define mmVGT_STRMOUT_BUFFER_SIZE_1                     0xA2B8
+#define mmVGT_STRMOUT_BUFFER_SIZE_2                     0xA2BC
+#define mmVGT_STRMOUT_BUFFER_SIZE_3                     0xA2C0
+#define mmVGT_STRMOUT_BUFFER_OFFSET_0                   0xA2B7
+#define mmVGT_STRMOUT_BUFFER_OFFSET_1                   0xA2BB
+#define mmVGT_STRMOUT_BUFFER_OFFSET_2                   0xA2BF
+#define mmVGT_STRMOUT_BUFFER_OFFSET_3                   0xA2C3
+#define mmVGT_STRMOUT_VTX_STRIDE_0                      0xA2B5
+#define mmVGT_STRMOUT_VTX_STRIDE_1                      0xA2B9
+#define mmVGT_STRMOUT_VTX_STRIDE_2                      0xA2BD
+#define mmVGT_STRMOUT_VTX_STRIDE_3                      0xA2C1
+#define mmVGT_STRMOUT_BUFFER_BASE_0                     0xA2B6
+#define mmVGT_STRMOUT_BUFFER_BASE_1                     0xA2BA
+#define mmVGT_STRMOUT_BUFFER_BASE_2                     0xA2BE
+#define mmVGT_STRMOUT_BUFFER_BASE_3                     0xA2C2
+#define mmVGT_STRMOUT_BUFFER_EN                         0xA2C8
+#define mmVGT_STRMOUT_BASE_OFFSET_0                     0xA2C4  /* BASE_OFFSET_n = 0xA2C4 + n, n = 0..3 */
+#define mmVGT_STRMOUT_BASE_OFFSET_1                     0xA2C5
+#define mmVGT_STRMOUT_BASE_OFFSET_2                     0xA2C6
+#define mmVGT_STRMOUT_BASE_OFFSET_3                     0xA2C7
+#define mmVGT_STRMOUT_BASE_OFFSET_HI_0                  0xA2D1  /* BASE_OFFSET_HI_n = 0xA2D1 + n, n = 0..3 */
+#define mmVGT_STRMOUT_BASE_OFFSET_HI_1                  0xA2D2
+#define mmVGT_STRMOUT_BASE_OFFSET_HI_2                  0xA2D3
+#define mmVGT_STRMOUT_BASE_OFFSET_HI_3                  0xA2D4
+#define mmVGT_STRMOUT_DRAW_OPAQUE_OFFSET                0xA2CA
+#define mmVGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE    0xA2CB
+#define mmVGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE         0xA2CC
+
+#define mmSQ_PGM_START_PS                               0xA210
+#define mmSQ_PGM_CF_OFFSET_PS                           0xA233  /* CF_OFFSET_{PS,VS,GS,ES,FS} = 0xA233..0xA237, one per suffix, listed beside the matching START/RESOURCES entries */
+#define mmSQ_PGM_RESOURCES_PS                           0xA214
+#define mmSQ_PGM_EXPORTS_PS                             0xA215
+#define mmSQ_PGM_START_VS                               0xA216
+#define mmSQ_PGM_CF_OFFSET_VS                           0xA234
+#define mmSQ_PGM_RESOURCES_VS                           0xA21A
+#define mmSQ_PGM_START_GS                               0xA21B
+#define mmSQ_PGM_CF_OFFSET_GS                           0xA235
+#define mmSQ_PGM_RESOURCES_GS                           0xA21F
+#define mmSQ_PGM_START_ES                               0xA220
+#define mmSQ_PGM_CF_OFFSET_ES                           0xA236
+#define mmSQ_PGM_RESOURCES_ES                           0xA224
+#define mmSQ_PGM_START_FS                               0xA225
+#define mmSQ_PGM_CF_OFFSET_FS                           0xA237
+#define mmSQ_PGM_RESOURCES_FS                           0xA229
+#define mmSQ_ESGS_RING_ITEMSIZE                         0xA22A  /* 0xA22A..0xA232: same values as the offsets annotated on the matching R700_CHIP_CONTEXT fields */
+#define mmSQ_GSVS_RING_ITEMSIZE                         0xA22B
+#define mmSQ_ESTMP_RING_ITEMSIZE                        0xA22C
+#define mmSQ_GSTMP_RING_ITEMSIZE                        0xA22D
+#define mmSQ_VSTMP_RING_ITEMSIZE                        0xA22E
+#define mmSQ_PSTMP_RING_ITEMSIZE                        0xA22F
+#define mmSQ_FBUF_RING_ITEMSIZE                         0xA230
+#define mmSQ_REDUC_RING_ITEMSIZE                        0xA231
+#define mmSQ_GS_VERT_ITEMSIZE                           0xA232
+#define mmSQ_VTX_SEMANTIC_CLEAR                         0xA238
+
+#define mmSQ_VTX_SEMANTIC_0                             0xA0E0  /* mmSQ_VTX_SEMANTIC_n = 0xA0E0 + n, n = 0..31 */
+#define mmSQ_VTX_SEMANTIC_1                             0xA0E1
+#define mmSQ_VTX_SEMANTIC_2                             0xA0E2
+#define mmSQ_VTX_SEMANTIC_3                             0xA0E3
+#define mmSQ_VTX_SEMANTIC_4                             0xA0E4
+#define mmSQ_VTX_SEMANTIC_5                             0xA0E5
+#define mmSQ_VTX_SEMANTIC_6                             0xA0E6
+#define mmSQ_VTX_SEMANTIC_7                             0xA0E7
+#define mmSQ_VTX_SEMANTIC_8                             0xA0E8
+#define mmSQ_VTX_SEMANTIC_9                             0xA0E9
+#define mmSQ_VTX_SEMANTIC_10                            0xA0EA
+#define mmSQ_VTX_SEMANTIC_11                            0xA0EB
+#define mmSQ_VTX_SEMANTIC_12                            0xA0EC
+#define mmSQ_VTX_SEMANTIC_13                            0xA0ED
+#define mmSQ_VTX_SEMANTIC_14                            0xA0EE
+#define mmSQ_VTX_SEMANTIC_15                            0xA0EF
+#define mmSQ_VTX_SEMANTIC_16                            0xA0F0
+#define mmSQ_VTX_SEMANTIC_17                            0xA0F1
+#define mmSQ_VTX_SEMANTIC_18                            0xA0F2
+#define mmSQ_VTX_SEMANTIC_19                            0xA0F3
+#define mmSQ_VTX_SEMANTIC_20                            0xA0F4
+#define mmSQ_VTX_SEMANTIC_21                            0xA0F5
+#define mmSQ_VTX_SEMANTIC_22                            0xA0F6
+#define mmSQ_VTX_SEMANTIC_23                            0xA0F7
+#define mmSQ_VTX_SEMANTIC_24                            0xA0F8
+#define mmSQ_VTX_SEMANTIC_25                            0xA0F9
+#define mmSQ_VTX_SEMANTIC_26                            0xA0FA
+#define mmSQ_VTX_SEMANTIC_27                            0xA0FB
+#define mmSQ_VTX_SEMANTIC_28                            0xA0FC
+#define mmSQ_VTX_SEMANTIC_29                            0xA0FD
+#define mmSQ_VTX_SEMANTIC_30                            0xA0FE
+#define mmSQ_VTX_SEMANTIC_31                            0xA0FF
+
+#define mmSQ_ALU_CONST_CACHE_PS_0                       0xA250  /* ALU_CONST_CACHE_{PS,VS,GS}_n = {0xA250, 0xA260, 0xA270} + n, n = 0..15 */
+#define mmSQ_ALU_CONST_CACHE_PS_1                       0xA251
+#define mmSQ_ALU_CONST_CACHE_PS_2                       0xA252
+#define mmSQ_ALU_CONST_CACHE_PS_3                       0xA253
+#define mmSQ_ALU_CONST_CACHE_PS_4                       0xA254
+#define mmSQ_ALU_CONST_CACHE_PS_5                       0xA255
+#define mmSQ_ALU_CONST_CACHE_PS_6                       0xA256
+#define mmSQ_ALU_CONST_CACHE_PS_7                       0xA257
+#define mmSQ_ALU_CONST_CACHE_PS_8                       0xA258
+#define mmSQ_ALU_CONST_CACHE_PS_9                       0xA259
+#define mmSQ_ALU_CONST_CACHE_PS_10                      0xA25A
+#define mmSQ_ALU_CONST_CACHE_PS_11                      0xA25B
+#define mmSQ_ALU_CONST_CACHE_PS_12                      0xA25C
+#define mmSQ_ALU_CONST_CACHE_PS_13                      0xA25D
+#define mmSQ_ALU_CONST_CACHE_PS_14                      0xA25E
+#define mmSQ_ALU_CONST_CACHE_PS_15                      0xA25F
+#define mmSQ_ALU_CONST_CACHE_VS_0                       0xA260
+#define mmSQ_ALU_CONST_CACHE_VS_1                       0xA261
+#define mmSQ_ALU_CONST_CACHE_VS_2                       0xA262
+#define mmSQ_ALU_CONST_CACHE_VS_3                       0xA263
+#define mmSQ_ALU_CONST_CACHE_VS_4                       0xA264
+#define mmSQ_ALU_CONST_CACHE_VS_5                       0xA265
+#define mmSQ_ALU_CONST_CACHE_VS_6                       0xA266
+#define mmSQ_ALU_CONST_CACHE_VS_7                       0xA267
+#define mmSQ_ALU_CONST_CACHE_VS_8                       0xA268
+#define mmSQ_ALU_CONST_CACHE_VS_9                       0xA269
+#define mmSQ_ALU_CONST_CACHE_VS_10                      0xA26A
+#define mmSQ_ALU_CONST_CACHE_VS_11                      0xA26B
+#define mmSQ_ALU_CONST_CACHE_VS_12                      0xA26C
+#define mmSQ_ALU_CONST_CACHE_VS_13                      0xA26D
+#define mmSQ_ALU_CONST_CACHE_VS_14                      0xA26E
+#define mmSQ_ALU_CONST_CACHE_VS_15                      0xA26F
+#define mmSQ_ALU_CONST_CACHE_GS_0                       0xA270
+#define mmSQ_ALU_CONST_CACHE_GS_1                       0xA271
+#define mmSQ_ALU_CONST_CACHE_GS_2                       0xA272
+#define mmSQ_ALU_CONST_CACHE_GS_3                       0xA273
+#define mmSQ_ALU_CONST_CACHE_GS_4                       0xA274
+#define mmSQ_ALU_CONST_CACHE_GS_5                       0xA275
+#define mmSQ_ALU_CONST_CACHE_GS_6                       0xA276
+#define mmSQ_ALU_CONST_CACHE_GS_7                       0xA277
+#define mmSQ_ALU_CONST_CACHE_GS_8                       0xA278
+#define mmSQ_ALU_CONST_CACHE_GS_9                       0xA279
+#define mmSQ_ALU_CONST_CACHE_GS_10                      0xA27A
+#define mmSQ_ALU_CONST_CACHE_GS_11                      0xA27B
+#define mmSQ_ALU_CONST_CACHE_GS_12                      0xA27C
+#define mmSQ_ALU_CONST_CACHE_GS_13                      0xA27D
+#define mmSQ_ALU_CONST_CACHE_GS_14                      0xA27E
+#define mmSQ_ALU_CONST_CACHE_GS_15                      0xA27F
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_0                 0xA050  /* ALU_CONST_BUFFER_SIZE_{PS,VS,GS}_n = {0xA050, 0xA060, 0xA070} + n, n = 0..15 */
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_1                 0xA051
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_2                 0xA052
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_3                 0xA053
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_4                 0xA054
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_5                 0xA055
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_6                 0xA056
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_7                 0xA057
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_8                 0xA058
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_9                 0xA059
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_10                0xA05A
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_11                0xA05B
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_12                0xA05C
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_13                0xA05D
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_14                0xA05E
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_15                0xA05F
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_0                 0xA060
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_1                 0xA061
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_2                 0xA062
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_3                 0xA063
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_4                 0xA064
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_5                 0xA065
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_6                 0xA066
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_7                 0xA067
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_8                 0xA068
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_9                 0xA069
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_10                0xA06A
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_11                0xA06B
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_12                0xA06C
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_13                0xA06D
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_14                0xA06E
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_15                0xA06F
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_0                 0xA070
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_1                 0xA071
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_2                 0xA072
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_3                 0xA073
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_4                 0xA074
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_5                 0xA075
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_6                 0xA076
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_7                 0xA077
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_8                 0xA078
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_9                 0xA079
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_10                0xA07A
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_11                0xA07B
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_12                0xA07C
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_13                0xA07D
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_14                0xA07E
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_15                0xA07F
+
+#define mmSPI_VS_OUT_ID_0                               0xA185
+#define mmSPI_VS_OUT_ID_1                               0xA186
+#define mmSPI_VS_OUT_ID_2                               0xA187
+#define mmSPI_VS_OUT_ID_3                               0xA188
+#define mmSPI_VS_OUT_ID_4                               0xA189
+#define mmSPI_VS_OUT_ID_5                               0xA18A
+#define mmSPI_VS_OUT_ID_6                               0xA18B
+#define mmSPI_VS_OUT_ID_7                               0xA18C
+#define mmSPI_VS_OUT_ID_8                               0xA18D
+#define mmSPI_VS_OUT_ID_9                               0xA18E
+#define mmSPI_PS_INPUT_CNTL_0                           0xA191
+#define mmSPI_PS_INPUT_CNTL_1                           0xA192
+#define mmSPI_PS_INPUT_CNTL_2                           0xA193
+#define mmSPI_PS_INPUT_CNTL_3                           0xA194
+#define mmSPI_PS_INPUT_CNTL_4                           0xA195
+#define mmSPI_PS_INPUT_CNTL_5                           0xA196
+#define mmSPI_PS_INPUT_CNTL_6                           0xA197
+#define mmSPI_PS_INPUT_CNTL_7                           0xA198
+#define mmSPI_PS_INPUT_CNTL_8                           0xA199
+#define mmSPI_PS_INPUT_CNTL_9                           0xA19A
+#define mmSPI_PS_INPUT_CNTL_10                          0xA19B
+#define mmSPI_PS_INPUT_CNTL_11                          0xA19C
+#define mmSPI_PS_INPUT_CNTL_12                          0xA19D
+#define mmSPI_PS_INPUT_CNTL_13                          0xA19E
+#define mmSPI_PS_INPUT_CNTL_14                          0xA19F
+#define mmSPI_PS_INPUT_CNTL_15                          0xA1A0
+#define mmSPI_PS_INPUT_CNTL_16                          0xA1A1
+#define mmSPI_PS_INPUT_CNTL_17                          0xA1A2
+#define mmSPI_PS_INPUT_CNTL_18                          0xA1A3
+#define mmSPI_PS_INPUT_CNTL_19                          0xA1A4
+#define mmSPI_PS_INPUT_CNTL_20                          0xA1A5
+#define mmSPI_PS_INPUT_CNTL_21                          0xA1A6
+#define mmSPI_PS_INPUT_CNTL_22                          0xA1A7
+#define mmSPI_PS_INPUT_CNTL_23                          0xA1A8
+#define mmSPI_PS_INPUT_CNTL_24                          0xA1A9
+#define mmSPI_PS_INPUT_CNTL_25                          0xA1AA
+#define mmSPI_PS_INPUT_CNTL_26                          0xA1AB
+#define mmSPI_PS_INPUT_CNTL_27                          0xA1AC
+#define mmSPI_PS_INPUT_CNTL_28                          0xA1AD
+#define mmSPI_PS_INPUT_CNTL_29                          0xA1AE
+#define mmSPI_PS_INPUT_CNTL_30                          0xA1AF
+#define mmSPI_PS_INPUT_CNTL_31                          0xA1B0
+#define mmSPI_VS_OUT_CONFIG                             0xA1B1
+#define mmSPI_THREAD_GROUPING                           0xA1B2
+#define mmSPI_PS_IN_CONTROL_0                           0xA1B3
+#define mmSPI_PS_IN_CONTROL_1                           0xA1B4
+#define mmSPI_INTERP_CONTROL_0                          0xA1B5
+#define mmSPI_INPUT_Z                                   0xA1B6
+#define mmSPI_FOG_CNTL                                  0xA1B7
+#define mmSPI_FOG_FUNC_SCALE                            0xA1B8
+#define mmSPI_FOG_FUNC_BIAS                             0xA1B9
+
+#define mmSX_MISC                                       0xA0D4
+#define mmSX_ALPHA_TEST_CONTROL                         0xA104
+#define mmSX_ALPHA_REF                                  0xA10E
+
+#define mmDB_DEPTH_BASE                                 0xA003
+#define mmDB_DEPTH_INFO                                 0xA004
+#define mmDB_HTILE_DATA_BASE                            0xA005
+#define mmDB_DEPTH_SIZE                                 0xA000
+#define mmDB_DEPTH_VIEW                                 0xA001
+#define mmDB_RENDER_CONTROL                             0xA343
+#define mmDB_RENDER_OVERRIDE                            0xA344
+#define mmDB_SHADER_CONTROL                             0xA203
+#define mmDB_STENCIL_CLEAR                              0xA00A
+#define mmDB_DEPTH_CLEAR                                0xA00B
+#define mmDB_HTILE_SURFACE                              0xA349
+#define mmDB_PRELOAD_CONTROL                            0xA34C
+#define mmDB_PREFETCH_LIMIT                             0xA34D
+#define mmDB_STENCILREFMASK                             0xA10C
+#define mmDB_STENCILREFMASK_BF                          0xA10D
+#define mmDB_SRESULTS_COMPARE_STATE0                    0xA34A
+#define mmDB_SRESULTS_COMPARE_STATE1                    0xA34B
+#define mmDB_DEPTH_CONTROL                              0xA200
+#define mmDB_ALPHA_TO_MASK                              0xA351
+
+#define mmCB_CLEAR_RED_R6XX                             0xA048
+#define mmCB_CLEAR_GREEN_R6XX                           0xA049
+#define mmCB_CLEAR_BLUE_R6XX                            0xA04A
+#define mmCB_CLEAR_ALPHA_R6XX                           0xA04B
+#define mmCB_BLEND_RED                                  0xA105
+#define mmCB_BLEND_GREEN                                0xA106
+#define mmCB_BLEND_BLUE                                 0xA107
+#define mmCB_BLEND_ALPHA                                0xA108
+#define mmCB_FOG_RED_R6XX                               0xA109
+#define mmCB_FOG_GREEN_R6XX                             0xA10A
+#define mmCB_FOG_BLUE_R6XX                              0xA10B
+#define mmCB_BLEND_CONTROL                              0xA201
+#define mmCB_COLOR_CONTROL                              0xA202
+#define mmCB_BLEND0_CONTROL                             0xA1E0
+#define mmCB_BLEND1_CONTROL                             0xA1E1
+#define mmCB_BLEND2_CONTROL                             0xA1E2
+#define mmCB_BLEND3_CONTROL                             0xA1E3
+#define mmCB_BLEND4_CONTROL                             0xA1E4
+#define mmCB_BLEND5_CONTROL                             0xA1E5
+#define mmCB_BLEND6_CONTROL                             0xA1E6
+#define mmCB_BLEND7_CONTROL                             0xA1E7
+#define mmCB_CLRCMP_CONTROL                             0xA30C
+#define mmCB_CLRCMP_SRC                                 0xA30D
+#define mmCB_CLRCMP_DST                                 0xA30E
+#define mmCB_CLRCMP_MSK                                 0xA30F
+#define mmCB_COLOR0_BASE                                0xA010
+#define mmCB_COLOR1_BASE                                0xA011
+#define mmCB_COLOR2_BASE                                0xA012
+#define mmCB_COLOR3_BASE                                0xA013
+#define mmCB_COLOR4_BASE                                0xA014
+#define mmCB_COLOR5_BASE                                0xA015
+#define mmCB_COLOR6_BASE                                0xA016
+#define mmCB_COLOR7_BASE                                0xA017
+#define mmCB_COLOR0_SIZE                                0xA018
+#define mmCB_COLOR1_SIZE                                0xA019
+#define mmCB_COLOR2_SIZE                                0xA01A
+#define mmCB_COLOR3_SIZE                                0xA01B
+#define mmCB_COLOR4_SIZE                                0xA01C
+#define mmCB_COLOR5_SIZE                                0xA01D
+#define mmCB_COLOR6_SIZE                                0xA01E
+#define mmCB_COLOR7_SIZE                                0xA01F
+#define mmCB_COLOR0_VIEW                                0xA020
+#define mmCB_COLOR1_VIEW                                0xA021
+#define mmCB_COLOR2_VIEW                                0xA022
+#define mmCB_COLOR3_VIEW                                0xA023
+#define mmCB_COLOR4_VIEW                                0xA024
+#define mmCB_COLOR5_VIEW                                0xA025
+#define mmCB_COLOR6_VIEW                                0xA026
+#define mmCB_COLOR7_VIEW                                0xA027
+#define mmCB_COLOR0_INFO                                0xA028
+#define mmCB_COLOR1_INFO                                0xA029
+#define mmCB_COLOR2_INFO                                0xA02A
+#define mmCB_COLOR3_INFO                                0xA02B
+#define mmCB_COLOR4_INFO                                0xA02C
+#define mmCB_COLOR5_INFO                                0xA02D
+#define mmCB_COLOR6_INFO                                0xA02E
+#define mmCB_COLOR7_INFO                                0xA02F
+#define mmCB_COLOR0_TILE                                0xA030
+#define mmCB_COLOR1_TILE                                0xA031
+#define mmCB_COLOR2_TILE                                0xA032
+#define mmCB_COLOR3_TILE                                0xA033
+#define mmCB_COLOR4_TILE                                0xA034
+#define mmCB_COLOR5_TILE                                0xA035
+#define mmCB_COLOR6_TILE                                0xA036
+#define mmCB_COLOR7_TILE                                0xA037
+#define mmCB_COLOR0_FRAG                                0xA038
+#define mmCB_COLOR1_FRAG                                0xA039
+#define mmCB_COLOR2_FRAG                                0xA03A
+#define mmCB_COLOR3_FRAG                                0xA03B
+#define mmCB_COLOR4_FRAG                                0xA03C
+#define mmCB_COLOR5_FRAG                                0xA03D
+#define mmCB_COLOR6_FRAG                                0xA03E
+#define mmCB_COLOR7_FRAG                                0xA03F
+#define mmCB_COLOR0_MASK                                0xA040
+#define mmCB_COLOR1_MASK                                0xA041
+#define mmCB_COLOR2_MASK                                0xA042
+#define mmCB_COLOR3_MASK                                0xA043
+#define mmCB_COLOR4_MASK                                0xA044
+#define mmCB_COLOR5_MASK                                0xA045
+#define mmCB_COLOR6_MASK                                0xA046
+#define mmCB_COLOR7_MASK                                0xA047
+#define mmCB_CLEAR_RED_R6XX                             0xA048
+#define mmCB_CLEAR_GREEN_R6XX                           0xA049
+#define mmCB_CLEAR_BLUE_R6XX                            0xA04A
+#define mmCB_CLEAR_ALPHA_R6XX                           0xA04B
+#define mmCB_TARGET_MASK                                0xA08E
+#define mmCB_SHADER_MASK                                0xA08F
+#define mmCB_SHADER_CONTROL                             0xA1E8
+
+#define mmSQ_VTX_BASE_VTX_LOC                           0xF3FC
+#define mmSQ_VTX_START_INST_LOC                         0xF3FD
+
+#endif /* _R700_CHIPOFFSET_H_ */
+
diff --git a/src/mesa/drivers/dri/r600/r700_clear.c b/src/mesa/drivers/dri/r600/r700_clear.c
new file mode 100644
index 0000000000..c6546ab00c
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_clear.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ *   CooperYuan <cooper.yuan@amd.com>, <cooperyuan@gmail.com>
+ */
+ 
+#include "main/glheader.h"
+#include "main/context.h"
+#include "main/macros.h"
+#include "main/imports.h"
+#include "main/mtypes.h"
+#include "main/enums.h"
+#include "swrast/swrast.h"
+
+#include "radeon_lock.h"
+#include "r600_context.h"
+
+#include "r700_shaderinst.h"
+#include "r600_emit.h"
+#include "r700_clear.h"
+
+static GLboolean r700ClearFast(context_t *context, GLbitfield mask)
+{
+    /* TODO: fast clear is not implemented yet; this stub ignores its arguments and always reports GL_FALSE. */
+    return GL_FALSE;
+}
+/* Clear the buffers selected by mask, split between radeonUserClear() (tri_mask) and _swrast_Clear() (all other requested bits). */
+void r700Clear(GLcontext * ctx, GLbitfield mask)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    __DRIdrawablePrivate *dPriv = radeon_get_drawable(&context->radeon);
+    const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask);
+    GLbitfield swrast_mask = 0, tri_mask = 0;
+    int i;
+    struct gl_framebuffer *fb = ctx->DrawBuffer;
+
+    radeon_print(RADEON_RENDER, RADEON_VERBOSE, "%s %x\n", __func__, mask);
+    /* Nothing more to do when the fast path reports GL_TRUE. */
+    if( GL_TRUE == r700ClearFast(context, mask) )
+    {
+        return;
+    }
+	if (!context->radeon.radeonScreen->driScreen->dri2.enabled) { /* non-DRI2 only */
+		LOCK_HARDWARE(&context->radeon);
+		UNLOCK_HARDWARE(&context->radeon); /* NOTE(review): lock is taken and released at once, presumably to refresh dPriv - confirm */
+		if (dPriv->numClipRects == 0)
+			return;
+	}
+
+	R600_NEWPRIM(context);
+	/* With every bit of the colour write mask set, all colour buffers in mask are candidates; otherwise only front-left and back-left. */
+	if (colorMask == ~0)
+	  tri_mask |= (mask & BUFFER_BITS_COLOR);
+	else
+	  tri_mask |= (mask & (BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_BACK_LEFT));
+
+
+	/* HW stencil */
+	if (mask & BUFFER_BIT_STENCIL) {
+		tri_mask |= BUFFER_BIT_STENCIL;
+	}
+
+	/* HW depth */
+	if (mask & BUFFER_BIT_DEPTH) {
+    	        tri_mask |= BUFFER_BIT_DEPTH;
+	}
+
+	/* Drop from tri_mask every selected buffer whose renderbuffer has a zero
+	 * ClassID. NOTE(review): Renderbuffer is dereferenced without a NULL check.
+	 */
+
+	for (i = 0; i < BUFFER_COUNT; i++) {
+	  GLuint bufBit = 1 << i;
+	  if ((tri_mask) & bufBit) {
+	    if (!fb->Attachment[i].Renderbuffer->ClassID) {
+	      tri_mask &= ~bufBit;
+	      swrast_mask |= bufBit;
+	    }
+	  }
+	}
+
+	/* SW fallback clearing: every requested bit not left in tri_mask (this overwrites the bits gathered in the loop above) */
+	swrast_mask = mask & ~tri_mask;
+
+	if (tri_mask) {
+		radeonUserClear(ctx, tri_mask);
+	}
+
+	if (swrast_mask) {
+		radeon_print(RADEON_FALLBACKS, RADEON_IMPORTANT, "%s: swrast clear, mask: %x\n",
+				__FUNCTION__, swrast_mask);
+		_swrast_Clear(ctx, swrast_mask);
+	}
+
+}
+
+
diff --git a/src/mesa/drivers/dri/r600/r700_clear.h b/src/mesa/drivers/dri/r600/r700_clear.h
new file mode 100644
index 0000000000..bed1d3a90e
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_clear.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ *   CooperYuan <cooper.yuan@amd.com>, <cooperyuan@gmail.com>
+ */
+
+#ifndef __r700_CLEAR_H__
+#define __r700_CLEAR_H__
+/* Clears the buffers selected by mask; defined in r700_clear.c. */
+extern void r700Clear(GLcontext * ctx, GLbitfield mask);
+
+#endif /* __r700_CLEAR_H__ */
diff --git a/src/mesa/drivers/dri/r600/r700_debug.c b/src/mesa/drivers/dri/r600/r700_debug.c
new file mode 100644
index 0000000000..cd1ba9eca3
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_debug.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ *   CooperYuan <cooper.yuan@amd.com>, <cooperyuan@gmail.com>
+ */
+
+#include "r700_debug.h"
+#include "radeon_debug.h"
+/* Print a shader binary as rows of four hex words through radeon_print(); type is an R700_DUMP_TYPE value, size a count of unsigned int words. */
+void DumpHwBinary(int type, void *addr, int size)
+{
+    int i;
+    unsigned int *pHw = (unsigned int *)addr;
+
+    return; /* dumping is switched off: everything below is unreachable until this line is removed */
+
+    switch (type)
+    {
+        case DUMP_PIXEL_SHADER:
+            radeon_print(RADEON_SHADER, RADEON_TRACE, "Pixel Shader\n");
+        break;
+        case DUMP_VERTEX_SHADER:
+            radeon_print(RADEON_SHADER, RADEON_TRACE, "Vertex Shader\n");
+        break;
+        case DUMP_FETCH_SHADER:
+            radeon_print(RADEON_SHADER, RADEON_TRACE, "Fetch Shader\n");
+        break;
+    }
+    /* Each word is printed exactly once; only a line break is emitted after every fourth word. */
+    for (i = 0; i < size; i++)
+    {
+        radeon_print(RADEON_SHADER, RADEON_TRACE, "0x%08x,\t", *pHw);
+        if (i%4 == 3)
+            radeon_print(RADEON_SHADER, RADEON_TRACE, "\n");
+        pHw++;
+
+    }
+}
+
diff --git a/src/mesa/drivers/dri/r600/r700_debug.h b/src/mesa/drivers/dri/r600/r700_debug.h
new file mode 100644
index 0000000000..c0921bf610
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_debug.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ *   CooperYuan <cooper.yuan@amd.com>, <cooperyuan@gmail.com>
+ */
+
+#ifndef _R700_DEBUG_H_
+#define _R700_DEBUG_H_
+enum R700_DUMP_TYPE /* shader kinds recognised by DumpHwBinary() for its heading line */
+{
+	DUMP_VERTEX_SHADER      = 0x1,
+	DUMP_PIXEL_SHADER       = 0x2,
+	DUMP_FETCH_SHADER       = 0x4,
+};
+/* DumpHwBinary(type, addr, size): type is an R700_DUMP_TYPE value, addr points at the binary, size is its length in unsigned int words. */
+extern void DumpHwBinary(int, void *, int);
+
+#endif /*_R700_DEBUG_H_*/
diff --git a/src/mesa/drivers/dri/r600/r700_driconf.h b/src/mesa/drivers/dri/r600/r700_driconf.h
new file mode 100644
index 0000000000..a9e2152344
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_driconf.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+#ifndef _R700_DRICONF_H_
+#define _R700_DRICONF_H_
+/* NOTE(review): presumably the two values of a fragment-program optimization driconf option - confirm against the option table. */
+#define DRI_CONF_FP_OPTIMIZATION_SPEED   0
+#define DRI_CONF_FP_OPTIMIZATION_QUALITY 1
+
+#endif /* _R700_DRICONF_H_ */
diff --git a/src/mesa/drivers/dri/r600/r700_fragprog.c b/src/mesa/drivers/dri/r600/r700_fragprog.c
new file mode 100644
index 0000000000..78ce3ae436
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_fragprog.c
@@ -0,0 +1,476 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ *   CooperYuan <cooper.yuan@amd.com>, <cooperyuan@gmail.com>
+ */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "main/imports.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_statevars.h"
+
+#include "r600_context.h"
+#include "r600_cmdbuf.h"
+
+#include "r700_fragprog.h"
+
+#include "r700_debug.h"
+
+// Assigns register numbers to the fragment program's inputs, temporaries and outputs. TODO: validate FP input against VP output.
+void Map_Fragment_Program(r700_AssemblerBase         *pAsm,
+						  struct gl_fragment_program *mesa_fp)
+{
+	unsigned int unBit;
+    unsigned int i;
+    GLuint       ui;
+
+	pAsm->number_used_registers = 0;
+
+// Input mapping: each attribute flagged in mesa_fp->Base.InputsRead gets the next free register.
+	// The flags are set in parse_attrib_binding (FRAG_ATTRIB_COLx, FRAG_ATTRIB_TEXx, ...).
+	// The order used here MUST match the order in Map_Vertex_Output.
+	unBit = 1 << FRAG_ATTRIB_WPOS;
+	if(mesa_fp->Base.InputsRead & unBit)
+	{
+		pAsm->uiFP_AttributeMap[FRAG_ATTRIB_WPOS] = pAsm->number_used_registers++;
+	}
+
+	unBit = 1 << FRAG_ATTRIB_COL0;
+	if(mesa_fp->Base.InputsRead & unBit)
+	{
+		pAsm->uiFP_AttributeMap[FRAG_ATTRIB_COL0] = pAsm->number_used_registers++;
+	}
+
+	unBit = 1 << FRAG_ATTRIB_COL1;
+	if(mesa_fp->Base.InputsRead & unBit)
+	{
+		pAsm->uiFP_AttributeMap[FRAG_ATTRIB_COL1] = pAsm->number_used_registers++;
+	}
+
+        unBit = 1 << FRAG_ATTRIB_FOGC;
+        if(mesa_fp->Base.InputsRead & unBit)
+        {
+                pAsm->uiFP_AttributeMap[FRAG_ATTRIB_FOGC] = pAsm->number_used_registers++;
+        }
+
+	for(i=0; i<8; i++)
+	{
+		unBit = 1 << (FRAG_ATTRIB_TEX0 + i);
+		if(mesa_fp->Base.InputsRead & unBit)
+		{
+			pAsm->uiFP_AttributeMap[FRAG_ATTRIB_TEX0 + i] = pAsm->number_used_registers++;
+		}
+	}
+
+/* Map temporary registers (GPRs): reserve the larger of NumNativeTemporaries and NumTemporaries. */
+    pAsm->starting_temp_register_number = pAsm->number_used_registers;
+
+    if(mesa_fp->Base.NumNativeTemporaries >= mesa_fp->Base.NumTemporaries)
+    {
+	    pAsm->number_used_registers += mesa_fp->Base.NumNativeTemporaries;
+    }
+    else
+    {
+        pAsm->number_used_registers += mesa_fp->Base.NumTemporaries;
+    }
+
+/* Output mapping: the colour and depth results get the registers that follow the temporaries. */
+	pAsm->number_of_exports = 0;
+	pAsm->number_of_colorandz_exports = 0; /* don't include stencil and mask out. */
+	pAsm->starting_export_register_number = pAsm->number_used_registers;
+	unBit = 1 << FRAG_RESULT_COLOR;
+	if(mesa_fp->Base.OutputsWritten & unBit)
+	{
+		pAsm->uiFP_OutputMap[FRAG_RESULT_COLOR] = pAsm->number_used_registers++;
+		pAsm->number_of_exports++;
+		pAsm->number_of_colorandz_exports++;
+	}
+	unBit = 1 << FRAG_RESULT_DEPTH;
+	if(mesa_fp->Base.OutputsWritten & unBit)
+	{
+        pAsm->depth_export_register_number = pAsm->number_used_registers;
+		pAsm->uiFP_OutputMap[FRAG_RESULT_DEPTH] = pAsm->number_used_registers++;
+		pAsm->number_of_exports++;
+		pAsm->number_of_colorandz_exports++;
+		pAsm->pR700Shader->depthIsExported = 1;
+	}
+    /* One mask byte per export, all cleared. NOTE(review): the MALLOC result is not checked, and number_of_exports may be 0. */
+    pAsm->pucOutMask = (unsigned char*) MALLOC(pAsm->number_of_exports);
+    for(ui=0; ui<pAsm->number_of_exports; ui++)
+    {
+        pAsm->pucOutMask[ui] = 0x0;
+    }
+    /* uFirstHelpReg is the first register number past everything mapped above. */
+    pAsm->uFirstHelpReg = pAsm->number_used_registers;
+}
+
+GLboolean Find_Instruction_Dependencies_fp(struct r700_fragment_program *fp,
+					                	struct gl_fragment_program   *mesa_fp)
+{
+    GLuint i, j;
+    GLint * puiTEMPwrites;
+    struct prog_instruction * pILInst;
+    InstDeps         *pInstDeps;
+    struct prog_instruction * texcoord_DepInst;
+    GLint              nDepInstID;
+    /* Builds fp->r700AsmCode.pInstDeps: per instruction, the index of the last writer of each TEMPORARY source (-1 if none). Always returns GL_TRUE. */
+    puiTEMPwrites = (GLint*) MALLOC(sizeof(GLuint)*mesa_fp->Base.NumTemporaries);
+    for(i=0; i<mesa_fp->Base.NumTemporaries; i++)
+    {
+        puiTEMPwrites[i] = -1;
+    }
+    /* NOTE(review): neither MALLOC result is checked, and the scratch table above is sized with GLuint although it holds GLint. */
+    pInstDeps = (InstDeps*)MALLOC(sizeof(InstDeps)*mesa_fp->Base.NumInstructions);
+
+    for(i=0; i<mesa_fp->Base.NumInstructions; i++)
+    {
+        pInstDeps[i].nDstDep = -1;
+        pILInst = &(mesa_fp->Base.Instructions[i]);
+
+        //Dst: remember this instruction as the latest writer of its temp
+        if(pILInst->DstReg.File == PROGRAM_TEMPORARY)
+        {
+            //Set lastwrite for the temp
+            puiTEMPwrites[pILInst->DstReg.Index] = i;
+        }
+        /* NOTE(review): the write is recorded before the sources are read, so an instruction reading its own destination temp depends on itself - confirm intended. */
+        //Src: look at the three source operand slots
+        for(j=0; j<3; j++)
+        {
+            if(pILInst->SrcReg[j].File == PROGRAM_TEMPORARY)
+            {
+                //Dependency is the index of the last writer of that temp, or -1 if there is none yet
+                pInstDeps[i].nSrcDeps[j] = puiTEMPwrites[pILInst->SrcReg[j].Index];
+            }
+            else
+            {
+                pInstDeps[i].nSrcDeps[j] = -1;
+            }
+        }
+    }
+
+    fp->r700AsmCode.pInstDeps = pInstDeps;
+    /* The dependency table stays attached to the assembler state; only the scratch table is freed here. */
+    FREE(puiTEMPwrites);
+
+    //Second pass: mark destination dependencies for texture instructions
+    for(i=0; i<mesa_fp->Base.NumInstructions; i++)
+    {
+        pILInst = &(mesa_fp->Base.Instructions[i]);
+
+        if(GL_TRUE == IsTex(pILInst->Opcode))
+        {   //src0 is the tex coord register, src1 is texunit, src2 is textype
+            nDepInstID = pInstDeps[i].nSrcDeps[0];
+            if(nDepInstID >= 0)
+            {
+                texcoord_DepInst = &(mesa_fp->Base.Instructions[nDepInstID]);
+                if(GL_TRUE == IsAlu(texcoord_DepInst->Opcode) )
+                {
+                    pInstDeps[nDepInstID].nDstDep = i;
+                    pInstDeps[i].nDstDep = i;
+                }
+                else if(GL_TRUE == IsTex(texcoord_DepInst->Opcode) )
+                {
+                    pInstDeps[i].nDstDep = i;
+                }
+                else
+                {   //... other deps?
+                }
+            }
+        }
+	}
+
+    return GL_TRUE;
+}
+
+GLboolean r700TranslateFragmentShader(struct r700_fragment_program *fp,
+							     struct gl_fragment_program   *mesa_fp)
+{
+	GLuint    number_of_colors_exported;
+	GLboolean z_enabled = GL_FALSE;
+	GLuint    unBit;
+    /* Translates mesa_fp into fp->r700Shader; returns GL_FALSE as soon as one stage fails, otherwise sets fp->translated and returns GL_TRUE. */
+    //Init_Program: reset the assembler state and map registers
+	Init_r700_AssemblerBase( SPT_FP, &(fp->r700AsmCode), &(fp->r700Shader) );
+	Map_Fragment_Program(&(fp->r700AsmCode), mesa_fp);
+
+    if( GL_FALSE == Find_Instruction_Dependencies_fp(fp, mesa_fp) )
+	{
+		return GL_FALSE;
+    }
+	
+	if( GL_FALSE == AssembleInstr(mesa_fp->Base.NumInstructions,
+                                  &(mesa_fp->Base.Instructions[0]), 
+                                  &(fp->r700AsmCode)) )
+	{
+		return GL_FALSE;
+	}
+
+    if(GL_FALSE == Process_Fragment_Exports(&(fp->r700AsmCode), mesa_fp->Base.OutputsWritten) )
+    {
+        return GL_FALSE;
+    }
+    /* nRegs is the register count minus one, or 0 when no register is used. */
+    fp->r700Shader.nRegs = (fp->r700AsmCode.number_used_registers == 0) ? 0 
+                         : (fp->r700AsmCode.number_used_registers - 1);
+
+	fp->r700Shader.nParamExports = fp->r700AsmCode.number_of_exports;
+
+	number_of_colors_exported = fp->r700AsmCode.number_of_colorandz_exports;
+	/* A depth export is included in number_of_colorandz_exports; take it back out of the colour count. */
+	unBit = 1 << FRAG_RESULT_DEPTH;
+	if(mesa_fp->Base.OutputsWritten & unBit)
+	{
+		z_enabled = GL_TRUE;
+		number_of_colors_exported--;
+	}
+	/* exportMode: colour export count shifted left by one, with the depth flag in bit 0. */
+	fp->r700Shader.exportMode = number_of_colors_exported << 1 | z_enabled;
+
+    fp->translated = GL_TRUE;
+
+	return GL_TRUE;
+}
+/* Flags the current fragment program as R6xx when the chip family is below RV770 and translates it if that has not been done yet. */
+void r700SelectFragmentShader(GLcontext *ctx)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    struct r700_fragment_program *fp = (struct r700_fragment_program *)
+	    (ctx->FragmentProgram._Current);
+    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)
+    {
+	    fp->r700AsmCode.bR6xx = 1;
+    }
+    /* NOTE(review): the GLboolean result of r700TranslateFragmentShader() is not checked here. */
+    if (GL_FALSE == fp->translated)
+	    r700TranslateFragmentShader(fp, &(fp->mesa_program));
+}
+/* Returns the shaderbo member of the fragment program in ctx->FragmentProgram._Current. */
+void * r700GetActiveFpShaderBo(GLcontext * ctx)
+{
+    struct r700_fragment_program *fp = (struct r700_fragment_program *)
+	                                   (ctx->FragmentProgram._Current);
+
+    return fp->shaderbo;
+}
+
+GLboolean r700SetupFragmentProgram(GLcontext * ctx) /* Upload the current fragment program on first use, then fill the PS/SPI/DB/CB register shadows and PS constants; returns GL_FALSE if the program has more than R700_MAX_DX9_CONSTS parameters, GL_TRUE otherwise. */
+{
+    context_t *context = R700_CONTEXT(ctx);
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+    struct r700_fragment_program *fp = (struct r700_fragment_program *)
+	                                   (ctx->FragmentProgram._Current);
+    r700_AssemblerBase         *pAsm = &(fp->r700AsmCode);
+    struct gl_fragment_program *mesa_fp = &(fp->mesa_program);
+    struct gl_program_parameter_list *paramList;
+    unsigned int unNumParamData;
+    unsigned int ui, i;
+    unsigned int unNumOfReg;
+    unsigned int unBit;
+    GLuint exportCount;
+
+    if(GL_FALSE == fp->loaded) /* first use: assemble if flagged, emit the binary, then mark as loaded */
+    {
+	    if(fp->r700Shader.bNeedsAssembly == GL_TRUE)
+	    {
+		    Assemble( &(fp->r700Shader) );
+	    }
+
+        /* Load fp to gpu */
+        r600EmitShader(ctx,
+                       &(fp->shaderbo),
+                       (GLvoid *)(fp->r700Shader.pProgram),
+                       fp->r700Shader.uShaderBinaryDWORDSize,
+                       "FS");
+
+        fp->loaded = GL_TRUE;
+    }
+
+    DumpHwBinary(DUMP_PIXEL_SHADER, (GLvoid *)(fp->r700Shader.pProgram),
+                 fp->r700Shader.uShaderBinaryDWORDSize);
+
+    /* TODO : enable this once MemUse is fixed:
+    (context->chipobj.MemUse)(context, fp->shadercode.buf->id);
+    */
+
+    R600_STATECHANGE(context, ps);
+
+    r700->ps.SQ_PGM_RESOURCES_PS.u32All = 0;
+    SETbit(r700->ps.SQ_PGM_RESOURCES_PS.u32All, PGM_RESOURCES__PRIME_CACHE_ON_DRAW_bit);
+
+    r700->ps.SQ_PGM_START_PS.u32All = 0; /* set from buffer obj */
+
+    R600_STATECHANGE(context, spi);
+
+    unNumOfReg = fp->r700Shader.nRegs + 1;
+
+    ui = (r700->SPI_PS_IN_CONTROL_0.u32All & NUM_INTERP_mask) / (1 << NUM_INTERP_shift); /* extract the current NUM_INTERP field value */
+
+    /* PS uses fragment.position */
+    if (mesa_fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS))
+    {
+        ui += 1;
+        SETfield(r700->SPI_PS_IN_CONTROL_0.u32All, ui, NUM_INTERP_shift, NUM_INTERP_mask);
+        SETfield(r700->SPI_PS_IN_CONTROL_0.u32All, CENTERS_ONLY, BARYC_SAMPLE_CNTL_shift, BARYC_SAMPLE_CNTL_mask);
+        SETbit(r700->SPI_PS_IN_CONTROL_0.u32All, POSITION_ENA_bit);
+        SETbit(r700->SPI_INPUT_Z.u32All, PROVIDE_Z_TO_SPI_bit);
+    }
+
+    ui = (unNumOfReg < ui) ? ui : unNumOfReg; /* NUM_GPRS gets the larger of the two counts */
+
+    SETfield(r700->ps.SQ_PGM_RESOURCES_PS.u32All, ui, NUM_GPRS_shift, NUM_GPRS_mask);
+
+    CLEARbit(r700->ps.SQ_PGM_RESOURCES_PS.u32All, UNCACHED_FIRST_INST_bit);
+
+    if(fp->r700Shader.uStackSize) /* we don't use branch for now, it should be zero. */
+	{
+        SETfield(r700->ps.SQ_PGM_RESOURCES_PS.u32All, fp->r700Shader.uStackSize,
+                 STACK_SIZE_shift, STACK_SIZE_mask);
+    }
+
+    SETfield(r700->ps.SQ_PGM_EXPORTS_PS.u32All, fp->r700Shader.exportMode,
+             EXPORT_MODE_shift, EXPORT_MODE_mask);
+
+    R600_STATECHANGE(context, db);
+
+    if(fp->r700Shader.killIsUsed)
+    {
+	    SETbit(r700->DB_SHADER_CONTROL.u32All, KILL_ENABLE_bit);
+    }
+    else
+    {
+        CLEARbit(r700->DB_SHADER_CONTROL.u32All, KILL_ENABLE_bit);
+    }
+
+    if(fp->r700Shader.depthIsExported)
+    {
+	    SETbit(r700->DB_SHADER_CONTROL.u32All, Z_EXPORT_ENABLE_bit);
+    }
+    else
+    {
+        CLEARbit(r700->DB_SHADER_CONTROL.u32All, Z_EXPORT_ENABLE_bit);
+    }
+
+    // emit ps input map: one SPI_PS_INPUT_CNTL entry per fragment input that the program reads
+    unBit = 1 << FRAG_ATTRIB_WPOS;
+    if(mesa_fp->Base.InputsRead & unBit)
+    {
+            ui = pAsm->uiFP_AttributeMap[FRAG_ATTRIB_WPOS]; /* mapped slot doubles as register index and SEMANTIC value */
+            SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, SEL_CENTROID_bit);
+            SETfield(r700->SPI_PS_INPUT_CNTL[ui].u32All, ui,
+                     SEMANTIC_shift, SEMANTIC_mask);
+            if (r700->SPI_INTERP_CONTROL_0.u32All & FLAT_SHADE_ENA_bit)
+                    SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+            else
+                    CLEARbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+    }
+
+    unBit = 1 << FRAG_ATTRIB_COL0;
+    if(mesa_fp->Base.InputsRead & unBit)
+    {
+	    ui = pAsm->uiFP_AttributeMap[FRAG_ATTRIB_COL0];
+	    SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, SEL_CENTROID_bit);
+	    SETfield(r700->SPI_PS_INPUT_CNTL[ui].u32All, ui,
+		     SEMANTIC_shift, SEMANTIC_mask);
+	    if (r700->SPI_INTERP_CONTROL_0.u32All & FLAT_SHADE_ENA_bit)
+		    SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+	    else
+		    CLEARbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+    }
+
+    unBit = 1 << FRAG_ATTRIB_COL1;
+    if(mesa_fp->Base.InputsRead & unBit)
+    {
+	    ui = pAsm->uiFP_AttributeMap[FRAG_ATTRIB_COL1];
+	    SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, SEL_CENTROID_bit);
+	    SETfield(r700->SPI_PS_INPUT_CNTL[ui].u32All, ui,
+		     SEMANTIC_shift, SEMANTIC_mask);
+	    if (r700->SPI_INTERP_CONTROL_0.u32All & FLAT_SHADE_ENA_bit)
+		    SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+	    else
+		    CLEARbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+    }
+
+    unBit = 1 << FRAG_ATTRIB_FOGC;
+    if(mesa_fp->Base.InputsRead & unBit)
+    {
+            ui = pAsm->uiFP_AttributeMap[FRAG_ATTRIB_FOGC];
+            SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, SEL_CENTROID_bit);
+            SETfield(r700->SPI_PS_INPUT_CNTL[ui].u32All, ui,
+                     SEMANTIC_shift, SEMANTIC_mask);
+            if (r700->SPI_INTERP_CONTROL_0.u32All & FLAT_SHADE_ENA_bit)
+                    SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+            else
+                    CLEARbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit);
+    }
+
+    for(i=0; i<8; i++) /* FRAG_ATTRIB_TEX0 .. FRAG_ATTRIB_TEX0 + 7 */
+    {
+	    unBit = 1 << (FRAG_ATTRIB_TEX0 + i);
+	    if(mesa_fp->Base.InputsRead & unBit)
+	    {
+		    ui = pAsm->uiFP_AttributeMap[FRAG_ATTRIB_TEX0 + i];
+		    SETbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, SEL_CENTROID_bit);
+		    SETfield(r700->SPI_PS_INPUT_CNTL[ui].u32All, ui,
+			     SEMANTIC_shift, SEMANTIC_mask);
+		    CLEARbit(r700->SPI_PS_INPUT_CNTL[ui].u32All, FLAT_SHADE_bit); /* unlike the inputs above, FLAT_SHADE is always cleared here */
+	    }
+    }
+
+    R600_STATECHANGE(context, cb);
+    exportCount = (r700->ps.SQ_PGM_EXPORTS_PS.u32All & EXPORT_MODE_mask) / (1 << EXPORT_MODE_shift); /* read back the EXPORT_MODE field written above */
+    r700->CB_SHADER_CONTROL.u32All = (1 << exportCount) - 1; /* mask with the low exportCount bits set */
+
+    /* send out shader constants. */
+    paramList = fp->mesa_program.Base.Parameters;
+
+    if(NULL != paramList) {
+	    _mesa_load_state_parameters(ctx, paramList);
+
+	    if (paramList->NumParameters > R700_MAX_DX9_CONSTS)
+		    return GL_FALSE; /* too many constants: bail out before ps.num_consts / ps.consts are updated */
+
+	    R600_STATECHANGE(context, ps_consts);
+
+	    r700->ps.num_consts = paramList->NumParameters;
+
+	    unNumParamData = paramList->NumParameters;
+
+	    for(ui=0; ui<unNumParamData; ui++) { /* copy each 4-component parameter value into the PS constant shadow */
+		    r700->ps.consts[ui][0].f32All = paramList->ParameterValues[ui][0];
+		    r700->ps.consts[ui][1].f32All = paramList->ParameterValues[ui][1];
+		    r700->ps.consts[ui][2].f32All = paramList->ParameterValues[ui][2];
+		    r700->ps.consts[ui][3].f32All = paramList->ParameterValues[ui][3];
+	    }
+    } else
+	    r700->ps.num_consts = 0;
+
+    return GL_TRUE;
+}
+
diff --git a/src/mesa/drivers/dri/r600/r700_fragprog.h b/src/mesa/drivers/dri/r600/r700_fragprog.h
new file mode 100644
index 0000000000..cbb108d212
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_fragprog.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+#ifndef _R700_FRAGPROG_H_
+#define _R700_FRAGPROG_H_
+
+#include "r600_context.h"
+#include "r700_assembler.h"
+
+struct r700_fragment_program /* r700 driver wrapper around a Mesa fragment program */
+{
+	struct gl_fragment_program mesa_program; /* NOTE(review): keep as first member — driver code casts Mesa program pointers to this struct */
+
+    r700_AssemblerBase r700AsmCode; /* assembler state (e.g. bR6xx flag, uiFP_AttributeMap) */
+	R700_Shader        r700Shader; /* shader data; pProgram / uShaderBinaryDWORDSize are what gets emitted */
+
+	GLboolean translated; /* GL_FALSE until the program has been translated */
+    GLboolean loaded; /* GL_FALSE until the binary has been emitted via r600EmitShader */
+	GLboolean error; /* NOTE(review): usage not visible in this part of the patch */
+
+    void * shaderbo; /* shader buffer object handle filled by r600EmitShader, released by r600DeleteShader */
+
+	GLboolean WritesDepth; /* NOTE(review): usage not visible in this part of the patch */
+	GLuint optimization; /* NOTE(review): usage not visible in this part of the patch */
+};
+
+/* Internal */
+void Map_Fragment_Program(r700_AssemblerBase         *pAsm,
+			  struct gl_fragment_program *mesa_fp);
+GLboolean Find_Instruction_Dependencies_fp(struct r700_fragment_program *fp,
+					   struct gl_fragment_program   *mesa_fp);
+
+GLboolean r700TranslateFragmentShader(struct r700_fragment_program *fp, /* driver wrapper to fill in */
+				      struct gl_fragment_program   *mesa_fp); /* Mesa fragment program; callers pass &fp->mesa_program */
+
+/* Interface */
+extern void r700SelectFragmentShader(GLcontext *ctx);
+
+extern GLboolean r700SetupFragmentProgram(GLcontext * ctx);
+
+extern void *    r700GetActiveFpShaderBo(GLcontext * ctx);
+
+#endif /*_R700_FRAGPROG_H_*/
diff --git a/src/mesa/drivers/dri/r600/r700_ioctl.c b/src/mesa/drivers/dri/r600/r700_ioctl.c
new file mode 100644
index 0000000000..72a8978976
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_ioctl.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+#include <sched.h>
+#include <errno.h>
+
+#include "main/glheader.h"
+#include "main/imports.h"
+#include "main/macros.h"
+#include "main/context.h"
+#include "main/simple_list.h"
+#include "swrast/swrast.h"
+
+#include "radeon_common.h"
+#include "radeon_lock.h"
+#include "r600_context.h"
+
+#include "r700_ioctl.h"
+#include "r700_clear.h"
+
+
+void r700InitIoctlFuncs(struct dd_function_table *functions) /* Install the Clear, Finish and Flush driver hooks into the given function table. */
+{
+	functions->Clear = r700Clear;
+	functions->Finish = radeonFinish; /* Finish and Flush use the shared radeon implementations */
+	functions->Flush = radeonFlush;
+}
diff --git a/src/mesa/drivers/dri/r600/r700_ioctl.h b/src/mesa/drivers/dri/r600/r700_ioctl.h
new file mode 100644
index 0000000000..414dc3e23e
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_ioctl.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+#ifndef __R700_IOCTL_H__
+#define __R700_IOCTL_H__
+
+#include "r600_context.h"
+#include "radeon_drm.h"
+
+extern void r700InitIoctlFuncs(struct dd_function_table *functions);
+
+#endif				/* __R700_IOCTL_H__ */
diff --git a/src/mesa/drivers/dri/r600/r700_oglprog.c b/src/mesa/drivers/dri/r600/r700_oglprog.c
new file mode 100644
index 0000000000..5290ef31be
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_oglprog.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+#include <string.h>
+
+#include "main/glheader.h"
+#include "main/imports.h"
+
+#include "shader/program.h"
+#include "tnl/tnl.h"
+
+#include "r600_context.h"
+#include "r600_emit.h"
+
+#include "r700_oglprog.h"
+#include "r700_fragprog.h"
+#include "r700_vertprog.h"
+
+
+static struct gl_program *r700NewProgram(GLcontext * ctx,
+                                         GLenum target,
+                                         GLuint id)
+{
+    /* Driver NewProgram hook: allocate the r700 wrapper for target and init its
+     * embedded Mesa program; returns NULL on bad target or failed allocation. */
+    struct gl_program *pProgram = NULL;
+    struct r700_vertex_program_cont *vpc;
+    struct r700_fragment_program *fp;
+
+    radeon_print(RADEON_SHADER, RADEON_VERBOSE,
+                 "%s %u, %u\n", __func__, target, id);
+
+    switch (target)
+    {
+    case GL_VERTEX_STATE_PROGRAM_NV:
+    case GL_VERTEX_PROGRAM_ARB:
+        vpc = CALLOC_STRUCT(r700_vertex_program_cont);
+        if (vpc == NULL)
+            break; /* out of memory: pProgram stays NULL */
+        pProgram = _mesa_init_vertex_program(ctx, &vpc->mesa_program,
+                                             target, id);
+        break;
+    case GL_FRAGMENT_PROGRAM_NV:
+    case GL_FRAGMENT_PROGRAM_ARB:
+        fp = CALLOC_STRUCT(r700_fragment_program);
+        if (fp == NULL)
+            break; /* out of memory: never dereference fp */
+        pProgram = _mesa_init_fragment_program(ctx, &fp->mesa_program,
+                                               target, id);
+        /* Not translated or uploaded yet; no shader buffer object. */
+        fp->translated = GL_FALSE;
+        fp->loaded     = GL_FALSE;
+        fp->shaderbo   = NULL;
+        break;
+    default:
+        _mesa_problem(ctx, "Bad target in r700NewProgram");
+    }
+
+    return pProgram;
+}
+
+static void r700DeleteProgram(GLcontext * ctx, struct gl_program *prog)
+{
+    /* Driver DeleteProgram hook: free the r700 shader state attached to prog,
+     * then hand prog itself to _mesa_delete_program(). */
+    struct r700_vertex_program_cont *vpc;
+    struct r700_vertex_program *vp, *tmp;
+    struct r700_fragment_program *fp;
+
+    radeon_print(RADEON_SHADER, RADEON_VERBOSE,
+                 "%s %p\n", __func__, prog);
+
+    switch (prog->Target)
+    {
+    case GL_VERTEX_STATE_PROGRAM_NV:
+    case GL_VERTEX_PROGRAM_ARB:
+        vpc = (struct r700_vertex_program_cont *)prog;
+        vp = vpc->progs;
+        while (vp) {
+            tmp = vp->next; /* saved before vp is freed */
+            /* Release DMA region */
+            r600DeleteShader(ctx, vp->shaderbo);
+            /* Clean up */
+            Clean_Up_Assembler(&(vp->r700AsmCode));
+            Clean_Up_Shader(&(vp->r700Shader));
+            _mesa_free(vp);
+            vp = tmp;
+        }
+        break;
+    case GL_FRAGMENT_PROGRAM_NV:
+    case GL_FRAGMENT_PROGRAM_ARB:
+        fp = (struct r700_fragment_program *)prog;
+        /* Release DMA region */
+        r600DeleteShader(ctx, fp->shaderbo);
+        /* Clean up */
+        Clean_Up_Assembler(&(fp->r700AsmCode));
+        Clean_Up_Shader(&(fp->r700Shader));
+        break;
+    default:
+        /* Unknown target: report it, but still delete prog below. */
+        _mesa_problem(ctx, "Bad target in r700DeleteProgram");
+        break;
+    }
+
+    _mesa_delete_program(ctx, prog);
+}
+
+static void
+r700ProgramStringNotify(GLcontext * ctx, GLenum target, struct gl_program *prog) /* ProgramStringNotify hook: currently a no-op, all arguments are ignored. */
+{
+
+}
+
+static GLboolean r700IsProgramNative(GLcontext * ctx, GLenum target, struct gl_program *prog) /* IsProgramNative hook: unconditionally reports GL_TRUE; the arguments are not inspected. */
+{
+
+	return GL_TRUE;
+}
+
+void r700InitShaderFuncs(struct dd_function_table *functions) /* Install this file's program-management hooks into the driver function table. */
+{
+	functions->NewProgram = r700NewProgram;
+	functions->DeleteProgram = r700DeleteProgram;
+	functions->ProgramStringNotify = r700ProgramStringNotify;
+	functions->IsProgramNative = r700IsProgramNative;
+}
diff --git a/src/mesa/drivers/dri/r600/r700_oglprog.h b/src/mesa/drivers/dri/r600/r700_oglprog.h
new file mode 100644
index 0000000000..fe2e9d1974
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_oglprog.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ *   CooperYuan <cooper.yuan@amd.com>, <cooperyuan@gmail.com>
+ */
+
+#ifndef _R700_OGLPROG_H_
+#define _R700_OGLPROG_H_
+#include "r600_context.h"
+
+extern void r700InitShaderFuncs(struct dd_function_table *functions);
+
+#endif /*_R700_OGLPROG_H_*/
diff --git a/src/mesa/drivers/dri/r600/r700_render.c b/src/mesa/drivers/dri/r600/r700_render.c
new file mode 100644
index 0000000000..b1c3648ca5
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_render.c
@@ -0,0 +1,480 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ *   CooperYuan <cooper.yuan@amd.com>, <cooperyuan@gmail.com>
+ */
+
+#include "main/glheader.h"
+#include "main/state.h"
+#include "main/imports.h"
+#include "main/enums.h"
+#include "main/macros.h"
+#include "main/context.h"
+#include "main/dd.h"
+#include "main/simple_list.h"
+#include "main/api_arrayelt.h"
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "vbo/vbo.h"
+
+#include "tnl/tnl.h"
+#include "tnl/t_vp_build.h"
+#include "tnl/t_context.h"
+#include "tnl/t_vertex.h"
+#include "tnl/t_pipeline.h"
+
+#include "r600_context.h"
+#include "r600_cmdbuf.h"
+
+#include "r600_tex.h"
+
+#include "r700_vertprog.h"
+#include "r700_fragprog.h"
+#include "r700_state.h"
+
+#include "radeon_common_context.h"
+
+void r700WaitForIdle(context_t *context);
+void r700WaitForIdleClean(context_t *context);
+GLboolean r700SendTextureState(context_t *context);
+static unsigned int r700PrimitiveType(int prim);
+void r600UpdateTextureState(GLcontext * ctx);
+GLboolean r700SyncSurf(context_t *context,
+		       struct radeon_bo *pbo,
+		       uint32_t read_domain,
+		       uint32_t write_domain,
+		       uint32_t sync_type);
+
+void r700WaitForIdle(context_t *context) /* Emit and commit a 3-dword batch that writes WAIT_3D_IDLE_bit to the WAIT_UNTIL config register. */
+{
+    BATCH_LOCALS(&context->radeon);
+    radeon_print(RADEON_RENDER | RADEON_STATE, RADEON_TRACE, "%s\n", __func__);
+    BEGIN_BATCH_NO_AUTOSTATE(3); /* packet header + register offset + value */
+
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CONFIG_REG, 1));
+    R600_OUT_BATCH(mmWAIT_UNTIL - ASIC_CONFIG_BASE_INDEX);
+    R600_OUT_BATCH(WAIT_3D_IDLE_bit);
+
+    END_BATCH();
+    COMMIT_BATCH();
+}
+
+void r700WaitForIdleClean(context_t *context) /* Emit a CACHE_FLUSH_AND_INV_EVENT, then a WAIT_UNTIL write with the idle and idle-clean bits set. */
+{
+    BATCH_LOCALS(&context->radeon);
+    radeon_print(RADEON_RENDER | RADEON_STATE, RADEON_TRACE, "%s\n", __func__);
+    BEGIN_BATCH_NO_AUTOSTATE(5); /* 2 dwords for the event write + 3 for the register write */
+
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_EVENT_WRITE, 0));
+    R600_OUT_BATCH(CACHE_FLUSH_AND_INV_EVENT);
+
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CONFIG_REG, 1));
+    R600_OUT_BATCH(mmWAIT_UNTIL - ASIC_CONFIG_BASE_INDEX);
+    R600_OUT_BATCH(WAIT_3D_IDLE_bit | WAIT_3D_IDLECLEAN_bit);
+
+    END_BATCH();
+    COMMIT_BATCH();
+}
+
+void r700Start3D(context_t *context) /* Emit the start-of-3D packets (START_3D_CMDBUF only on pre-RV770 families, then CONTEXT_CONTROL) and wait for idle-clean. */
+{
+    BATCH_LOCALS(&context->radeon);
+    radeon_print(RADEON_RENDER | RADEON_STATE, RADEON_TRACE, "%s\n", __func__);
+    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770) /* only families ordered before RV770 get START_3D_CMDBUF */
+    {
+        BEGIN_BATCH_NO_AUTOSTATE(2);
+        R600_OUT_BATCH(CP_PACKET3(R600_IT_START_3D_CMDBUF, 0));
+        R600_OUT_BATCH(0);
+        END_BATCH();
+    }
+
+    BEGIN_BATCH_NO_AUTOSTATE(3);
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_CONTEXT_CONTROL, 1));
+    R600_OUT_BATCH(0x80000000); /* NOTE(review): meaning of the two 0x80000000 payload words is not documented here — confirm against the packet spec */
+    R600_OUT_BATCH(0x80000000);
+    END_BATCH();
+
+    COMMIT_BATCH();
+
+    r700WaitForIdleClean(context);
+}
+
+GLboolean r700SyncSurf(context_t *context, /* Emit a SURFACE_SYNC packet plus relocation for pbo; returns GL_FALSE when pbo is NULL, GL_TRUE otherwise. */
+		       struct radeon_bo *pbo,
+		       uint32_t read_domain,
+		       uint32_t write_domain,
+		       uint32_t sync_type)
+{
+    BATCH_LOCALS(&context->radeon);
+    radeon_print(RADEON_RENDER | RADEON_STATE, RADEON_TRACE, "%s\n", __func__);
+    uint32_t cp_coher_size;
+
+    if (!pbo)
+	    return GL_FALSE;
+
+    if (pbo->size == 0xffffffff) /* an all-ones size is passed through unchanged */
+	    cp_coher_size = 0xffffffff;
+    else
+	    cp_coher_size = ((pbo->size + 255) >> 8); /* size rounded up to 256-byte units */
+
+    BEGIN_BATCH_NO_AUTOSTATE(5 + 2); /* 5 packet dwords; NOTE(review): the extra 2 are presumably for the relocation — confirm */
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SURFACE_SYNC, 3));
+    R600_OUT_BATCH(sync_type);
+    R600_OUT_BATCH(cp_coher_size);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(10); /* NOTE(review): magic value, meaning not documented here */
+    R600_OUT_BATCH_RELOC(0,
+			 pbo,
+			 0,
+			 read_domain, write_domain, 0);
+    END_BATCH();
+    COMMIT_BATCH();
+
+    return GL_TRUE;
+}
+
+static unsigned int r700PrimitiveType(int prim) /* Map the GL primitive mode in prim (masked with PRIM_MODE_MASK) to the matching DI_PT_* value. */
+{
+    switch (prim & PRIM_MODE_MASK)
+    {
+    case GL_POINTS:
+        return DI_PT_POINTLIST;
+        break; /* the break statements after return are unreachable */
+    case GL_LINES:
+        return DI_PT_LINELIST;
+        break;
+    case GL_LINE_STRIP:
+        return DI_PT_LINESTRIP;
+        break;
+    case GL_LINE_LOOP:
+        return DI_PT_LINELOOP;
+        break;
+    case GL_TRIANGLES:
+        return DI_PT_TRILIST;
+        break;
+    case GL_TRIANGLE_STRIP:
+        return DI_PT_TRISTRIP;
+        break;
+    case GL_TRIANGLE_FAN:
+        return DI_PT_TRIFAN;
+        break;
+    case GL_QUADS:
+        return DI_PT_QUADLIST;
+        break;
+    case GL_QUAD_STRIP:
+        return DI_PT_QUADSTRIP;
+        break;
+    case GL_POLYGON:
+        return DI_PT_POLYGON;
+        break;
+    default:
+        assert(0);
+        return -1; /* NOTE(review): return type is unsigned, so this becomes UINT_MAX; the caller stores it in an int and tests < 0 */
+        break;
+    }
+}
+
+static int r700NumVerts(int num_verts, int prim) /* Trim num_verts to the count that forms complete primitives of the given mode; returns -1 (after assert) for an unknown mode. */
+{
+	int verts_off = 0; /* number of trailing vertices to drop */
+
+	switch (prim & PRIM_MODE_MASK) {
+	case GL_POINTS:
+		verts_off = 0;
+		break;
+	case GL_LINES:
+		verts_off = num_verts % 2; /* drop an unpaired vertex */
+		break;
+	case GL_LINE_STRIP:
+		if (num_verts < 2)
+			verts_off = num_verts; /* too few vertices: result is 0 */
+		break;
+	case GL_LINE_LOOP:
+		if (num_verts < 2)
+			verts_off = num_verts;
+		break;
+	case GL_TRIANGLES:
+		verts_off = num_verts % 3;
+		break;
+	case GL_TRIANGLE_STRIP:
+		if (num_verts < 3)
+			verts_off = num_verts;
+		break;
+	case GL_TRIANGLE_FAN:
+		if (num_verts < 3)
+			verts_off = num_verts;
+		break;
+	case GL_QUADS:
+		verts_off = num_verts % 4;
+		break;
+	case GL_QUAD_STRIP:
+		if (num_verts < 4)
+			verts_off = num_verts;
+		else
+			verts_off = num_verts % 2; /* a strip needs an even vertex count */
+		break;
+	case GL_POLYGON:
+		if (num_verts < 3)
+			verts_off = num_verts;
+		break;
+	default:
+		assert(0);
+		return -1;
+		break;
+	}
+
+	return num_verts - verts_off;
+}
+
+static void r700RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim) /* Emit one immediate-index draw for vertices [start, end) of the given primitive mode. */
+{
+	context_t *context = R700_CONTEXT(ctx);
+	BATCH_LOCALS(&context->radeon);
+	int type, i, total_emit;
+	int num_indices;
+	uint32_t vgt_draw_initiator = 0;
+	uint32_t vgt_index_type     = 0;
+	uint32_t vgt_primitive_type = 0;
+	uint32_t vgt_num_indices    = 0;
+	TNLcontext *tnl = TNL_CONTEXT(ctx);
+	struct vertex_buffer *vb = &tnl->vb;
+
+	type = r700PrimitiveType(prim);
+	num_indices = r700NumVerts(end - start, prim); /* vertex count trimmed to whole primitives */
+
+	radeon_print(RADEON_RENDER, RADEON_TRACE,
+		"%s type %x num_indices %d\n",
+		__func__, type, num_indices);
+
+	if (type < 0 || num_indices <= 0) /* unknown mode or nothing left to draw */
+		return;
+
+        total_emit =   3 /* VGT_PRIMITIVE_TYPE */
+		     + 2 /* VGT_INDEX_TYPE */
+		     + 2 /* NUM_INSTANCES */
+                     + num_indices + 3; /* DRAW_INDEX_IMMD */
+
+        BEGIN_BATCH_NO_AUTOSTATE(total_emit);
+	// prim
+        SETfield(vgt_primitive_type, type,
+		 VGT_PRIMITIVE_TYPE__PRIM_TYPE_shift, VGT_PRIMITIVE_TYPE__PRIM_TYPE_mask);
+        R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CONFIG_REG, 1));
+        R600_OUT_BATCH(mmVGT_PRIMITIVE_TYPE - ASIC_CONFIG_BASE_INDEX);
+        R600_OUT_BATCH(vgt_primitive_type);
+
+	// index type
+        SETfield(vgt_index_type, DI_INDEX_SIZE_32_BIT, INDEX_TYPE_shift, INDEX_TYPE_mask);
+        R600_OUT_BATCH(CP_PACKET3(R600_IT_INDEX_TYPE, 0));
+        R600_OUT_BATCH(vgt_index_type);
+
+	// num instances
+	R600_OUT_BATCH(CP_PACKET3(R600_IT_NUM_INSTANCES, 0));
+        R600_OUT_BATCH(1);
+
+	// draw packet
+        vgt_num_indices = num_indices;
+        SETfield(vgt_draw_initiator, DI_SRC_SEL_IMMEDIATE, SOURCE_SELECT_shift, SOURCE_SELECT_mask);
+	SETfield(vgt_draw_initiator, DI_MAJOR_MODE_0, MAJOR_MODE_shift, MAJOR_MODE_mask);
+
+        R600_OUT_BATCH(CP_PACKET3(R600_IT_DRAW_INDEX_IMMD, (num_indices + 1)));
+        R600_OUT_BATCH(vgt_num_indices);
+        R600_OUT_BATCH(vgt_draw_initiator);
+
+        for (i = start; i < (start + num_indices); i++) { /* one index dword per vertex */
+		if(vb->Elts)
+			R600_OUT_BATCH(vb->Elts[i]); /* indexed draw: take the index from the element array */
+		else
+			R600_OUT_BATCH(i); /* non-indexed draw: the vertex number is the index */
+        }
+        END_BATCH();
+        COMMIT_BATCH();
+
+}
+
+/* dwords reserved ahead of state emit: start 3d, idle, cb/db flush (parenthesized so it is safe inside larger expressions) */
+#define PRE_EMIT_STATE_BUFSZ (10 + 5 + 14)
+
+static GLuint r700PredictRenderSize(GLcontext* ctx) /* Estimate the dwords the coming draw will emit, make sure the command buffer has room, and return the estimate. */
+{
+    context_t *context = R700_CONTEXT(ctx);
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+    struct r700_vertex_program *vp = context->selected_vp; /* dereferenced below without a NULL check */
+    struct vertex_buffer *vb = &tnl->vb;
+    GLboolean flushed;
+    GLuint dwords, i;
+    GLuint state_size;
+    /* pre calculate aos count so state prediction works */
+    context->radeon.tcl.aos_count = _mesa_bitcount(vp->mesa_program->Base.InputsRead);
+
+    dwords = PRE_EMIT_STATE_BUFSZ;
+    for (i = 0; i < vb->PrimitiveCount; i++)
+        dwords += vb->Primitive[i].count + 10; /* 10 = fixed per-primitive overhead (3 + 2 + 2 + 3) in r700RunRenderPrimitive */
+    state_size = radeonCountStateEmitSize(&context->radeon);
+    flushed = rcommonEnsureCmdBufSpace(&context->radeon,
+            dwords + state_size, __FUNCTION__);
+
+    if (flushed) /* state size is counted again after a flush instead of reusing the earlier value */
+        dwords += radeonCountStateEmitSize(&context->radeon);
+    else
+        dwords += state_size;
+
+    radeon_print(RADEON_RENDER, RADEON_VERBOSE,
+	"%s: total prediction size is %d.\n", __FUNCTION__, dwords);
+    return dwords;
+}
+
+static GLboolean r700RunRender(GLcontext * ctx, /* Set up programs and state, emit every primitive in the TNL vertex buffer, then flush/sync the render targets; always returns GL_FALSE. */
+			       struct tnl_pipeline_stage *stage)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    radeonContextPtr radeon = &context->radeon;
+    unsigned int i, id = 0; /* id stays 0; it only feeds the (1 << (id + 6)) sync bit below */
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+    struct vertex_buffer *vb = &tnl->vb;
+    struct radeon_renderbuffer *rrb;
+
+    radeon_print(RADEON_RENDER, RADEON_NORMAL, "%s: cs begin at %d\n",
+                __func__, context->radeon.cmdbuf.cs->cdw);
+
+    /* always emit CB base to prevent
+     * lock ups on some chips.
+     */
+    R600_STATECHANGE(context, cb_target);
+    /* mark vtx as dirty since it changes per-draw */
+    R600_STATECHANGE(context, vtx);
+
+    r700SetScissor(context);
+    r700SetupVertexProgram(ctx);
+    r700SetupFragmentProgram(ctx); /* NOTE(review): its GLboolean result (GL_FALSE on too many constants) is ignored */
+    r600UpdateTextureState(ctx);
+
+    GLuint emit_end = r700PredictRenderSize(ctx) 
+        + context->radeon.cmdbuf.cs->cdw; /* predicted command-stream position after this draw */
+    r700SetupStreams(ctx);
+
+    radeonEmitState(radeon);
+
+    radeon_debug_add_indent();
+    /* richard test code */
+    for (i = 0; i < vb->PrimitiveCount; i++) {
+        GLuint prim = _tnl_translate_prim(&vb->Primitive[i]);
+        GLuint start = vb->Primitive[i].start;
+        GLuint end = vb->Primitive[i].start + vb->Primitive[i].count;
+        r700RunRenderPrimitive(ctx, start, end, prim);
+    }
+    radeon_debug_remove_indent();
+
+    /* Flush render op cached for last several quads. */
+    r700WaitForIdleClean(context);
+
+    rrb = radeon_get_colorbuffer(&context->radeon);
+    if (rrb && rrb->bo)
+	    r700SyncSurf(context, rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM,
+			 CB_ACTION_ENA_bit | (1 << (id + 6)));
+
+    rrb = radeon_get_depthbuffer(&context->radeon);
+    if (rrb && rrb->bo)
+	    r700SyncSurf(context, rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM,
+			 DB_ACTION_ENA_bit | DB_DEST_BASE_ENA_bit);
+
+    radeonReleaseArrays(ctx, ~0);
+
+    radeon_print(RADEON_RENDER, RADEON_TRACE, "%s: cs end at %d\n",
+                __func__, context->radeon.cmdbuf.cs->cdw);
+
+    if ( emit_end < context->radeon.cmdbuf.cs->cdw ) /* emitted more than predicted */
+       WARN_ONCE("Rendering was %d commands larger than predicted size."
+	       " We might overflow  command buffer.\n", context->radeon.cmdbuf.cs->cdw - emit_end);
+
+    return GL_FALSE;
+}
+
+static GLboolean r700RunNonTCLRender(GLcontext * ctx,
+				     struct tnl_pipeline_stage *stage) /* Stage callback that does no work and always returns GL_TRUE. */
+{
+	GLboolean bRet = GL_TRUE;
+	
+	return bRet;
+}
+
+static GLboolean r700RunTCLRender(GLcontext * ctx,  /* Stage callback: update shaders, validate buffers, then render in hardware. */
+				  struct tnl_pipeline_stage *stage)
+{
+	GLboolean bRet = GL_FALSE;
+
+    /* TODO : sw fallback */
+
+    /* Need shader bo's setup before bo check */
+    r700UpdateShaders(ctx);
+    /**
+
+    * Ensure all enabled and complete textures are uploaded, along with any buffers being used.
+    */
+    if(!r600ValidateBuffers(ctx))
+    {
+        return GL_TRUE; /* validation failed: nothing is rendered here */
+    }
+
+    bRet = r700RunRender(ctx, stage);
+
+    return bRet;
+	// GL_FALSE stops _tnl_run_pipeline from running the remaining pipeline stages.
+    // The render here DOES finish the whole pipe, so GL_FALSE is returned on success.
+}
+
+const struct tnl_pipeline_stage _r700_render_stage = {
+	"r700 Hardware Rasterization", /* stage name */
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	r700RunNonTCLRender /* NOTE(review): presumably the stage's run callback — confirm against struct tnl_pipeline_stage */
+};
+
+const struct tnl_pipeline_stage _r700_tcl_stage = {
+	"r700 Hardware Transform, Clipping and Lighting", /* stage name */
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	r700RunTCLRender /* NOTE(review): presumably the stage's run callback — confirm against struct tnl_pipeline_stage */
+};
+
+const struct tnl_pipeline_stage *r700_pipeline[] = 
+{
+    &_r700_tcl_stage, /* listed first; r700RunTCLRender returns GL_FALSE after a hardware render */
+    &_tnl_vertex_transform_stage,
+	&_tnl_normal_transform_stage,
+	&_tnl_lighting_stage,
+	&_tnl_fog_coordinate_stage,
+	&_tnl_texgen_stage,
+	&_tnl_texture_transform_stage,
+	&_tnl_vertex_program_stage,
+
+    &_r700_render_stage,
+    &_tnl_render_stage,
+    0, /* NOTE(review): presumably the list terminator — confirm against the pipeline installer */
+};
+
+
diff --git a/src/mesa/drivers/dri/r600/r700_shader.c b/src/mesa/drivers/dri/r600/r700_shader.c
new file mode 100644
index 0000000000..b4fd51c137
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_shader.c
@@ -0,0 +1,527 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "main/imports.h"
+
+#include "main/glheader.h"
+
+#include "r600_context.h"
+#include "r700_debug.h"
+
+#include "r700_shader.h"
+
+void r700ShaderInit(GLcontext * ctx)
+{   /* Currently a no-op: the body is empty and ctx is not used. */
+}
+
+void AddInstToList(TypedShaderList * plstCFInstructions, R700ShaderInstruction * pInst)
+{
+	/* Append pInst at the end of the singly linked list and count it.
+	 * pInst becomes the new tail, so its own next pointer is cleared. */
+	R700ShaderInstruction * pOldTail = plstCFInstructions->pTail;
+
+	if (pOldTail != NULL)
+		pOldTail->pNextInst = pInst;		/* chain behind the old tail */
+	else
+		plstCFInstructions->pHead = pInst;	/* first node of an empty list */
+
+	plstCFInstructions->pTail = pInst;
+	pInst->pNextInst = NULL;
+
+	plstCFInstructions->uNumOfNode++;
+}
+
+/* Put one instruction list into the empty state: no head, no tail, no nodes. */
+static void ResetShaderList(TypedShaderList * pList)
+{
+	pList->pHead      = NULL;
+	pList->pTail      = NULL;
+	pList->uNumOfNode = 0;
+}
+
+/* Default-initialise a shader: invalid type, no binary, zero counters, flags
+ * off (except bLinksDirty), empty lists.  Members not assigned here
+ * (uStartAddr, uEndOf*, the RealRegister entries, ConstantArray) stay as is. */
+void Init_R700_Shader(R700_Shader * pShader)
+{
+	pShader->Type     = R700_SHADER_INVALID;
+	pShader->pProgram = NULL;
+	/* State flags: only bLinksDirty starts out set. */
+	pShader->bLinksDirty          = GL_TRUE;
+	pShader->bBinaryShader        = GL_FALSE;
+	pShader->bFetchShaderRequired = GL_FALSE;
+	pShader->bNeedsAssembly       = GL_FALSE;
+	pShader->bSurfAllocated       = GL_FALSE;
+	/* Sizes, counts and offsets. */
+	pShader->uShaderBinaryDWORDSize = 0;
+	pShader->nRegs                  = 0;
+	pShader->nParamExports          = 0;
+	pShader->nMemExports            = 0;
+	pShader->resource               = 0;
+	pShader->exportMode             = 0;
+	pShader->uCFOffset              = 0;
+	pShader->uStackSize             = 0;
+	pShader->uMaxCallDepth          = 0;
+	/* Import, export and kill flags. */
+	pShader->depthIsImported                  = GL_FALSE;
+	pShader->positionVectorIsExported         = GL_FALSE;
+	pShader->miscVectorIsExported             = GL_FALSE;
+	pShader->renderTargetArrayIndexIsExported = GL_FALSE;
+	pShader->ccDist0VectorIsExported          = GL_FALSE;
+	pShader->ccDist1VectorIsExported          = GL_FALSE;
+	pShader->depthIsExported                  = GL_FALSE;
+	pShader->stencilRefIsExported             = GL_FALSE;
+	pShader->coverageToMaskIsExported         = GL_FALSE;
+	pShader->maskIsExported                   = GL_FALSE;
+	pShader->killIsUsed                       = GL_FALSE;
+	/* The four instruction lists. */
+	ResetShaderList(&pShader->lstCFInstructions);
+	ResetShaderList(&pShader->lstALUInstructions);
+	ResetShaderList(&pShader->lstTEXInstructions);
+	ResetShaderList(&pShader->lstVTXInstructions);
+}
+
+void AddCFInstruction(R700_Shader *pShader, R700ControlFlowInstruction *pCFInst)
+{
+    /*
+     * Append a control-flow instruction to the shader's CF list.  The
+     * instruction is given its list position as index, the binary size grows
+     * by the instruction's size, qualifying export clauses bump the export
+     * counters, and the shader is marked for link resolution and assembly.
+     */
+    pCFInst->m_uIndex = pShader->lstCFInstructions.uNumOfNode;
+    AddInstToList(&pShader->lstCFInstructions,
+                  (R700ShaderInstruction *)pCFInst);
+    pShader->uShaderBinaryDWORDSize += GetInstructionSize(pCFInst->m_ShaderInstType);
+
+    if (pCFInst->m_ShaderInstType == SIT_CF_ALL_EXP_SX)
+    {
+        R700ControlFlowSXClause *pSX = (R700ControlFlowSXClause *)pCFInst;
+
+        /* SX export of type SQ_EXPORT_PARAM: counted as burst_count + 1. */
+        if (pSX->m_Word0.f.type == SQ_EXPORT_PARAM)
+            pShader->nParamExports += pSX->m_Word1.f.burst_count + 1;
+    }
+    else if (pCFInst->m_ShaderInstType == SIT_CF_ALL_EXP_SMX)
+    {
+        R700ControlFlowSMXClause *pSMX = (R700ControlFlowSMXClause *)pCFInst;
+
+        /* SMX clause: only SQ_CF_INST_MEM_RING with one of the two write
+         * export types adds (burst_count + 1) to the memory export count. */
+        if (pSMX->m_Word1.f.cf_inst == SQ_CF_INST_MEM_RING &&
+            (pSMX->m_Word0.f.type == SQ_EXPORT_WRITE ||
+             pSMX->m_Word0.f.type == SQ_EXPORT_WRITE_IND))
+            pShader->nMemExports += pSMX->m_Word1.f.burst_count + 1;
+    }
+
+    /* The list changed, so resolved links and the binary are out of date. */
+    pShader->bLinksDirty    = GL_TRUE;
+    pShader->bNeedsAssembly = GL_TRUE;
+
+    pCFInst->useCount++;
+}
+
+void AddVTXInstruction(R700_Shader *pShader, R700VertexInstruction *pVTXInst)
+{
+    /* Queue a vertex fetch on the VTX list and flag the shader as stale. */
+    pVTXInst->m_uIndex = pShader->lstVTXInstructions.uNumOfNode;
+    AddInstToList(&pShader->lstVTXInstructions, (R700ShaderInstruction *)pVTXInst);
+    pShader->uShaderBinaryDWORDSize += GetInstructionSize(pVTXInst->m_ShaderInstType);
+
+    if (pVTXInst->m_ShaderInstType == SIT_VTX_GENERIC)
+    {   /* generic fetch: nRegs keeps the highest destination GPR seen so far */
+        R700VertexGenericFetch *pFetch = (R700VertexGenericFetch *)pVTXInst;
+        if (pShader->nRegs < pFetch->m_Word1_GPR.f.dst_gpr)
+            pShader->nRegs = pFetch->m_Word1_GPR.f.dst_gpr;
+    }
+
+    pShader->bLinksDirty    = GL_TRUE;
+    pShader->bNeedsAssembly = GL_TRUE;
+    pVTXInst->useCount++;
+}
+
+void AddTEXInstruction(R700_Shader *pShader, R700TextureInstruction *pTEXInst)
+{
+    /* Queue a texture instruction, keep the highest destination GPR in nRegs, flag the shader as stale. */
+    pTEXInst->m_uIndex = pShader->lstTEXInstructions.uNumOfNode;
+    AddInstToList(&pShader->lstTEXInstructions, (R700ShaderInstruction *)pTEXInst);
+    pShader->uShaderBinaryDWORDSize += GetInstructionSize(pTEXInst->m_ShaderInstType);
+
+    if (pShader->nRegs < pTEXInst->m_Word1.f.dst_gpr)
+        pShader->nRegs = pTEXInst->m_Word1.f.dst_gpr;
+
+    pShader->bLinksDirty    = GL_TRUE;
+    pShader->bNeedsAssembly = GL_TRUE;
+    pTEXInst->useCount++;
+}
+
+void AddALUInstruction(R700_Shader *pShader, R700ALUInstruction *pALUInst)
+{
+    /* Queue an ALU instruction, keep the highest destination GPR in nRegs, flag the shader as stale. */
+    pALUInst->m_uIndex = pShader->lstALUInstructions.uNumOfNode;
+    AddInstToList(&pShader->lstALUInstructions, (R700ShaderInstruction *)pALUInst);
+    pShader->uShaderBinaryDWORDSize += GetInstructionSize(pALUInst->m_ShaderInstType);
+
+    if (pShader->nRegs < pALUInst->m_Word1.f.dst_gpr)
+        pShader->nRegs = pALUInst->m_Word1.f.dst_gpr;
+
+    pShader->bLinksDirty    = GL_TRUE;
+    pShader->bNeedsAssembly = GL_TRUE;
+    pALUInst->useCount++;
+}
+
+void ResolveLinks(R700_Shader *pShader)
+{
+    /*
+     * Lay the instruction groups out in the order CF, ALU, TEX, VTX (all
+     * offsets in DWORDs, CF starting at 0) and, for every instruction that is
+     * linked to a CF clause, store its start offset divided by two in that
+     * clause's addr field.  bLinksDirty is cleared at the end.
+     *
+     * NOTE(review): the padding instruction allocated below is used without
+     * checking the CALLOC_STRUCT result.
+     */
+    R700ShaderInstruction  *pNode;
+    R700ALUInstruction     *pPadding;
+    GLuint uRunning;
+    GLuint vtxOffset;
+
+    /* ALU instructions follow the CF block, which begins at offset 0. */
+    const GLuint aluOffset = 0x0 + pShader->lstCFInstructions.uNumOfNode * GetInstructionSize(SIT_CF);
+
+    /* TEX follows ALU.  ALU sizes differ per instruction, so add them up. */
+    GLuint texOffset = aluOffset;
+
+    for (pNode = pShader->lstALUInstructions.pHead; pNode != NULL; pNode = pNode->pNextInst)
+    {
+        texOffset += GetInstructionSize(pNode->m_ShaderInstType);
+    }
+
+    /* VTX follows TEX. */
+    vtxOffset = texOffset + pShader->lstTEXInstructions.uNumOfNode * GetInstructionSize(SIT_TEX);
+
+    /* A non-empty TEX or VTX group is kept on a multiple of four DWORDs: if
+     * either would start elsewhere, one blank ALU instruction is appended as
+     * padding and both groups move back by its size. */
+    if ( ((pShader->lstTEXInstructions.uNumOfNode > 0) && (texOffset % 4 != 0)) ||
+         ((pShader->lstVTXInstructions.uNumOfNode > 0) && (vtxOffset % 4 != 0)) )
+    {
+        pPadding = (R700ALUInstruction*) CALLOC_STRUCT(R700ALUInstruction);
+        Init_R700ALUInstruction(pPadding);
+        AddALUInstruction(pShader, pPadding);
+        texOffset += GetInstructionSize(SIT_ALU);
+        vtxOffset += GetInstructionSize(SIT_ALU);
+    }
+
+    /* ALU group: uRunning is the DWORD offset of the current instruction. */
+    uRunning = aluOffset;
+    for (pNode = pShader->lstALUInstructions.pHead; pNode != NULL; pNode = pNode->pNextInst)
+    {
+        R700ALUInstruction *pALU = (R700ALUInstruction*)pNode;
+
+        if (pALU->m_pLinkedALUClause != NULL)
+        {
+            pALU->m_pLinkedALUClause->m_Word0.f.addr = uRunning >> 1;
+        }
+
+        uRunning += GetInstructionSize(pALU->m_ShaderInstType);
+    }
+
+    /* TEX group: same scheme, counted from the start of the TEX group. */
+    uRunning = texOffset;
+    for (pNode = pShader->lstTEXInstructions.pHead; pNode != NULL; pNode = pNode->pNextInst)
+    {
+        R700TextureInstruction *pTEX = (R700TextureInstruction*)pNode;
+
+        if (pTEX->m_pLinkedGenericClause != NULL)
+        {
+            pTEX->m_pLinkedGenericClause->m_Word0.f.addr = uRunning >> 1;
+        }
+
+        uRunning += GetInstructionSize(pTEX->m_ShaderInstType);
+    }
+
+    /* VTX group: same scheme, counted from the start of the VTX group. */
+    uRunning = vtxOffset;
+    for (pNode = pShader->lstVTXInstructions.pHead; pNode != NULL; pNode = pNode->pNextInst)
+    {
+        R700VertexInstruction *pVTX = (R700VertexInstruction*)pNode;
+
+        if (pVTX->m_pLinkedGenericClause != NULL)
+        {
+            pVTX->m_pLinkedGenericClause->m_Word0.f.addr = uRunning >> 1;
+        }
+
+        uRunning += GetInstructionSize(pVTX->m_ShaderInstType);
+    }
+
+    pShader->bLinksDirty = GL_FALSE;
+}
+
+/* Read the 32 bits stored at pValue as a GLuint without casting the pointer. */
+static GLuint LiteralBits(const void *pValue)
+{
+    GLuint uBits;
+    memcpy(&uBits, pValue, sizeof(uBits));
+    return uBits;
+}
+
+/*
+ * Build the shader's binary image.  Binary shaders are skipped.  Otherwise
+ * dirty links are resolved, a zeroed buffer of uShaderBinaryDWORDSize DWORDs
+ * is filled from the CF, ALU, TEX and VTX lists (in that order) and installed
+ * as pProgram, the uEndOf* values are updated and bNeedsAssembly is cleared.
+ * If the buffer cannot be allocated, the shader is left as it was.
+ */
+void Assemble(R700_Shader *pShader)
+{
+    GLuint *pShaderBinary;
+    GLuint size_of_program;
+    GLuint *pCurrPos;
+    GLuint end_of_cf_instructions;
+    GLuint number_of_alu_dwords;
+    R700ShaderInstruction  *pInst;
+
+    if(GL_TRUE == pShader->bBinaryShader)
+    {
+        return;
+    }
+
+    if(pShader->bLinksDirty == GL_TRUE)
+    {
+        ResolveLinks(pShader);
+    }
+
+    /* Read after ResolveLinks(), which can append a padding instruction. */
+    size_of_program = pShader->uShaderBinaryDWORDSize;
+
+    pShaderBinary = (GLuint*) MALLOC(sizeof(GLuint)*size_of_program);
+    if(NULL != pShaderBinary)
+    {
+        memset(pShaderBinary, 0, sizeof(GLuint)*size_of_program);
+    }
+    else if(0 != size_of_program)
+    {
+        return; /* out of memory: there is no buffer to assemble into */
+    }
+    pCurrPos = pShaderBinary;
+
+    /* Control-flow instructions: two DWORDs each. */
+    pInst = pShader->lstCFInstructions.pHead;
+    while(NULL != pInst)
+    {
+        switch (pInst->m_ShaderInstType)
+        {
+        case SIT_CF_GENERIC:
+            {
+                R700ControlFlowGenericClause* pCFgeneric = (R700ControlFlowGenericClause*)pInst;
+                *pCurrPos++ = pCFgeneric->m_Word0.val;
+                *pCurrPos++ = pCFgeneric->m_Word1.val;
+            }
+            break;
+        case SIT_CF_ALU:
+            {
+                R700ControlFlowALUClause* pCFalu = (R700ControlFlowALUClause*)pInst;
+                *pCurrPos++ = pCFalu->m_Word0.val;
+                *pCurrPos++ = pCFalu->m_Word1.val;
+            }
+            break;
+        case SIT_CF_ALL_EXP_SX:
+            {
+                R700ControlFlowSXClause* pCFsx = (R700ControlFlowSXClause*)pInst;
+                *pCurrPos++ = pCFsx->m_Word0.val;
+                *pCurrPos++ = (pCFsx->m_Word1.val | pCFsx->m_Word1_SWIZ.val);
+            }
+            break;
+        case SIT_CF_ALL_EXP_SMX:
+            {
+                R700ControlFlowSMXClause* pCFsmx = (R700ControlFlowSMXClause*)pInst;
+                *pCurrPos++ = pCFsmx->m_Word0.val;
+                *pCurrPos++ = (pCFsmx->m_Word1.val | pCFsmx->m_Word1_BUF.val);
+            }
+            break;
+        default:
+            break;
+        }
+        pInst = pInst->pNextInst;
+    }
+
+    /* ALU instructions: two DWORDs, plus two or four literal DWORDs for the literal forms. */
+    number_of_alu_dwords = 0;
+    pInst = pShader->lstALUInstructions.pHead;
+    while(NULL != pInst)
+    {
+        switch (pInst->m_ShaderInstType)
+        {
+        case SIT_ALU:
+            {
+                R700ALUInstruction* pALU = (R700ALUInstruction*)pInst;
+                *pCurrPos++ = pALU->m_Word0.val;
+                *pCurrPos++ = (pALU->m_Word1.val | pALU->m_Word1_OP2.val | pALU->m_Word1_OP3.val);
+                number_of_alu_dwords += 2;
+            }
+            break;
+        case SIT_ALU_HALF_LIT:
+            {
+                R700ALUInstructionHalfLiteral* pALUhalf = (R700ALUInstructionHalfLiteral*)pInst;
+                *pCurrPos++ = pALUhalf->m_Word0.val;
+                *pCurrPos++ = (pALUhalf->m_Word1.val | pALUhalf->m_Word1_OP2.val | pALUhalf->m_Word1_OP3.val);
+                *pCurrPos++ = LiteralBits(&(pALUhalf->m_fLiteralX));
+                *pCurrPos++ = LiteralBits(&(pALUhalf->m_fLiteralY));
+                number_of_alu_dwords += 4;
+            }
+            break;
+        case SIT_ALU_FALL_LIT:
+            {
+                R700ALUInstructionFullLiteral* pALUfull = (R700ALUInstructionFullLiteral*)pInst;
+                *pCurrPos++ = pALUfull->m_Word0.val;
+                *pCurrPos++ = (pALUfull->m_Word1.val | pALUfull->m_Word1_OP2.val | pALUfull->m_Word1_OP3.val);
+                *pCurrPos++ = LiteralBits(&(pALUfull->m_fLiteralX));
+                *pCurrPos++ = LiteralBits(&(pALUfull->m_fLiteralY));
+                *pCurrPos++ = LiteralBits(&(pALUfull->m_fLiteralZ));
+                *pCurrPos++ = LiteralBits(&(pALUfull->m_fLiteralW));
+                number_of_alu_dwords += 6;
+            }
+            break;
+        default:
+            break;
+        }
+        pInst = pInst->pNextInst;
+    }
+
+    /* Texture instructions: three DWORDs followed by one filler DWORD. */
+    pInst = pShader->lstTEXInstructions.pHead;
+    while(NULL != pInst)
+    {
+        R700TextureInstruction* pTEX = (R700TextureInstruction*)pInst;
+        *pCurrPos++ = pTEX->m_Word0.val;
+        *pCurrPos++ = pTEX->m_Word1.val;
+        *pCurrPos++ = pTEX->m_Word2.val;
+        *pCurrPos++ = 0x0beadeaf;
+        pInst = pInst->pNextInst;
+    }
+
+    /* Vertex fetches: three DWORDs followed by one filler DWORD. */
+    pInst = pShader->lstVTXInstructions.pHead;
+    while(NULL != pInst)
+    {
+        switch (pInst->m_ShaderInstType)
+        {
+        case SIT_VTX_SEM: //
+            {
+                R700VertexSemanticFetch* pVTXsem = (R700VertexSemanticFetch*)pInst;
+                *pCurrPos++ = pVTXsem->m_Word0.val;
+                *pCurrPos++ = (pVTXsem->m_Word1.val | pVTXsem->m_Word1_SEM.val);
+                *pCurrPos++ = pVTXsem->m_Word2.val;
+                *pCurrPos++ = 0x0beadeaf;
+            }
+            break;
+        case SIT_VTX_GENERIC: //
+            {
+                R700VertexGenericFetch* pVTXgeneric = (R700VertexGenericFetch*)pInst;
+                *pCurrPos++ = pVTXgeneric->m_Word0.val;
+                *pCurrPos++ = (pVTXgeneric->m_Word1.val | pVTXgeneric->m_Word1_GPR.val);
+                *pCurrPos++ = pVTXgeneric->m_Word2.val;
+                *pCurrPos++ = 0x0beadeaf;
+            }
+            break;
+        default:
+            break;
+        }
+        pInst = pInst->pNextInst;
+    }
+
+    if(NULL != pShader->pProgram)
+    {
+        FREE(pShader->pProgram);
+    }
+    pShader->pProgram = (GLubyte*)pShaderBinary;
+    /* The end markers are stored in units of two DWORDs (hence the >> 1). */
+    end_of_cf_instructions = pShader->uCFOffset + pShader->lstCFInstructions.uNumOfNode * GetInstructionSize(SIT_CF);
+    pShader->uEndOfCF = end_of_cf_instructions >> 1;
+    pShader->uEndOfALU = (end_of_cf_instructions + number_of_alu_dwords) >> 1;
+    pShader->uEndOfFetch = (pShader->uCFOffset + pShader->uShaderBinaryDWORDSize) >> 1;
+
+    pShader->bNeedsAssembly = GL_FALSE;
+}
+
+void LoadProgram(R700_Shader *pShader) //context
+{   /* Stub: the body is empty and pShader is not used. */
+}
+
+void UpdateShaderRegisters(R700_Shader *pShader) //context
+{   /* Stub: the body is empty and pShader is not used. */
+}
+
+void DeleteInstructions(R700_Shader *pShader)
+{   /* Stub: nothing is deleted here; Clean_Up_Shader() is what frees the instruction lists. */
+}
+
+void DebugPrint(void)
+{   /* Stub: prints nothing. */
+}
+
+/* Free every node of one instruction list and leave the list empty. */
+static void FreeInstList(TypedShaderList *pList)
+{
+    R700ShaderInstruction *pInst = pList->pHead;
+
+    while(NULL != pInst)
+    {
+        R700ShaderInstruction *pInstToFree = pInst;
+
+        pInst = pInst->pNextInst;    /* fetch the link before the node goes away */
+        FREE(pInstToFree);
+    }
+
+    pList->pHead      = NULL;
+    pList->pTail      = NULL;
+    pList->uNumOfNode = 0;
+}
+
+/*
+ * Release everything the shader owns: the assembled binary and all nodes of
+ * the four instruction lists.  The freed pointers are cleared so that a
+ * second clean-up, or a later Assemble() (which frees a non-NULL pProgram),
+ * cannot touch freed memory.  Size and register counters are left as they
+ * are; run Init_R700_Shader() before reusing the shader.
+ */
+void Clean_Up_Shader(R700_Shader *pShader)
+{
+    FREE(pShader->pProgram);
+    pShader->pProgram = NULL;
+
+    FreeInstList(&pShader->lstCFInstructions);
+    FreeInstList(&pShader->lstALUInstructions);
+    FreeInstList(&pShader->lstTEXInstructions);
+    FreeInstList(&pShader->lstVTXInstructions);
+}
+
+
diff --git a/src/mesa/drivers/dri/r600/r700_shader.h b/src/mesa/drivers/dri/r600/r700_shader.h
new file mode 100644
index 0000000000..bfd01e1a93
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_shader.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+
+#ifndef __R700_SHADER_H__
+#define __R700_SHADER_H__
+
+#include "main/mtypes.h"
+
+#include "r700_shaderinst.h"
+
+
+void r700ShaderInit(GLcontext * ctx);
+
+typedef enum R700ShaderType   /* kind of shader held by an R700_Shader (its Type member) */
+{
+    R700_SHADER_FS      = 0x0,   /* presumably fetch shader -- TODO confirm */
+    R700_SHADER_ES      = 0x1,   /* presumably export shader -- TODO confirm */
+    R700_SHADER_GS      = 0x2,   /* presumably geometry shader -- TODO confirm */
+    R700_SHADER_VS      = 0x3,   /* presumably vertex shader -- TODO confirm */
+    R700_SHADER_PS      = 0x4,   /* presumably pixel shader -- TODO confirm */
+    R700_SHADER_INVALID = 0x5,   /* the value Init_R700_Shader() gives a fresh shader */
+} R700ShaderType;
+
+typedef struct TypedShaderList /* singly linked list of instructions, filled by AddInstToList() */
+{
+	R700ShaderInstruction * pHead;	/* first node; Init_R700_Shader() starts it at NULL */
+	R700ShaderInstruction * pTail;	/* last node; NULL means the list is empty */
+	GLuint  uNumOfNode;		/* incremented once per AddInstToList() call */
+} TypedShaderList;
+
+typedef struct RealRegister /* NOTE(review): looks like a register address/value pair, going by the member names -- confirm */
+{
+    GLuint uAddr;
+    GLuint uValue;
+} RealRegister;
+
+typedef struct InstDeps /* NOTE(review): purpose not evident from this header; names suggest one destination and up to three source dependencies */
+{
+    GLint nDstDep;
+    GLint nSrcDeps[3];
+} InstDeps;
+
+typedef struct R700_Shader /* one shader: its four instruction lists plus the binary assembled from them */
+{
+	R700ShaderType   Type;
+    /* Binary image installed by Assemble(); Init_R700_Shader() sets it to NULL. */
+    GLubyte*  pProgram;
+
+    GLboolean bBinaryShader;          /* when set, Assemble() returns without doing anything */
+    GLboolean bFetchShaderRequired;
+    GLboolean bNeedsAssembly;         /* set by the Add*Instruction() calls, cleared by Assemble() */
+    GLboolean bLinksDirty;            /* set by the Add*Instruction() calls, cleared by ResolveLinks() */
+
+    GLuint  uShaderBinaryDWORDSize; // in DWORDS
+    GLuint  nRegs;                  /* highest dst_gpr seen by AddVTX/TEX/ALUInstruction() */
+    GLuint  nParamExports;   // VS_ EXPORT_COUNT (1 based, the actual register is 0 based!)
+    GLuint  nMemExports;     /* bumped by AddCFInstruction() for MEM_RING write exports */
+    GLuint  resource;     // VS and PS _RESOURCE
+    GLuint  exportMode;   // VS and PS _EXPORT_MODE
+
+    GLboolean  depthIsImported;             
+
+    // Vertex program exports
+    GLboolean  positionVectorIsExported;          
+
+    GLboolean  miscVectorIsExported;               
+    GLboolean  renderTargetArrayIndexIsExported;  
+
+    GLboolean  ccDist0VectorIsExported;  
+    GLboolean  ccDist1VectorIsExported;  
+
+    // Pixel program exports
+    GLboolean  depthIsExported;             
+    GLboolean  stencilRefIsExported;        
+    GLboolean  coverageToMaskIsExported;    
+    GLboolean  maskIsExported;              
+
+    GLboolean  killIsUsed;                  
+    /* uEndOfCF, uEndOfALU and uEndOfFetch are written by Assemble(), in units of two DWORDs. */
+    GLuint  uStartAddr;      /* not assigned by Init_R700_Shader() */
+    GLuint  uCFOffset;
+    GLuint  uEndOfCF;
+    GLuint  uEndOfALU;
+    GLuint  uEndOfFetch;
+    GLuint  uStackSize;
+    GLuint  uMaxCallDepth;
+    /* Instruction lists; Assemble() emits them in the order CF, ALU, TEX, VTX. */
+	TypedShaderList lstCFInstructions;
+	TypedShaderList lstALUInstructions;
+	TypedShaderList lstTEXInstructions;
+	TypedShaderList lstVTXInstructions;
+
+    RealRegister RegStartAddr;
+    RealRegister RegCFOffset;
+    RealRegister RegEndCF;
+    RealRegister RegEndALU;
+    RealRegister egEndFetcg;     /* NOTE(review): name looks like a typo for RegEndFetch */
+
+	// -------- constants
+	GLfloat   ConstantArray[SQ_ALU_CONSTANT_PS_COUNT * 4];
+	
+	GLboolean bSurfAllocated;
+} R700_Shader;
+
+//Internal helpers (declared here, so they are still visible to other files)
+void AddInstToList(TypedShaderList * plstCFInstructions, R700ShaderInstruction * pInst);
+void ResolveLinks(R700_Shader *pShader);
+void Assemble(R700_Shader *pShader);
+
+
+//Interface: reset a shader, then append instructions to its four lists
+void Init_R700_Shader(R700_Shader * pShader);
+void AddCFInstruction(R700_Shader *pShader, R700ControlFlowInstruction *pCFInst);
+void AddVTXInstruction(R700_Shader *pShader, R700VertexInstruction *pVTXInst);
+void AddTEXInstruction(R700_Shader *pShader, R700TextureInstruction *pTEXInst);
+void AddALUInstruction(R700_Shader *pShader, R700ALUInstruction *pALUInst);
+//The next four are defined with empty bodies in r700_shader.c
+void LoadProgram(R700_Shader *pShader);
+void UpdateShaderRegisters(R700_Shader *pShader);
+void DeleteInstructions(R700_Shader *pShader);
+void DebugPrint(void);
+//Frees the assembled binary and every instruction queued on the lists
+void Clean_Up_Shader(R700_Shader *pShader);
+
+#endif /*__R700_SHADER_H__*/
+
diff --git a/src/mesa/drivers/dri/r600/r700_shaderinst.c b/src/mesa/drivers/dri/r600/r700_shaderinst.c
new file mode 100644
index 0000000000..f120d9f941
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_shaderinst.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+
+#include "main/mtypes.h"
+
+#include "radeon_debug.h"
+#include "r700_shaderinst.h"
+
+void Init_R700ControlFlowGenericClause(R700ControlFlowGenericClause* pInst)
+{
+    /* Blank generic CF clause: typed, unused, nothing linked, words zeroed. */
+    pInst->m_ShaderInstType = SIT_CF_GENERIC;
+    pInst->useCount         = 0;
+
+    pInst->m_pLinkedVTXInstruction = 0;
+    pInst->m_pLinkedTEXInstruction = 0;
+
+    pInst->m_Word0.val = 0;
+    pInst->m_Word1.val = 0;
+}
+
+void Init_R700ControlFlowALUClause(R700ControlFlowALUClause* pInst)
+{
+    /* Blank ALU CF clause: typed, unused, nothing linked, words zeroed. */
+    pInst->m_ShaderInstType = SIT_CF_ALU;
+    pInst->useCount         = 0;
+
+    pInst->m_pLinkedALUInstruction = 0;
+
+    pInst->m_Word0.val = 0;
+    pInst->m_Word1.val = 0;
+}
+
+void Init_R700ControlFlowSXClause(R700ControlFlowSXClause* pInst)
+{
+    /* Blank SX export clause: typed, unused, all three words zeroed. */
+    pInst->m_ShaderInstType = SIT_CF_ALL_EXP_SX;
+    pInst->useCount         = 0;
+
+    pInst->m_Word0.val      = 0;
+    pInst->m_Word1.val      = 0;
+    pInst->m_Word1_SWIZ.val = 0;
+}
+
+void Init_R700ControlFlowSMXClause(R700ControlFlowSMXClause* pInst)
+{
+    /* Blank SMX export clause: typed, unused, all three words zeroed. */
+    pInst->m_ShaderInstType = SIT_CF_ALL_EXP_SMX;
+    pInst->useCount         = 0;
+
+    pInst->m_Word0.val     = 0;
+    pInst->m_Word1.val     = 0;
+    pInst->m_Word1_BUF.val = 0;
+}
+
+void Init_R700ALUInstruction(R700ALUInstruction* pInst)
+{
+    /* Blank ALU instruction: typed, unused, no linked clause, words zeroed. */
+    pInst->m_ShaderInstType = SIT_ALU;
+    pInst->useCount         = 0;
+
+    pInst->m_pLinkedALUClause = 0;
+
+    pInst->m_Word0.val     = 0;
+    pInst->m_Word1.val     = 0;
+    pInst->m_Word1_OP2.val = 0;
+    pInst->m_Word1_OP3.val = 0;
+}
+
+void Init_R700ALUInstructionHalfLiteral(R700ALUInstructionHalfLiteral* pInst, GLfloat x, GLfloat y)
+{
+    /* Blank ALU instruction carrying the two literal values x and y. */
+    pInst->m_ShaderInstType = SIT_ALU_HALF_LIT;
+    pInst->useCount         = 0;
+
+    pInst->m_pLinkedALUClause = 0;
+
+    pInst->m_Word0.val     = 0;
+    pInst->m_Word1.val     = 0;
+    pInst->m_Word1_OP2.val = 0;
+    pInst->m_Word1_OP3.val = 0;
+
+    pInst->m_fLiteralX = x;
+    pInst->m_fLiteralY = y;
+}
+
+void Init_R700ALUInstructionFullLiteral(R700ALUInstructionFullLiteral* pInst, GLfloat x, GLfloat y, GLfloat z, GLfloat w)
+{
+    /* Blank ALU instruction carrying the four literal values x, y, z and w. */
+    pInst->m_ShaderInstType = SIT_ALU_FALL_LIT;
+    pInst->useCount         = 0;
+
+    pInst->m_pLinkedALUClause = 0;
+
+    pInst->m_Word0.val     = 0;
+    pInst->m_Word1.val     = 0;
+    pInst->m_Word1_OP2.val = 0;
+    pInst->m_Word1_OP3.val = 0;
+
+    pInst->m_fLiteralX = x;
+    pInst->m_fLiteralY = y;
+    pInst->m_fLiteralZ = z;
+    pInst->m_fLiteralW = w;
+}
+
+void Init_R700TextureInstruction(R700TextureInstruction* pInst)
+{
+    /* Blank texture instruction: typed, unused, no linked clause, words zeroed. */
+    pInst->m_ShaderInstType = SIT_TEX;
+    pInst->useCount         = 0;
+
+    pInst->m_pLinkedGenericClause = 0;
+
+    pInst->m_Word0.val = 0;
+    pInst->m_Word1.val = 0;
+    pInst->m_Word2.val = 0;
+}
+
+void Init_R700VertexSemanticFetch(R700VertexSemanticFetch* pInst)
+{
+    /* Blank semantic vertex fetch: typed, unused, no linked clause, words zeroed. */
+    pInst->m_ShaderInstType = SIT_VTX_SEM;
+    pInst->useCount         = 0;
+
+    pInst->m_pLinkedGenericClause = 0;
+
+    pInst->m_Word0.val     = 0;
+    pInst->m_Word1.val     = 0;
+    pInst->m_Word1_SEM.val = 0;
+    pInst->m_Word2.val     = 0;
+}
+
+void Init_R700VertexGenericFetch(R700VertexGenericFetch* pInst)
+{
+    /* Blank generic vertex fetch: typed, unused, no linked clause, words zeroed. */
+    pInst->m_ShaderInstType = SIT_VTX_GENERIC;
+    pInst->useCount         = 0;
+
+    pInst->m_pLinkedGenericClause = 0;
+
+    pInst->m_Word0.val     = 0;
+    pInst->m_Word1.val     = 0;
+    pInst->m_Word1_GPR.val = 0;
+    pInst->m_Word2.val     = 0;
+}
+
+unsigned int GetInstructionSize(ShaderInstType instType)
+{
+    /*
+     * Size of one instruction of the given type, in DWORDs: 6 for an ALU
+     * instruction with a full literal, 4 for a half-literal ALU instruction
+     * and for texture and vertex fetches, and 2 for every other type
+     * (including plain ALU and all control-flow types).
+     */
+    if (instType == SIT_ALU_FALL_LIT)
+        return 6;
+
+    if (instType == SIT_ALU_HALF_LIT || instType == SIT_TEX ||
+        instType == SIT_VTX          || instType == SIT_VTX_GENERIC ||
+        instType == SIT_VTX_SEM)
+        return 4;
+
+    return 2;
+}
+
+unsigned int GetCFMaxInstructions(ShaderInstType instType)
+{
+    /*
+     * Limit associated with a CF clause type (going by the function name, the
+     * maximum number of instructions per clause): 0x80 for ALU clauses, 0x8
+     * for generic clauses and 0x10 for the export types and anything else.
+     */
+
+    if (instType == SIT_CF_ALU)
+        return 0x80;
+
+    if (instType == SIT_CF_GENERIC)
+        return 0x8;  //For tex and vtx
+
+    /* SIT_CF_ALL_EXP, SIT_CF_ALL_EXP_SX, SIT_CF_ALL_EXP_SMX and every other type. */
+    return 0x10;
+}
+
+GLboolean LinkVertexInstruction(R700ControlFlowGenericClause *pCFGeneric,
+								R700VertexInstruction *pVTXInstruction)
+{
+    /* Link clause and vertex fetch in both directions (GL_TRUE), unless the
+     * clause already carries a texture instruction (error + GL_FALSE). */
+    if (pCFGeneric->m_pLinkedTEXInstruction == 0)
+    {
+	pCFGeneric->m_pLinkedVTXInstruction     = pVTXInstruction;
+	pVTXInstruction->m_pLinkedGenericClause = pCFGeneric;
+	return GL_TRUE;
+    }
+    radeon_error("This instruction is already linked to a texture instruction.\n");
+    return GL_FALSE;
+}
+
+
+
diff --git a/src/mesa/drivers/dri/r600/r700_shaderinst.h b/src/mesa/drivers/dri/r600/r700_shaderinst.h
new file mode 100644
index 0000000000..2829cca0a3
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_shaderinst.h
@@ -0,0 +1,321 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+
+#ifndef _R700_SHADERINST_H_
+#define _R700_SHADERINST_H_
+
+#include "main/glheader.h"
+
+#include "defaultendian.h" 
+#include "sq_micro_reg.h"
+
+#define SQ_ALU_CONSTANT_PS_OFFSET      0x00000000
+#define SQ_ALU_CONSTANT_PS_COUNT       0x00000100
+#define SQ_ALU_CONSTANT_VS_OFFSET      0x00000100
+#define SQ_ALU_CONSTANT_VS_COUNT       0x00000100
+#define SQ_FETCH_RESOURCE_PS_OFFSET    0x00000000
+#define SQ_FETCH_RESOURCE_PS_COUNT     0x000000a0
+#define SQ_FETCH_RESOURCE_VS_OFFSET    0x000000a0
+#define SQ_FETCH_RESOURCE_VS_COUNT     0x000000b0
+
+#define SHADERINST_TYPEMASK_CF  0x10
+#define SHADERINST_TYPEMASK_ALU 0x20
+#define SHADERINST_TYPEMASK_TEX 0x40
+#define SHADERINST_TYPEMASK_VTX 0x80
+
+typedef enum ShaderInstType 
+{
+    SIT_CF = 0x10,            /*SIZE = 0x2*/
+        SIT_CF_ALL_EXP = 0x14,    /*SIZE = 0x2, MAX_INSTRUCTIONS = 0x10;*/
+            SIT_CF_ALL_EXP_SX = 0x15, /*SIZE = 0x2, MAX_INSTRUCTIONS = 0x10;*/
+            SIT_CF_ALL_EXP_SMX= 0x16, /*SIZE = 0x2, MAX_INSTRUCTIONS = 0x10;*/
+        SIT_CF_GENERIC = 0x18,    /*SIZE = 0x2, MAX_INSTRUCTIONS = 0x8;  //For tex and vtx*/
+        SIT_CF_ALU = 0x19,        /*SIZE = 0x2, MAX_INSTRUCTIONS = 0x80;*/
+    SIT_ALU = 0x20,           /*SIZE = 0x2,*/
+        SIT_ALU_HALF_LIT = 0x21,  /*SIZE = 0x4,*/
+        SIT_ALU_FALL_LIT = 0x22,  /*SIZE = 0x6,*/
+    SIT_TEX = 0x40,           /*SIZE = 0x4,*/
+    SIT_VTX = 0x80,           /*SIZE = 0x4, MEGA_FETCH_BYTES = 0x20*/
+        SIT_VTX_GENERIC = 0x81,   /*SIZE = 0x4, MEGA_FETCH_BYTES = 0x20*/
+        SIT_VTX_SEM = 0x82       /*SIZE = 0x4, MEGA_FETCH_BYTES = 0x20*/
+} ShaderInstType;
+
+typedef struct R700ShaderInstruction 
+{
+    ShaderInstType m_ShaderInstType;
+    struct R700ShaderInstruction *pNextInst;
+    GLuint m_uIndex;
+    GLuint useCount;
+} R700ShaderInstruction;
+
+// ------------------ CF insts ---------------------------
+
+typedef R700ShaderInstruction R700ControlFlowInstruction;
+
+typedef struct R700ControlFlowAllocExportClause  
+{
+    ShaderInstType          m_ShaderInstType;
+    R700ShaderInstruction * pNextInst;    
+    GLuint m_uIndex;
+    GLuint useCount;
+		
+    sq_cf_alloc_export_word0_u      m_Word0;
+    sq_cf_alloc_export_word1_u      m_Word1;
+} R700ControlFlowAllocExportClause;
+
+typedef struct R700ControlFlowSXClause 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700ControlFlowAllocExportClause
+		//R700ControlFlowInstruction 
+			//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+			//---------------------
+		//---------------------------
+    sq_cf_alloc_export_word0_u      m_Word0;
+    sq_cf_alloc_export_word1_u      m_Word1;
+	//-------------------------------------
+
+    sq_cf_alloc_export_word1_swiz_u m_Word1_SWIZ;
+} R700ControlFlowSXClause;
+
+typedef struct R700ControlFlowSMXClause 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+    //R700ControlFlowAllocExportClause
+		//R700ControlFlowInstruction 
+			//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+			//---------------------
+		//---------------------------
+    sq_cf_alloc_export_word0_u      m_Word0;
+    sq_cf_alloc_export_word1_u      m_Word1;
+	//-------------------------------
+
+    sq_cf_alloc_export_word1_buf_u m_Word1_BUF;
+} R700ControlFlowSMXClause;
+
+typedef struct R700ControlFlowGenericClause 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700ControlFlowInstruction
+		//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+		//---------------------
+	//---------------------
+
+    sq_cf_word0_u m_Word0;
+    sq_cf_word1_u m_Word1;
+
+    struct R700VertexInstruction  *m_pLinkedVTXInstruction;
+    struct R700TextureInstruction *m_pLinkedTEXInstruction;
+} R700ControlFlowGenericClause;
+
+typedef struct R700ControlFlowALUClause 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+    //R700ControlFlowInstruction
+		//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+		//---------------------
+	//---------------------
+
+    sq_cf_alu_word0_u m_Word0;
+    sq_cf_alu_word1_u m_Word1;
+    
+    struct R700ALUInstruction *m_pLinkedALUInstruction;
+} R700ControlFlowALUClause;
+
+// ------------------- End of CF Inst ------------------------
+
+// ------------------- ALU Inst ------------------------------
+typedef struct R700ALUInstruction 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+	//---------------------
+
+    sq_alu_word0_u     m_Word0;
+    sq_alu_word1_u     m_Word1;
+    sq_alu_word1_op2_v2_u m_Word1_OP2;
+    sq_alu_word1_op3_u m_Word1_OP3;
+
+    struct R700ControlFlowALUClause *m_pLinkedALUClause;
+} R700ALUInstruction;
+
+typedef struct R700ALUInstructionHalfLiteral
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700ALUInstruction 
+		//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+		//---------------------
+
+    sq_alu_word0_u     m_Word0;
+    sq_alu_word1_u     m_Word1;
+    sq_alu_word1_op2_v2_u m_Word1_OP2;
+    sq_alu_word1_op3_u m_Word1_OP3;
+
+    struct R700ControlFlowALUClause *m_pLinkedALUClause;
+	//-------------------
+
+    GLfloat m_fLiteralX,
+            m_fLiteralY;
+} R700ALUInstructionHalfLiteral;
+
+typedef struct R700ALUInstructionFullLiteral 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700ALUInstruction 
+		//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+		//---------------------
+
+    sq_alu_word0_u     m_Word0;
+    sq_alu_word1_u     m_Word1;
+    sq_alu_word1_op2_v2_u m_Word1_OP2;
+    sq_alu_word1_op3_u m_Word1_OP3;
+
+    struct R700ControlFlowALUClause *m_pLinkedALUClause;
+	//-------------------
+
+    GLfloat m_fLiteralX,
+            m_fLiteralY,
+            m_fLiteralZ,
+            m_fLiteralW;
+} R700ALUInstructionFullLiteral;
+// ------------------- End of ALU Inst -----------------------
+
+// ------------------- Texture/Vertex Instruction --------------------
+
+typedef struct R700TextureInstruction 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+	//---------------------
+	
+    sq_tex_word0_u m_Word0;
+    sq_tex_word1_u m_Word1;
+    sq_tex_word2_u m_Word2;
+
+    struct R700ControlFlowGenericClause *m_pLinkedGenericClause;
+} R700TextureInstruction;
+
+typedef struct R700VertexInstruction 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+	//---------------------
+	
+    sq_vtx_word0_u     m_Word0;
+    sq_vtx_word1_u     m_Word1;
+    sq_vtx_word2_u     m_Word2;
+
+    struct R700ControlFlowGenericClause *m_pLinkedGenericClause;
+} R700VertexInstruction;
+//
+typedef struct R700VertexSemanticFetch 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700VertexInstruction
+		//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+		//---------------------
+	
+    sq_vtx_word0_u     m_Word0;
+    sq_vtx_word1_u     m_Word1;
+    sq_vtx_word2_u     m_Word2;
+
+    struct R700ControlFlowGenericClause *m_pLinkedGenericClause;
+	//---------------------------
+
+    sq_vtx_word1_sem_u m_Word1_SEM;
+} R700VertexSemanticFetch;
+//
+typedef struct R700VertexGenericFetch 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700VertexInstruction
+		//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+		//---------------------
+	
+    sq_vtx_word0_u     m_Word0;
+    sq_vtx_word1_u     m_Word1;
+    sq_vtx_word2_u     m_Word2;
+
+    struct R700ControlFlowGenericClause *m_pLinkedGenericClause;
+	//---------------------------
+
+    sq_vtx_word1_gpr_u m_Word1_GPR;
+} R700VertexGenericFetch;
+
+// ------------------- End of Texture Vertex Instruction --------------------
+
+void Init_R700ControlFlowGenericClause(R700ControlFlowGenericClause* pInst);
+void Init_R700ControlFlowALUClause(R700ControlFlowALUClause* pInst);
+void Init_R700ControlFlowSXClause(R700ControlFlowSXClause* pInst);
+void Init_R700ControlFlowSMXClause(R700ControlFlowSMXClause* pInst);
+void Init_R700ALUInstruction(R700ALUInstruction* pInst);
+void Init_R700ALUInstructionHalfLiteral(R700ALUInstructionHalfLiteral* pInst, GLfloat x, GLfloat y);
+void Init_R700ALUInstructionFullLiteral(R700ALUInstructionFullLiteral* pInst, GLfloat x, GLfloat y, GLfloat z, GLfloat w);
+void Init_R700TextureInstruction(R700TextureInstruction* pInst);
+void Init_R700VertexSemanticFetch(R700VertexSemanticFetch* pInst);
+void Init_R700VertexGenericFetch(R700VertexGenericFetch* pInst);
+
+unsigned int GetInstructionSize(ShaderInstType instType);
+unsigned int GetCFMaxInstructions(ShaderInstType instType);
+
+GLboolean LinkVertexInstruction(R700ControlFlowGenericClause *pCFGeneric,
+								R700VertexInstruction *pVTXInstruction);
+
+#endif //_R700_SHADERINST_H_
diff --git a/src/mesa/drivers/dri/r600/r700_state.c b/src/mesa/drivers/dri/r600/r700_state.c
new file mode 100644
index 0000000000..e91aa43118
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_state.c
@@ -0,0 +1,1810 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+#include "main/glheader.h"
+#include "main/mtypes.h"
+#include "main/state.h"
+#include "main/imports.h"
+#include "main/enums.h"
+#include "main/macros.h"
+#include "main/context.h"
+#include "main/dd.h"
+#include "main/simple_list.h"
+
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+#include "tnl/t_vp_build.h"
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "main/api_arrayelt.h"
+#include "main/state.h"
+#include "main/framebuffer.h"
+
+#include "shader/prog_parameter.h"
+#include "shader/prog_statevars.h"
+#include "vbo/vbo.h"
+#include "main/texformat.h"
+
+#include "r600_context.h"
+
+#include "r700_state.h"
+
+#include "r700_fragprog.h"
+#include "r700_vertprog.h"
+
+
+static void r700SetClipPlaneState(GLcontext * ctx, GLenum cap, GLboolean state);
+static void r700UpdatePolygonMode(GLcontext * ctx);
+static void r700SetPolygonOffsetState(GLcontext * ctx, GLboolean state);
+static void r700SetStencilState(GLcontext * ctx, GLboolean state);
+
+void r700UpdateShaders (GLcontext * ctx)  //----------------------------------
+{
+    context_t *context = R700_CONTEXT(ctx);
+    GLvector4f dummy_attrib[_TNL_ATTRIB_MAX];
+    GLvector4f *temp_attrib[_TNL_ATTRIB_MAX];
+    int i;
+
+    /* should only happen once, just after context is created */
+    /* TODO: shouldn't we fallback to sw here? */
+    if (!ctx->FragmentProgram._Current) {
+	    _mesa_fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
+	    return;
+    }
+
+    r700SelectFragmentShader(ctx);
+
+    if (context->radeon.NewGLState) {
+	    for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
+		    /* mat states from state var not array for sw */
+		    dummy_attrib[i].stride = 0;
+	            temp_attrib[i] = TNL_CONTEXT(ctx)->vb.AttribPtr[i];
+		    TNL_CONTEXT(ctx)->vb.AttribPtr[i] = &(dummy_attrib[i]);
+	    }
+
+	    _tnl_UpdateFixedFunctionProgram(ctx);
+
+	    for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
+		    TNL_CONTEXT(ctx)->vb.AttribPtr[i] = temp_attrib[i];
+	    }
+    }
+
+    r700SelectVertexShader(ctx);
+    r700UpdateStateParameters(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+    context->radeon.NewGLState = 0;
+}
+
+/*
+ * To correctly position primitives:
+ */
+void r700UpdateViewportOffset(GLcontext * ctx) //------------------
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&context->radeon);
+	GLfloat xoffset = (GLfloat) dPriv->x;
+	GLfloat yoffset = (GLfloat) dPriv->y + dPriv->h;
+	const GLfloat *v = ctx->Viewport._WindowMap.m;
+	int id = 0;
+
+	GLfloat tx = v[MAT_TX] + xoffset;
+	GLfloat ty = (-v[MAT_TY]) + yoffset;
+
+	if (r700->viewport[id].PA_CL_VPORT_XOFFSET.f32All != tx ||
+	    r700->viewport[id].PA_CL_VPORT_YOFFSET.f32All != ty) {
+		/* Note: this should also modify whatever data the context reset
+		 * code uses...
+		 */
+		R600_STATECHANGE(context, vpt);
+		r700->viewport[id].PA_CL_VPORT_XOFFSET.f32All = tx;
+		r700->viewport[id].PA_CL_VPORT_YOFFSET.f32All = ty;
+	}
+
+	radeonUpdateScissor(ctx);
+}
+
+void r700UpdateStateParameters(GLcontext * ctx, GLuint new_state) //--------------------
+{
+	struct r700_fragment_program *fp =
+		(struct r700_fragment_program *)ctx->FragmentProgram._Current;
+	struct gl_program_parameter_list *paramList;
+
+	if (!(new_state & (_NEW_BUFFERS | _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS)))
+		return;
+
+	if (!ctx->FragmentProgram._Current || !fp)
+		return;
+
+	paramList = ctx->FragmentProgram._Current->Base.Parameters;
+
+	if (!paramList)
+		return;
+
+	_mesa_load_state_parameters(ctx, paramList);
+
+}
+
+/**
+ * Called by Mesa after an internal state update.
+ */
+static void r700InvalidateState(GLcontext * ctx, GLuint new_state) //-------------------
+{
+    context_t *context = R700_CONTEXT(ctx);
+
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+    _swrast_InvalidateState(ctx, new_state);
+    _swsetup_InvalidateState(ctx, new_state);
+    _vbo_InvalidateState(ctx, new_state);
+    _tnl_InvalidateState(ctx, new_state);
+    _ae_invalidate_state(ctx, new_state);
+
+    if (new_state & _NEW_BUFFERS) {
+	    _mesa_update_framebuffer(ctx);
+	    /* this updates the DrawBuffer's Width/Height if it's a FBO */
+	    _mesa_update_draw_buffer_bounds(ctx);
+
+	    R600_STATECHANGE(context, cb_target);
+	    R600_STATECHANGE(context, db_target);
+    }
+
+    if (new_state & (_NEW_LIGHT)) {
+	    R600_STATECHANGE(context, su);
+	    if (ctx->Light.ProvokingVertex == GL_LAST_VERTEX_CONVENTION)
+		    SETbit(r700->PA_SU_SC_MODE_CNTL.u32All, PROVOKING_VTX_LAST_bit);
+	    else
+		    CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, PROVOKING_VTX_LAST_bit);
+    }
+
+    r700UpdateStateParameters(ctx, new_state);
+
+    R600_STATECHANGE(context, cl);
+    R600_STATECHANGE(context, spi);
+
+    if(GL_TRUE == r700->bEnablePerspective)
+    {
+        /* Do scale XY and Z by 1/W0 for perspective correction on pos. For orthogonal case, set both to one. */
+        CLEARbit(r700->PA_CL_VTE_CNTL.u32All, VTX_XY_FMT_bit);
+        CLEARbit(r700->PA_CL_VTE_CNTL.u32All, VTX_Z_FMT_bit);
+
+        SETbit(r700->PA_CL_VTE_CNTL.u32All, VTX_W0_FMT_bit);
+
+        SETbit(r700->SPI_PS_IN_CONTROL_0.u32All, PERSP_GRADIENT_ENA_bit);
+        CLEARbit(r700->SPI_PS_IN_CONTROL_0.u32All, LINEAR_GRADIENT_ENA_bit);
+    }
+    else
+    {
+        /* For orthogonal case. */
+        SETbit(r700->PA_CL_VTE_CNTL.u32All, VTX_XY_FMT_bit);
+        SETbit(r700->PA_CL_VTE_CNTL.u32All, VTX_Z_FMT_bit);
+
+        SETbit(r700->PA_CL_VTE_CNTL.u32All, VTX_W0_FMT_bit);
+
+        CLEARbit(r700->SPI_PS_IN_CONTROL_0.u32All, PERSP_GRADIENT_ENA_bit);
+        SETbit(r700->SPI_PS_IN_CONTROL_0.u32All, LINEAR_GRADIENT_ENA_bit);
+    }
+
+    context->radeon.NewGLState |= new_state;
+}
+
+static void r700SetDepthState(GLcontext * ctx)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+	R600_STATECHANGE(context, db);
+
+    if (ctx->Depth.Test)
+    {
+        SETbit(r700->DB_DEPTH_CONTROL.u32All, Z_ENABLE_bit);
+        if (ctx->Depth.Mask)
+        {
+            SETbit(r700->DB_DEPTH_CONTROL.u32All, Z_WRITE_ENABLE_bit);
+        }
+        else
+        {
+            CLEARbit(r700->DB_DEPTH_CONTROL.u32All, Z_WRITE_ENABLE_bit);
+        }
+
+        switch (ctx->Depth.Func)
+        {
+        case GL_NEVER:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_NEVER,
+                     ZFUNC_shift, ZFUNC_mask);
+            break;
+        case GL_LESS:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_LESS,
+                     ZFUNC_shift, ZFUNC_mask);
+            break;
+        case GL_EQUAL:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_EQUAL,
+                     ZFUNC_shift, ZFUNC_mask);
+            break;
+        case GL_LEQUAL:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_LEQUAL,
+                     ZFUNC_shift, ZFUNC_mask);
+            break;
+        case GL_GREATER:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_GREATER,
+                     ZFUNC_shift, ZFUNC_mask);
+            break;
+        case GL_NOTEQUAL:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_NOTEQUAL,
+                     ZFUNC_shift, ZFUNC_mask);
+            break;
+        case GL_GEQUAL:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_GEQUAL,
+                     ZFUNC_shift, ZFUNC_mask);
+            break;
+        case GL_ALWAYS:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_ALWAYS,
+                     ZFUNC_shift, ZFUNC_mask);
+            break;
+        default:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_ALWAYS,
+                     ZFUNC_shift, ZFUNC_mask);
+            break;
+        }
+    }
+    else
+    {
+        CLEARbit(r700->DB_DEPTH_CONTROL.u32All, Z_ENABLE_bit);
+        CLEARbit(r700->DB_DEPTH_CONTROL.u32All, Z_WRITE_ENABLE_bit);
+    }
+}
+
+static void r700SetAlphaState(GLcontext * ctx)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	uint32_t alpha_func = REF_ALWAYS;
+	GLboolean really_enabled = ctx->Color.AlphaEnabled;
+
+	R600_STATECHANGE(context, sx);
+
+	switch (ctx->Color.AlphaFunc) {
+	case GL_NEVER:
+		alpha_func = REF_NEVER;
+		break;
+	case GL_LESS:
+		alpha_func = REF_LESS;
+		break;
+	case GL_EQUAL:
+		alpha_func = REF_EQUAL;
+		break;
+	case GL_LEQUAL:
+		alpha_func = REF_LEQUAL;
+		break;
+	case GL_GREATER:
+		alpha_func = REF_GREATER;
+		break;
+	case GL_NOTEQUAL:
+		alpha_func = REF_NOTEQUAL;
+		break;
+	case GL_GEQUAL:
+		alpha_func = REF_GEQUAL;
+		break;
+	case GL_ALWAYS:
+		/*alpha_func = REF_ALWAYS; */
+		really_enabled = GL_FALSE;
+		break;
+	}
+
+	if (really_enabled) {
+		SETfield(r700->SX_ALPHA_TEST_CONTROL.u32All, alpha_func,
+			 ALPHA_FUNC_shift, ALPHA_FUNC_mask);
+		SETbit(r700->SX_ALPHA_TEST_CONTROL.u32All, ALPHA_TEST_ENABLE_bit);
+		r700->SX_ALPHA_REF.f32All = ctx->Color.AlphaRef;
+	} else {
+		CLEARbit(r700->SX_ALPHA_TEST_CONTROL.u32All, ALPHA_TEST_ENABLE_bit);
+	}
+
+}
+
+static void r700AlphaFunc(GLcontext * ctx, GLenum func, GLfloat ref) //---------------
+{
+	(void)func;
+	(void)ref;
+	r700SetAlphaState(ctx);
+}
+
+
+static void r700BlendColor(GLcontext * ctx, const GLfloat cf[4]) //----------------
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+	R600_STATECHANGE(context, blnd_clr);
+
+	r700->CB_BLEND_RED.f32All = cf[0];
+	r700->CB_BLEND_GREEN.f32All = cf[1];
+	r700->CB_BLEND_BLUE.f32All = cf[2];
+	r700->CB_BLEND_ALPHA.f32All = cf[3];
+}
+
+static int blend_factor(GLenum factor, GLboolean is_src)
+{
+	switch (factor) {
+	case GL_ZERO:
+		return BLEND_ZERO;
+		break;
+	case GL_ONE:
+		return BLEND_ONE;
+		break;
+	case GL_DST_COLOR:
+		return BLEND_DST_COLOR;
+		break;
+	case GL_ONE_MINUS_DST_COLOR:
+		return BLEND_ONE_MINUS_DST_COLOR;
+		break;
+	case GL_SRC_COLOR:
+		return BLEND_SRC_COLOR;
+		break;
+	case GL_ONE_MINUS_SRC_COLOR:
+		return BLEND_ONE_MINUS_SRC_COLOR;
+		break;
+	case GL_SRC_ALPHA:
+		return BLEND_SRC_ALPHA;
+		break;
+	case GL_ONE_MINUS_SRC_ALPHA:
+		return BLEND_ONE_MINUS_SRC_ALPHA;
+		break;
+	case GL_DST_ALPHA:
+		return BLEND_DST_ALPHA;
+		break;
+	case GL_ONE_MINUS_DST_ALPHA:
+		return BLEND_ONE_MINUS_DST_ALPHA;
+		break;
+	case GL_SRC_ALPHA_SATURATE:
+		return (is_src) ? BLEND_SRC_ALPHA_SATURATE : BLEND_ZERO;
+		break;
+	case GL_CONSTANT_COLOR:
+		return BLEND_CONSTANT_COLOR;
+		break;
+	case GL_ONE_MINUS_CONSTANT_COLOR:
+		return BLEND_ONE_MINUS_CONSTANT_COLOR;
+		break;
+	case GL_CONSTANT_ALPHA:
+		return BLEND_CONSTANT_ALPHA;
+		break;
+	case GL_ONE_MINUS_CONSTANT_ALPHA:
+		return BLEND_ONE_MINUS_CONSTANT_ALPHA;
+		break;
+	default:
+		fprintf(stderr, "unknown blend factor %x\n", factor);
+		return (is_src) ? BLEND_ONE : BLEND_ZERO;
+		break;
+	}
+}
+
+static void r700SetBlendState(GLcontext * ctx)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	int id = 0;
+	uint32_t blend_reg = 0, eqn, eqnA;
+	/* Builds the blend-control value from ctx->Color and stores it in r700. */
+	R600_STATECHANGE(context, blnd);
+	/* Logic op or disabled blending: program src*ONE + dst*ZERO and stop. */
+	if (RGBA_LOGICOP_ENABLED(ctx) || !ctx->Color.BlendEnabled) {
+		SETfield(blend_reg,
+			 BLEND_ONE, COLOR_SRCBLEND_shift, COLOR_SRCBLEND_mask);
+		SETfield(blend_reg,
+			 BLEND_ZERO, COLOR_DESTBLEND_shift, COLOR_DESTBLEND_mask);
+		SETfield(blend_reg,
+			 COMB_DST_PLUS_SRC, COLOR_COMB_FCN_shift, COLOR_COMB_FCN_mask);
+		SETfield(blend_reg,
+			 BLEND_ONE, ALPHA_SRCBLEND_shift, ALPHA_SRCBLEND_mask);
+		SETfield(blend_reg,
+			 BLEND_ZERO, ALPHA_DESTBLEND_shift, ALPHA_DESTBLEND_mask);
+		SETfield(blend_reg,
+			 COMB_DST_PLUS_SRC, ALPHA_COMB_FCN_shift, ALPHA_COMB_FCN_mask);
+		if (context->radeon.radeonScreen->chip_family == CHIP_FAMILY_R600)
+			r700->CB_BLEND_CONTROL.u32All = blend_reg;
+		else
+			r700->render_target[id].CB_BLEND0_CONTROL.u32All = blend_reg;
+		return;
+	}
+
+	SETfield(blend_reg,
+		 blend_factor(ctx->Color.BlendSrcRGB, GL_TRUE),
+		 COLOR_SRCBLEND_shift, COLOR_SRCBLEND_mask);
+	SETfield(blend_reg,
+		 blend_factor(ctx->Color.BlendDstRGB, GL_FALSE),
+		 COLOR_DESTBLEND_shift, COLOR_DESTBLEND_mask);
+
+	switch (ctx->Color.BlendEquationRGB) {
+	case GL_FUNC_ADD:
+		eqn = COMB_DST_PLUS_SRC;
+		break;
+	case GL_FUNC_SUBTRACT:
+		eqn = COMB_SRC_MINUS_DST;
+		break;
+	case GL_FUNC_REVERSE_SUBTRACT:
+		eqn = COMB_DST_MINUS_SRC;
+		break;
+	case GL_MIN:
+		eqn = COMB_MIN_DST_SRC;
+		SETfield(blend_reg,
+			 BLEND_ONE,
+			 COLOR_SRCBLEND_shift, COLOR_SRCBLEND_mask);
+		SETfield(blend_reg,
+			 BLEND_ONE,
+			 COLOR_DESTBLEND_shift, COLOR_DESTBLEND_mask);
+		break;
+	case GL_MAX:
+		eqn = COMB_MAX_DST_SRC;
+		SETfield(blend_reg,
+			 BLEND_ONE,
+			 COLOR_SRCBLEND_shift, COLOR_SRCBLEND_mask);
+		SETfield(blend_reg,
+			 BLEND_ONE,
+			 COLOR_DESTBLEND_shift, COLOR_DESTBLEND_mask);
+		break;
+
+	default:
+		fprintf(stderr,
+			"[%s:%u] Invalid RGB blend equation (0x%04x).\n",
+			__FUNCTION__, __LINE__, ctx->Color.BlendEquationRGB);
+		return;
+	}
+	SETfield(blend_reg,
+		 eqn, COLOR_COMB_FCN_shift, COLOR_COMB_FCN_mask);
+	/* Alpha factors come from the separate alpha state, not the RGB state. */
+	SETfield(blend_reg,
+		 blend_factor(ctx->Color.BlendSrcA, GL_TRUE),
+		 ALPHA_SRCBLEND_shift, ALPHA_SRCBLEND_mask);
+	SETfield(blend_reg,
+		 blend_factor(ctx->Color.BlendDstA, GL_FALSE),
+		 ALPHA_DESTBLEND_shift, ALPHA_DESTBLEND_mask);
+
+	switch (ctx->Color.BlendEquationA) {
+	case GL_FUNC_ADD:
+		eqnA = COMB_DST_PLUS_SRC;
+		break;
+	case GL_FUNC_SUBTRACT:
+		eqnA = COMB_SRC_MINUS_DST;
+		break;
+	case GL_FUNC_REVERSE_SUBTRACT:
+		eqnA = COMB_DST_MINUS_SRC;
+		break;
+	case GL_MIN:
+		eqnA = COMB_MIN_DST_SRC;
+		SETfield(blend_reg,
+			 BLEND_ONE,
+			 ALPHA_SRCBLEND_shift, ALPHA_SRCBLEND_mask);
+		SETfield(blend_reg,
+			 BLEND_ONE,
+			 ALPHA_DESTBLEND_shift, ALPHA_DESTBLEND_mask);
+		break;
+	case GL_MAX:
+		eqnA = COMB_MAX_DST_SRC;
+		SETfield(blend_reg,
+			 BLEND_ONE,
+			 ALPHA_SRCBLEND_shift, ALPHA_SRCBLEND_mask);
+		SETfield(blend_reg,
+			 BLEND_ONE,
+			 ALPHA_DESTBLEND_shift, ALPHA_DESTBLEND_mask);
+		break;
+	default:
+		fprintf(stderr,
+			"[%s:%u] Invalid A blend equation (0x%04x).\n",
+			__FUNCTION__, __LINE__, ctx->Color.BlendEquationA);
+		return;
+	}
+
+	SETfield(blend_reg,
+		 eqnA, ALPHA_COMB_FCN_shift, ALPHA_COMB_FCN_mask);
+
+	SETbit(blend_reg, SEPARATE_ALPHA_BLEND_bit);
+	/* R600 has one global blend register; other chips use the per-target one. */
+	if (context->radeon.radeonScreen->chip_family == CHIP_FAMILY_R600)
+		r700->CB_BLEND_CONTROL.u32All = blend_reg;
+	else {
+		r700->render_target[id].CB_BLEND0_CONTROL.u32All = blend_reg;
+		SETbit(r700->CB_COLOR_CONTROL.u32All, PER_MRT_BLEND_bit);
+	}
+	SETfield(r700->CB_COLOR_CONTROL.u32All, (1 << id),
+		 TARGET_BLEND_ENABLE_shift, TARGET_BLEND_ENABLE_mask);
+
+}
+
+static void r700BlendEquationSeparate(GLcontext * ctx,
+				                      GLenum modeRGB, GLenum modeA) //-----------------
+{
+	r700SetBlendState(ctx);
+}
+
+static void r700BlendFuncSeparate(GLcontext * ctx,
+				  GLenum sfactorRGB, GLenum dfactorRGB,
+				  GLenum sfactorA, GLenum dfactorA) //------------------------
+{
+	r700SetBlendState(ctx);
+}
+
+/**
+ * Translate LogicOp enums into hardware representation.
+ */
+static GLuint translate_logicop(GLenum logicop)
+{
+	switch (logicop) {	/* codes follow s = 0xcc (source), d = 0xaa (dest) */
+	case GL_CLEAR:		/* 0 */
+		return 0x00;
+	case GL_SET:		/* 1 */
+		return 0xff;
+	case GL_COPY:		/* s */
+		return 0xcc;
+	case GL_COPY_INVERTED:	/* ~s */
+		return 0x33;
+	case GL_NOOP:		/* d */
+		return 0xaa;
+	case GL_INVERT:		/* ~d */
+		return 0x55;
+	case GL_AND:		/* s & d */
+		return 0x88;
+	case GL_NAND:		/* ~(s & d) */
+		return 0x77;
+	case GL_OR:		/* s | d */
+		return 0xee;
+	case GL_NOR:		/* ~(s | d) */
+		return 0x11;
+	case GL_XOR:		/* s ^ d */
+		return 0x66;
+	case GL_EQUIV:		/* ~(s ^ d); was 0xaa, which is the GL_NOOP code */
+		return 0x99;
+	case GL_AND_REVERSE:	/* s & ~d */
+		return 0x44;
+	case GL_AND_INVERTED:	/* ~s & d */
+		return 0x22;
+	case GL_OR_REVERSE:	/* s | ~d */
+		return 0xdd;
+	case GL_OR_INVERTED:	/* ~s | d */
+		return 0xbb;
+	default:		/* unknown op: report it and fall back to the GL_COPY code */
+		fprintf(stderr, "unknown blend logic operation %x\n", logicop);
+		return 0xcc;
+	}
+}
+
+/**
+ * Used internally to update the r700 logic-op (ROP3) hardware state to match
+ * the current OpenGL state.
+ */
+static void r700SetLogicOpState(GLcontext *ctx)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&R700_CONTEXT(ctx)->hw);
+	/* 0xCC is also what translate_logicop() returns for GL_COPY. */
+	GLuint rop3 = 0xCC;
+
+	R600_STATECHANGE(context, blnd);
+
+	if (RGBA_LOGICOP_ENABLED(ctx))
+		rop3 = translate_logicop(ctx->Color.LogicOp);
+	SETfield(r700->CB_COLOR_CONTROL.u32All, rop3, ROP3_shift, ROP3_mask);
+}
+
+/**
+ * Called by Mesa when an application program changes the LogicOp state
+ * via glLogicOp.
+ */
+static void r700LogicOpcode(GLcontext *ctx, GLenum logicop)
+{
+	if (RGBA_LOGICOP_ENABLED(ctx))
+		r700SetLogicOpState(ctx);
+}
+
+static void r700UpdateCulling(GLcontext * ctx)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&R700_CONTEXT(ctx)->hw);
+
+    R600_STATECHANGE(context, su);
+
+    CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, FACE_bit);
+    CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_FRONT_bit);
+    CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_BACK_bit);
+
+    if (ctx->Polygon.CullFlag)
+    {
+        switch (ctx->Polygon.CullFaceMode)
+        {
+        case GL_FRONT:
+            SETbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_FRONT_bit);
+            CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_BACK_bit);
+            break;
+        case GL_BACK:
+            CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_FRONT_bit);
+            SETbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_BACK_bit);
+            break;
+        case GL_FRONT_AND_BACK:
+            SETbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_FRONT_bit);
+            SETbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_BACK_bit);
+            break;
+        default:
+            CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_FRONT_bit);
+            CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_BACK_bit);
+            break;
+        }
+    }
+
+    switch (ctx->Polygon.FrontFace)
+    {
+        case GL_CW:
+            SETbit(r700->PA_SU_SC_MODE_CNTL.u32All, FACE_bit);
+            break;
+        case GL_CCW:
+            CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, FACE_bit);
+            break;
+        default:
+            CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, FACE_bit); /* default: ccw */
+            break;
+    }
+}
+
+static void r700UpdateLineStipple(GLcontext * ctx)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&R700_CONTEXT(ctx)->hw);
+
+    R600_STATECHANGE(context, sc);
+
+    if (ctx->Line.StippleFlag)
+    {
+	SETbit(r700->PA_SC_MODE_CNTL.u32All, LINE_STIPPLE_ENABLE_bit);
+    }
+    else
+    {
+	CLEARbit(r700->PA_SC_MODE_CNTL.u32All, LINE_STIPPLE_ENABLE_bit);
+    }
+}
+
+static void r700Enable(GLcontext * ctx, GLenum cap, GLboolean state) //------------------
+{
+	context_t *context = R700_CONTEXT(ctx);
+
+	switch (cap) {
+	case GL_TEXTURE_1D:
+	case GL_TEXTURE_2D:
+	case GL_TEXTURE_3D:
+		/* empty */
+		break;
+	case GL_FOG:
+		/* empty */
+		break;
+	case GL_ALPHA_TEST:
+		r700SetAlphaState(ctx);
+		break;
+	case GL_COLOR_LOGIC_OP:
+		r700SetLogicOpState(ctx);
+		/* fall-through, because logic op overrides blending */
+	case GL_BLEND:
+		r700SetBlendState(ctx);
+		break;
+	case GL_CLIP_PLANE0:
+	case GL_CLIP_PLANE1:
+	case GL_CLIP_PLANE2:
+	case GL_CLIP_PLANE3:
+	case GL_CLIP_PLANE4:
+	case GL_CLIP_PLANE5:
+		r700SetClipPlaneState(ctx, cap, state);
+		break;
+	case GL_DEPTH_TEST:
+		r700SetDepthState(ctx);
+		break;
+	case GL_STENCIL_TEST:
+		r700SetStencilState(ctx, state);
+		break;
+	case GL_CULL_FACE:
+		r700UpdateCulling(ctx);
+		break;
+	case GL_POLYGON_OFFSET_POINT:
+	case GL_POLYGON_OFFSET_LINE:
+	case GL_POLYGON_OFFSET_FILL:
+		r700SetPolygonOffsetState(ctx, state);
+		break;
+	case GL_SCISSOR_TEST:
+		radeon_firevertices(&context->radeon);
+		context->radeon.state.scissor.enabled = state;
+		radeonUpdateScissor(ctx);
+		break;
+	case GL_LINE_STIPPLE:
+		r700UpdateLineStipple(ctx);
+		break;
+	default:
+		break;
+	}
+
+}
+
+/**
+ * Handle glColorMask()
+ *
+ * The four channel flags are packed into a 4-bit value (red = bit 0,
+ * green = bit 1, blue = bit 2, alpha = bit 3).  When that value differs
+ * from the current contents of CB_SHADER_MASK it is written into the
+ * OUTPUT0_ENABLE field and the "cb" state group is flagged.
+ */
+static void r700ColorMask(GLcontext * ctx,
+			  GLboolean r, GLboolean g, GLboolean b, GLboolean a)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	unsigned int mask = 0;
+
+	if (r)
+		mask |= 1;
+	if (g)
+		mask |= 2;
+	if (b)
+		mask |= 4;
+	if (a)
+		mask |= 8;
+
+	/* nothing to do when the register already holds this value */
+	if (mask == r700->CB_SHADER_MASK.u32All)
+		return;
+
+	R600_STATECHANGE(context, cb);
+	SETfield(r700->CB_SHADER_MASK.u32All, mask, OUTPUT0_ENABLE_shift, OUTPUT0_ENABLE_mask);
+}
+
+/**
+ * Change the depth testing function.
+ *
+ * \p func itself is not consulted; the depth state is recomputed by
+ * r700SetDepthState().
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r700DepthFunc(GLcontext * ctx, GLenum func)
+{
+	(void)func;
+
+	r700SetDepthState(ctx);
+}
+
+/**
+ * Enable/Disable depth writing.
+ *
+ * \p mask itself is not consulted; the depth state is recomputed by
+ * r700SetDepthState().
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r700DepthMask(GLcontext * ctx, GLboolean mask)
+{
+	(void)mask;
+
+	r700SetDepthState(ctx);
+}
+
+/**
+ * Change the culling mode.
+ *
+ * \p mode itself is not consulted; the work is delegated to
+ * r700UpdateCulling().
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r700CullFace(GLcontext * ctx, GLenum mode)
+{
+	(void)mode;
+
+	r700UpdateCulling(ctx);
+}
+
+/* =============================================================
+ * Fog
+ */
+
+/**
+ * Fog parameter hook.  Intentionally a no-op: no hardware state is
+ * programmed here.
+ */
+static void r700Fogfv(GLcontext * ctx, GLenum pname, const GLfloat * param)
+{
+	(void)ctx;
+	(void)pname;
+	(void)param;
+}
+
+/**
+ * Change the polygon orientation.
+ *
+ * Both the culling state and the polygon-mode state are refreshed, in
+ * that order; \p mode itself is not consulted here.
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r700FrontFace(GLcontext * ctx, GLenum mode)
+{
+	(void)mode;
+
+	r700UpdateCulling(ctx);
+	r700UpdatePolygonMode(ctx);
+}
+
+/**
+ * Select flat or smooth shading.
+ *
+ * GL_FLAT sets and GL_SMOOTH clears FLAT_SHADE_ENA in
+ * SPI_INTERP_CONTROL_0; any other \p mode leaves the register alone.
+ * The "spi" state group is flagged in every case.
+ */
+static void r700ShadeModel(GLcontext * ctx, GLenum mode)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+	R600_STATECHANGE(context, spi);
+
+	/* also need to set/clear FLAT_SHADE bit per param in SPI_PS_INPUT_CNTL_[0-31] */
+	if (mode == GL_FLAT) {
+		SETbit(r700->SPI_INTERP_CONTROL_0.u32All, FLAT_SHADE_ENA_bit);
+	} else if (mode == GL_SMOOTH) {
+		CLEARbit(r700->SPI_INTERP_CONTROL_0.u32All, FLAT_SHADE_ENA_bit);
+	}
+}
+
+/* =============================================================
+ * Point state
+ */
+
+/**
+ * Program the point size.
+ *
+ * \p size is clamped first to the user range (ctx->Point.MinSize ..
+ * MaxSize) and then to the implementation range (ctx->Const.MinPointSize
+ * .. MaxPointSize); the result, scaled by 16, is written to both the
+ * HEIGHT and WIDTH fields of PA_SU_POINT_SIZE.
+ */
+static void r700PointSize(GLcontext * ctx, GLfloat size)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	int fixed_size;
+
+	R600_STATECHANGE(context, su);
+
+	/* We need to clamp to user defined range here, because
+	 * the HW clamping happens only for per vertex point size. */
+	size = CLAMP(size, ctx->Point.MinSize, ctx->Point.MaxSize);
+
+	/* same size limits for AA, non-AA points */
+	size = CLAMP(size, ctx->Const.MinPointSize, ctx->Const.MaxPointSize);
+
+	/* format is 12.4 fixed point */
+	fixed_size = (int)(size * 16);
+
+	SETfield(r700->PA_SU_POINT_SIZE.u32All, fixed_size,
+		 PA_SU_POINT_SIZE__HEIGHT_shift, PA_SU_POINT_SIZE__HEIGHT_mask);
+	SETfield(r700->PA_SU_POINT_SIZE.u32All, fixed_size,
+		 PA_SU_POINT_SIZE__WIDTH_shift, PA_SU_POINT_SIZE__WIDTH_mask);
+}
+
+/**
+ * Update the point size limits.
+ *
+ * Only GL_POINT_SIZE_MIN and GL_POINT_SIZE_MAX program anything: the
+ * corresponding value is read back from ctx->Point (not from \p param),
+ * scaled by 16 and stored in PA_SU_POINT_MINMAX.  Every other \p pname is
+ * ignored.  The "su" state group is flagged in every case.
+ */
+static void r700PointParameter(GLcontext * ctx, GLenum pname, const GLfloat * param)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+	(void)param;
+
+	R600_STATECHANGE(context, su);
+
+	/* format is 12.4 fixed point */
+	if (pname == GL_POINT_SIZE_MIN) {
+		SETfield(r700->PA_SU_POINT_MINMAX.u32All, (int)(ctx->Point.MinSize * 16.0),
+			 MIN_SIZE_shift, MIN_SIZE_mask);
+	} else if (pname == GL_POINT_SIZE_MAX) {
+		SETfield(r700->PA_SU_POINT_MINMAX.u32All, (int)(ctx->Point.MaxSize * 16.0),
+			 MAX_SIZE_shift, MAX_SIZE_mask);
+	}
+	/* GL_POINT_DISTANCE_ATTENUATION, GL_POINT_FADE_THRESHOLD_SIZE and
+	 * anything else: nothing to program. */
+}
+
+/**
+ * Map a GL comparison function onto the matching REF_* value.
+ *
+ * \return the REF_* value, or 0 when \p func is not one of the eight
+ *         comparison enums handled below.
+ */
+static int translate_stencil_func(int func)
+{
+	int hw_func = 0;
+
+	switch (func) {
+	case GL_NEVER:
+		hw_func = REF_NEVER;
+		break;
+	case GL_LESS:
+		hw_func = REF_LESS;
+		break;
+	case GL_EQUAL:
+		hw_func = REF_EQUAL;
+		break;
+	case GL_LEQUAL:
+		hw_func = REF_LEQUAL;
+		break;
+	case GL_GREATER:
+		hw_func = REF_GREATER;
+		break;
+	case GL_NOTEQUAL:
+		hw_func = REF_NOTEQUAL;
+		break;
+	case GL_GEQUAL:
+		hw_func = REF_GEQUAL;
+		break;
+	case GL_ALWAYS:
+		hw_func = REF_ALWAYS;
+		break;
+	default:
+		break;
+	}
+
+	return hw_func;
+}
+
+/**
+ * Map a GL stencil operation onto the matching STENCIL_* value.
+ *
+ * GL_INCR / GL_DECR map to the clamping variants and the *_WRAP_EXT
+ * enums to the wrapping ones.  An unrecognised \p op emits a one-time
+ * warning and is treated as STENCIL_KEEP.
+ */
+static int translate_stencil_op(int op)
+{
+	int hw_op;
+
+	switch (op) {
+	case GL_KEEP:
+		hw_op = STENCIL_KEEP;
+		break;
+	case GL_ZERO:
+		hw_op = STENCIL_ZERO;
+		break;
+	case GL_REPLACE:
+		hw_op = STENCIL_REPLACE;
+		break;
+	case GL_INCR:
+		hw_op = STENCIL_INCR_CLAMP;
+		break;
+	case GL_DECR:
+		hw_op = STENCIL_DECR_CLAMP;
+		break;
+	case GL_INCR_WRAP_EXT:
+		hw_op = STENCIL_INCR_WRAP;
+		break;
+	case GL_DECR_WRAP_EXT:
+		hw_op = STENCIL_DECR_WRAP;
+		break;
+	case GL_INVERT:
+		hw_op = STENCIL_INVERT;
+		break;
+	default:
+		WARN_ONCE("Do not know how to translate stencil op");
+		hw_op = STENCIL_KEEP;
+		break;
+	}
+
+	return hw_op;
+}
+
+/**
+ * Enable or disable the stencil test in DB_DEPTH_CONTROL.
+ *
+ * Nothing is touched unless the current draw buffer has a stencil
+ * renderbuffer with a buffer object attached.  Enabling sets both
+ * STENCIL_ENABLE and BACKFACE_ENABLE; disabling clears STENCIL_ENABLE
+ * only.
+ */
+static void r700SetStencilState(GLcontext * ctx, GLboolean state)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	struct radeon_renderbuffer *rrbStencil;
+
+	if (!ctx->DrawBuffer)
+		return;
+
+	rrbStencil = radeon_get_renderbuffer(ctx->DrawBuffer, BUFFER_STENCIL);
+	if (!rrbStencil || !rrbStencil->bo)
+		return;	/* no hardware stencil buffer available */
+
+	R600_STATECHANGE(context, db);
+
+	if (!state) {
+		CLEARbit(r700->DB_DEPTH_CONTROL.u32All, STENCIL_ENABLE_bit);
+		return;
+	}
+
+	SETbit(r700->DB_DEPTH_CONTROL.u32All, STENCIL_ENABLE_bit);
+	SETbit(r700->DB_DEPTH_CONTROL.u32All, BACKFACE_ENABLE_bit);
+}
+
+/**
+ * Program stencil reference value, compare mask and compare function for
+ * both faces.
+ *
+ * The arguments are not used directly: everything is re-read from
+ * ctx->Stencil, index 0 for the front face and ctx->Stencil._BackFace
+ * for the back face.
+ */
+static void r700StencilFuncSeparate(GLcontext * ctx, GLenum face,
+				    GLenum func, GLint ref, GLuint mask)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	const unsigned back = ctx->Stencil._BackFace;
+	int front_func;
+	int back_func;
+
+	(void)face;
+	(void)func;
+	(void)ref;
+	(void)mask;
+
+	R600_STATECHANGE(context, stencil);
+	R600_STATECHANGE(context, db);
+
+	front_func = translate_stencil_func(ctx->Stencil.Function[0]);
+	back_func = translate_stencil_func(ctx->Stencil.Function[back]);
+
+	/* front face: reference value and compare mask */
+	SETfield(r700->DB_STENCILREFMASK.u32All, ctx->Stencil.Ref[0],
+		 STENCILREF_shift, STENCILREF_mask);
+	SETfield(r700->DB_STENCILREFMASK.u32All, ctx->Stencil.ValueMask[0],
+		 STENCILMASK_shift, STENCILMASK_mask);
+
+	/* back face: reference value and compare mask */
+	SETfield(r700->DB_STENCILREFMASK_BF.u32All, ctx->Stencil.Ref[back],
+		 STENCILREF_BF_shift, STENCILREF_BF_mask);
+	SETfield(r700->DB_STENCILREFMASK_BF.u32All, ctx->Stencil.ValueMask[back],
+		 STENCILMASK_BF_shift, STENCILMASK_BF_mask);
+
+	/* compare functions for both faces live in DB_DEPTH_CONTROL */
+	SETfield(r700->DB_DEPTH_CONTROL.u32All, front_func,
+		 STENCILFUNC_shift, STENCILFUNC_mask);
+	SETfield(r700->DB_DEPTH_CONTROL.u32All, back_func,
+		 STENCILFUNC_BF_shift, STENCILFUNC_BF_mask);
+}
+
+/**
+ * Program the stencil write masks for both faces.
+ *
+ * \p face and \p mask are not used directly; the masks come from
+ * ctx->Stencil.WriteMask, index 0 for the front face and
+ * ctx->Stencil._BackFace for the back face.
+ */
+static void r700StencilMaskSeparate(GLcontext * ctx, GLenum face, GLuint mask)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	const unsigned back = ctx->Stencil._BackFace;
+
+	(void)face;
+	(void)mask;
+
+	R600_STATECHANGE(context, stencil);
+
+	/* front face */
+	SETfield(r700->DB_STENCILREFMASK.u32All, ctx->Stencil.WriteMask[0],
+		 STENCILWRITEMASK_shift, STENCILWRITEMASK_mask);
+
+	/* back face */
+	SETfield(r700->DB_STENCILREFMASK_BF.u32All, ctx->Stencil.WriteMask[back],
+		 STENCILWRITEMASK_BF_shift, STENCILWRITEMASK_BF_mask);
+}
+
+/**
+ * Program the stencil fail / z-fail / z-pass operations for both faces
+ * into DB_DEPTH_CONTROL.
+ *
+ * The arguments are not used directly: the operations are re-read from
+ * ctx->Stencil, index 0 for the front face and ctx->Stencil._BackFace
+ * for the back face.
+ */
+static void r700StencilOpSeparate(GLcontext * ctx, GLenum face,
+				  GLenum fail, GLenum zfail, GLenum zpass)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	const unsigned back = ctx->Stencil._BackFace;
+	int front_fail, front_zfail, front_zpass;
+	int back_fail, back_zfail, back_zpass;
+
+	(void)face;
+	(void)fail;
+	(void)zfail;
+	(void)zpass;
+
+	R600_STATECHANGE(context, db);
+
+	/* translate in the same order the fields are written below */
+	front_fail  = translate_stencil_op(ctx->Stencil.FailFunc[0]);
+	front_zfail = translate_stencil_op(ctx->Stencil.ZFailFunc[0]);
+	front_zpass = translate_stencil_op(ctx->Stencil.ZPassFunc[0]);
+	back_fail   = translate_stencil_op(ctx->Stencil.FailFunc[back]);
+	back_zfail  = translate_stencil_op(ctx->Stencil.ZFailFunc[back]);
+	back_zpass  = translate_stencil_op(ctx->Stencil.ZPassFunc[back]);
+
+	/* front face */
+	SETfield(r700->DB_DEPTH_CONTROL.u32All, front_fail,
+		 STENCILFAIL_shift, STENCILFAIL_mask);
+	SETfield(r700->DB_DEPTH_CONTROL.u32All, front_zfail,
+		 STENCILZFAIL_shift, STENCILZFAIL_mask);
+	SETfield(r700->DB_DEPTH_CONTROL.u32All, front_zpass,
+		 STENCILZPASS_shift, STENCILZPASS_mask);
+
+	/* back face */
+	SETfield(r700->DB_DEPTH_CONTROL.u32All, back_fail,
+		 STENCILFAIL_BF_shift, STENCILFAIL_BF_mask);
+	SETfield(r700->DB_DEPTH_CONTROL.u32All, back_zfail,
+		 STENCILZFAIL_BF_shift, STENCILZFAIL_BF_mask);
+	SETfield(r700->DB_DEPTH_CONTROL.u32All, back_zpass,
+		 STENCILZPASS_BF_shift, STENCILZPASS_BF_mask);
+}
+
+/**
+ * Recompute the viewport transform registers for viewport slot \p id.
+ *
+ * Scale and translate come from ctx->Viewport._WindowMap.  X is shifted
+ * by the drawable's x position.  When rendering to the window system
+ * buffer (DrawBuffer->Name == 0) Y is negated and biased by the
+ * drawable's y + h; for a user framebuffer object Y is used as is.  Z
+ * scale and offset are divided by the draw buffer's _DepthMaxF.
+ * The scissor state is refreshed afterwards through r700SetScissor().
+ */
+static void r700UpdateWindow(GLcontext * ctx, int id)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&context->radeon);
+	const GLfloat *v = ctx->Viewport._WindowMap.m;
+	const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+	const GLboolean render_to_fbo = (ctx->DrawBuffer->Name != 0);
+	GLfloat xoffset = 0;
+	GLfloat yoffset = 0;
+	GLfloat y_scale = -1.0;
+	GLfloat y_bias;
+	GLfloat scale_x, scale_y, scale_z;
+	GLfloat trans_x, trans_y, trans_z;
+
+	/* without a drawable both window offsets stay at zero */
+	if (dPriv) {
+		xoffset = (GLfloat) dPriv->x;
+		yoffset = (GLfloat) dPriv->y + dPriv->h;
+	}
+
+	/* window-system buffers get Y flipped; FBOs do not */
+	y_bias = yoffset;
+	if (render_to_fbo) {
+		y_scale = 1.0;
+		y_bias = 0;
+	}
+
+	scale_x = v[MAT_SX];
+	trans_x = v[MAT_TX] + xoffset;
+	scale_y = v[MAT_SY] * y_scale;
+	trans_y = (v[MAT_TY] * y_scale) + y_bias;
+	scale_z = v[MAT_SZ] * depthScale;
+	trans_z = v[MAT_TZ] * depthScale;
+
+	R600_STATECHANGE(context, vpt);
+
+	r700->viewport[id].PA_CL_VPORT_XSCALE.f32All  = scale_x;
+	r700->viewport[id].PA_CL_VPORT_XOFFSET.f32All = trans_x;
+	r700->viewport[id].PA_CL_VPORT_YSCALE.f32All  = scale_y;
+	r700->viewport[id].PA_CL_VPORT_YOFFSET.f32All = trans_y;
+	r700->viewport[id].PA_CL_VPORT_ZSCALE.f32All  = scale_z;
+	r700->viewport[id].PA_CL_VPORT_ZOFFSET.f32All = trans_z;
+
+	r700->viewport[id].enabled = GL_TRUE;
+
+	r700SetScissor(context);
+}
+
+
+/**
+ * Viewport change hook: refresh the transform registers of viewport 0,
+ * then hand the new rectangle to the shared radeon_viewport() helper.
+ */
+static void r700Viewport(GLcontext * ctx,
+			 GLint x, GLint y,
+			 GLsizei width, GLsizei height)
+{
+	r700UpdateWindow(ctx, 0);
+
+	radeon_viewport(ctx, x, y, width, height);
+}
+
+/**
+ * Depth range change hook.  \p nearval and \p farval are not read here;
+ * the transform registers of viewport 0 are recomputed by
+ * r700UpdateWindow().
+ */
+static void r700DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
+{
+	(void)nearval;
+	(void)farval;
+
+	r700UpdateWindow(ctx, 0);
+}
+
+/**
+ * Program the line width.
+ *
+ * The WIDTH field of PA_SU_LINE_CNTL receives half of \p widthf scaled
+ * by 16 (four fractional bits), saturated at 0xFFFF.
+ */
+static void r700LineWidth(GLcontext * ctx, GLfloat widthf)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	uint32_t lineWidth = (uint32_t)((widthf * 0.5) * (1 << 4));
+
+	R600_STATECHANGE(context, su);
+
+	/* saturate to what fits in 16 bits */
+	lineWidth = (lineWidth > 0xFFFF) ? 0xFFFF : lineWidth;
+
+	SETfield(r700->PA_SU_LINE_CNTL.u32All, (uint16_t)lineWidth,
+		 PA_SU_LINE_CNTL__WIDTH_shift, PA_SU_LINE_CNTL__WIDTH_mask);
+}
+
+/**
+ * Program the line stipple pattern and repeat factor into
+ * PA_SC_LINE_STIPPLE.
+ *
+ * \param factor   stipple repeat factor; the register stores factor - 1.
+ * \param pattern  stipple bit pattern.
+ */
+static void r700LineStipple(GLcontext *ctx, GLint factor, GLushort pattern)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+	R600_STATECHANGE(context, sc);
+
+	/* stipple bit pattern */
+	SETfield(r700->PA_SC_LINE_STIPPLE.u32All, pattern,
+		 LINE_PATTERN_shift, LINE_PATTERN_mask);
+	/* repeat count is stored biased by one */
+	SETfield(r700->PA_SC_LINE_STIPPLE.u32All, (factor-1),
+		 REPEAT_COUNT_shift, REPEAT_COUNT_mask);
+	/* auto-reset control is always programmed to 1 */
+	SETfield(r700->PA_SC_LINE_STIPPLE.u32All, 1,
+		 AUTO_RESET_CNTL_shift, AUTO_RESET_CNTL_mask);
+}
+
+/**
+ * Switch polygon offset on or off.
+ *
+ * The FRONT, BACK and PARA offset-enable bits of PA_SU_SC_MODE_CNTL are
+ * all set when \p state is true and all cleared otherwise.
+ */
+static void r700SetPolygonOffsetState(GLcontext * ctx, GLboolean state)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+	R600_STATECHANGE(context, su);
+
+	if (!state) {
+		CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, POLY_OFFSET_FRONT_ENABLE_bit);
+		CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, POLY_OFFSET_BACK_ENABLE_bit);
+		CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, POLY_OFFSET_PARA_ENABLE_bit);
+		return;
+	}
+
+	SETbit(r700->PA_SU_SC_MODE_CNTL.u32All, POLY_OFFSET_FRONT_ENABLE_bit);
+	SETbit(r700->PA_SU_SC_MODE_CNTL.u32All, POLY_OFFSET_BACK_ENABLE_bit);
+	SETbit(r700->PA_SU_SC_MODE_CNTL.u32All, POLY_OFFSET_PARA_ENABLE_bit);
+}
+
+/**
+ * Program the polygon offset scale and constant for both faces.
+ *
+ * \p units is multiplied by 4 for a 16-bit depth visual and by 2 for a
+ * 24-bit one (unchanged for any other depth); \p factor is multiplied by
+ * 12.  Front and back registers receive identical values.
+ */
+static void r700PolygonOffset(GLcontext * ctx, GLfloat factor, GLfloat units)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	GLfloat constant = units;
+
+	/* depth-format dependent scaling of the constant term */
+	if (ctx->Visual.depthBits == 16) {
+		constant *= 4.0;
+	} else if (ctx->Visual.depthBits == 24) {
+		constant *= 2.0;
+	}
+
+	factor *= 12.0;
+
+	R600_STATECHANGE(context, poly);
+
+	/* front face */
+	r700->PA_SU_POLY_OFFSET_FRONT_SCALE.f32All = factor;
+	r700->PA_SU_POLY_OFFSET_FRONT_OFFSET.f32All = constant;
+	/* back face */
+	r700->PA_SU_POLY_OFFSET_BACK_SCALE.f32All = factor;
+	r700->PA_SU_POLY_OFFSET_BACK_OFFSET.f32All = constant;
+}
+
+/**
+ * Recompute the polygon mode fields of PA_SU_SC_MODE_CNTL.
+ *
+ * POLY_MODE is first set to X_DISABLE_POLY_MODE.  When either face uses
+ * a mode other than GL_FILL, POLY_MODE becomes X_DUAL_MODE and the
+ * front/back primitive types are programmed; the GL front and back modes
+ * are swapped when the front face winding is not GL_CCW.
+ */
+static void r700UpdatePolygonMode(GLcontext * ctx)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	GLboolean ccw_is_front;
+	GLenum hw_front, hw_back;
+
+	R600_STATECHANGE(context, su);
+
+	SETfield(r700->PA_SU_SC_MODE_CNTL.u32All, X_DISABLE_POLY_MODE, POLY_MODE_shift, POLY_MODE_mask);
+
+	/* Only do something if a polygon mode is wanted, default is GL_FILL */
+	if (ctx->Polygon.FrontMode == GL_FILL &&
+	    ctx->Polygon.BackMode == GL_FILL)
+		return;
+
+	/* Handle GL_CW (clock wise) and GL_CCW (counter clock wise)
+	 * correctly by selecting the correct front and back face
+	 */
+	ccw_is_front = (ctx->Polygon.FrontFace == GL_CCW);
+	hw_front = ccw_is_front ? ctx->Polygon.FrontMode : ctx->Polygon.BackMode;
+	hw_back  = ccw_is_front ? ctx->Polygon.BackMode : ctx->Polygon.FrontMode;
+
+	/* Enable polygon mode */
+	SETfield(r700->PA_SU_SC_MODE_CNTL.u32All, X_DUAL_MODE, POLY_MODE_shift, POLY_MODE_mask);
+
+	/* primitive type for the hardware front face */
+	switch (hw_front) {
+	case GL_FILL:
+		SETfield(r700->PA_SU_SC_MODE_CNTL.u32All, X_DRAW_TRIANGLES,
+			 POLYMODE_FRONT_PTYPE_shift, POLYMODE_FRONT_PTYPE_mask);
+		break;
+	case GL_LINE:
+		SETfield(r700->PA_SU_SC_MODE_CNTL.u32All, X_DRAW_LINES,
+			 POLYMODE_FRONT_PTYPE_shift, POLYMODE_FRONT_PTYPE_mask);
+		break;
+	case GL_POINT:
+		SETfield(r700->PA_SU_SC_MODE_CNTL.u32All, X_DRAW_POINTS,
+			 POLYMODE_FRONT_PTYPE_shift, POLYMODE_FRONT_PTYPE_mask);
+		break;
+	}
+
+	/* primitive type for the hardware back face */
+	switch (hw_back) {
+	case GL_FILL:
+		SETfield(r700->PA_SU_SC_MODE_CNTL.u32All, X_DRAW_TRIANGLES,
+			 POLYMODE_BACK_PTYPE_shift, POLYMODE_BACK_PTYPE_mask);
+		break;
+	case GL_LINE:
+		SETfield(r700->PA_SU_SC_MODE_CNTL.u32All, X_DRAW_LINES,
+			 POLYMODE_BACK_PTYPE_shift, POLYMODE_BACK_PTYPE_mask);
+		break;
+	case GL_POINT:
+		SETfield(r700->PA_SU_SC_MODE_CNTL.u32All, X_DRAW_POINTS,
+			 POLYMODE_BACK_PTYPE_shift, POLYMODE_BACK_PTYPE_mask);
+		break;
+	}
+}
+
+/**
+ * Polygon mode change hook.  \p face and \p mode are ignored; the state
+ * is recomputed from the context by r700UpdatePolygonMode().
+ */
+static void r700PolygonMode(GLcontext * ctx, GLenum face, GLenum mode)
+{
+	(void)mode;
+	(void)face;
+
+	r700UpdatePolygonMode(ctx);
+}
+
+/**
+ * Render mode change hook.  Intentionally a no-op: nothing is programmed
+ * here.
+ */
+static void r700RenderMode(GLcontext * ctx, GLenum mode)
+{
+	(void)ctx;
+	(void)mode;
+}
+
+/**
+ * Upload one user clip plane into the UCP registers.
+ *
+ * \param plane  GL_CLIP_PLANE0 + n; n selects the r700->ucp[] slot.
+ * \param eq     not used (callers in this file pass NULL); the four
+ *               coefficients are taken from
+ *               ctx->Transform._ClipUserPlane[n] instead.
+ *
+ * NOTE(review): the coefficients are copied as raw 32-bit words by
+ * reading the plane through a GLint pointer.  This presumably relies on
+ * the build not applying strict-aliasing optimizations -- confirm.
+ */
+static void r700ClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	const GLint p = (GLint) plane - (GLint) GL_CLIP_PLANE0;
+	GLint *ip = (GLint *)ctx->Transform._ClipUserPlane[p];
+
+	(void)eq;
+
+	R600_STATECHANGE(context, ucp);
+
+	/* x, y, z, w coefficients, bit patterns copied verbatim */
+	r700->ucp[p].PA_CL_UCP_0_X.u32All = ip[0];
+	r700->ucp[p].PA_CL_UCP_0_Y.u32All = ip[1];
+	r700->ucp[p].PA_CL_UCP_0_Z.u32All = ip[2];
+	r700->ucp[p].PA_CL_UCP_0_W.u32All = ip[3];
+}
+
+/**
+ * Enable or disable one user clip plane.
+ *
+ * The plane index is \p cap - GL_CLIP_PLANE0.  Its enable bit in
+ * PA_CL_CLIP_CNTL (UCP_ENA_0_bit shifted left by the index) and the
+ * matching r700->ucp[] "enabled" flag are updated; on enable the plane
+ * coefficients are uploaded as well.
+ */
+static void r700SetClipPlaneState(GLcontext * ctx, GLenum cap, GLboolean state)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	const GLuint p = cap - GL_CLIP_PLANE0;
+
+	R600_STATECHANGE(context, cl);
+
+	if (!state) {
+		r700->PA_CL_CLIP_CNTL.u32All &= ~(UCP_ENA_0_bit << p);
+		r700->ucp[p].enabled = GL_FALSE;
+		return;
+	}
+
+	r700->PA_CL_CLIP_CNTL.u32All |= (UCP_ENA_0_bit << p);
+	r700->ucp[p].enabled = GL_TRUE;
+	/* the equation argument is unused by r700ClipPlane() */
+	r700ClipPlane(ctx, cap, NULL);
+}
+
+/**
+ * Program every scissor / clip rectangle register with one rectangle.
+ *
+ * Returns without touching anything when there is no colour renderbuffer
+ * or it has no buffer object.  The rectangle is:
+ *  - the radeon scissor rect when scissoring is enabled;
+ *  - otherwise (0,0)-(Width,Height) of the renderbuffer under DRI2;
+ *  - otherwise the drawable's x/y/w/h rectangle.
+ *
+ * The same corners go into the screen, window, generic and viewport-0
+ * scissors and into all four clip rectangles; viewport 0 also gets its
+ * ZMIN/ZMAX words set to 0 and 0x3F800000 and is marked enabled.
+ */
+void r700SetScissor(context_t *context)
+{
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	struct radeon_renderbuffer *rrb = radeon_get_colorbuffer(&context->radeon);
+	const int id = 0;
+	unsigned x1, y1, x2, y2;
+
+	if (!rrb || !rrb->bo)
+		return;
+
+	if (context->radeon.state.scissor.enabled) {
+		/* user scissor */
+		x1 = context->radeon.state.scissor.rect.x1;
+		y1 = context->radeon.state.scissor.rect.y1;
+		x2 = context->radeon.state.scissor.rect.x2;
+		y2 = context->radeon.state.scissor.rect.y2;
+	} else if (context->radeon.radeonScreen->driScreen->dri2.enabled) {
+		/* whole renderbuffer */
+		x1 = 0;
+		y1 = 0;
+		x2 = rrb->base.Width;
+		y2 = rrb->base.Height;
+	} else {
+		/* whole drawable, in the drawable's x/y coordinates */
+		x1 = rrb->dPriv->x;
+		y1 = rrb->dPriv->y;
+		x2 = rrb->dPriv->x + rrb->dPriv->w;
+		y2 = rrb->dPriv->y + rrb->dPriv->h;
+	}
+
+	R600_STATECHANGE(context, scissor);
+
+	/* screen scissor */
+	SETbit(r700->PA_SC_SCREEN_SCISSOR_TL.u32All, WINDOW_OFFSET_DISABLE_bit);
+	SETfield(r700->PA_SC_SCREEN_SCISSOR_TL.u32All, x1,
+		 PA_SC_SCREEN_SCISSOR_TL__TL_X_shift, PA_SC_SCREEN_SCISSOR_TL__TL_X_mask);
+	SETfield(r700->PA_SC_SCREEN_SCISSOR_TL.u32All, y1,
+		 PA_SC_SCREEN_SCISSOR_TL__TL_Y_shift, PA_SC_SCREEN_SCISSOR_TL__TL_Y_mask);
+	SETfield(r700->PA_SC_SCREEN_SCISSOR_BR.u32All, x2,
+		 PA_SC_SCREEN_SCISSOR_BR__BR_X_shift, PA_SC_SCREEN_SCISSOR_BR__BR_X_mask);
+	SETfield(r700->PA_SC_SCREEN_SCISSOR_BR.u32All, y2,
+		 PA_SC_SCREEN_SCISSOR_BR__BR_Y_shift, PA_SC_SCREEN_SCISSOR_BR__BR_Y_mask);
+
+	/* window scissor */
+	SETbit(r700->PA_SC_WINDOW_SCISSOR_TL.u32All, WINDOW_OFFSET_DISABLE_bit);
+	SETfield(r700->PA_SC_WINDOW_SCISSOR_TL.u32All, x1,
+		 PA_SC_WINDOW_SCISSOR_TL__TL_X_shift, PA_SC_WINDOW_SCISSOR_TL__TL_X_mask);
+	SETfield(r700->PA_SC_WINDOW_SCISSOR_TL.u32All, y1,
+		 PA_SC_WINDOW_SCISSOR_TL__TL_Y_shift, PA_SC_WINDOW_SCISSOR_TL__TL_Y_mask);
+	SETfield(r700->PA_SC_WINDOW_SCISSOR_BR.u32All, x2,
+		 PA_SC_WINDOW_SCISSOR_BR__BR_X_shift, PA_SC_WINDOW_SCISSOR_BR__BR_X_mask);
+	SETfield(r700->PA_SC_WINDOW_SCISSOR_BR.u32All, y2,
+		 PA_SC_WINDOW_SCISSOR_BR__BR_Y_shift, PA_SC_WINDOW_SCISSOR_BR__BR_Y_mask);
+
+	/* clip rectangle 0 ... */
+	SETfield(r700->PA_SC_CLIPRECT_0_TL.u32All, x1,
+		 PA_SC_CLIPRECT_0_TL__TL_X_shift, PA_SC_CLIPRECT_0_TL__TL_X_mask);
+	SETfield(r700->PA_SC_CLIPRECT_0_TL.u32All, y1,
+		 PA_SC_CLIPRECT_0_TL__TL_Y_shift, PA_SC_CLIPRECT_0_TL__TL_Y_mask);
+	SETfield(r700->PA_SC_CLIPRECT_0_BR.u32All, x2,
+		 PA_SC_CLIPRECT_0_BR__BR_X_shift, PA_SC_CLIPRECT_0_BR__BR_X_mask);
+	SETfield(r700->PA_SC_CLIPRECT_0_BR.u32All, y2,
+		 PA_SC_CLIPRECT_0_BR__BR_Y_shift, PA_SC_CLIPRECT_0_BR__BR_Y_mask);
+
+	/* ... copied into clip rectangles 1-3 (must follow the writes above) */
+	r700->PA_SC_CLIPRECT_1_TL.u32All = r700->PA_SC_CLIPRECT_0_TL.u32All;
+	r700->PA_SC_CLIPRECT_1_BR.u32All = r700->PA_SC_CLIPRECT_0_BR.u32All;
+	r700->PA_SC_CLIPRECT_2_TL.u32All = r700->PA_SC_CLIPRECT_0_TL.u32All;
+	r700->PA_SC_CLIPRECT_2_BR.u32All = r700->PA_SC_CLIPRECT_0_BR.u32All;
+	r700->PA_SC_CLIPRECT_3_TL.u32All = r700->PA_SC_CLIPRECT_0_TL.u32All;
+	r700->PA_SC_CLIPRECT_3_BR.u32All = r700->PA_SC_CLIPRECT_0_BR.u32All;
+
+	/* more....2d clip */
+	SETbit(r700->PA_SC_GENERIC_SCISSOR_TL.u32All, WINDOW_OFFSET_DISABLE_bit);
+	SETfield(r700->PA_SC_GENERIC_SCISSOR_TL.u32All, x1,
+		 PA_SC_GENERIC_SCISSOR_TL__TL_X_shift, PA_SC_GENERIC_SCISSOR_TL__TL_X_mask);
+	SETfield(r700->PA_SC_GENERIC_SCISSOR_TL.u32All, y1,
+		 PA_SC_GENERIC_SCISSOR_TL__TL_Y_shift, PA_SC_GENERIC_SCISSOR_TL__TL_Y_mask);
+	SETfield(r700->PA_SC_GENERIC_SCISSOR_BR.u32All, x2,
+		 PA_SC_GENERIC_SCISSOR_BR__BR_X_shift, PA_SC_GENERIC_SCISSOR_BR__BR_X_mask);
+	SETfield(r700->PA_SC_GENERIC_SCISSOR_BR.u32All, y2,
+		 PA_SC_GENERIC_SCISSOR_BR__BR_Y_shift, PA_SC_GENERIC_SCISSOR_BR__BR_Y_mask);
+
+	/* viewport 0 scissor */
+	SETbit(r700->viewport[id].PA_SC_VPORT_SCISSOR_0_TL.u32All, WINDOW_OFFSET_DISABLE_bit);
+	SETfield(r700->viewport[id].PA_SC_VPORT_SCISSOR_0_TL.u32All, x1,
+		 PA_SC_VPORT_SCISSOR_0_TL__TL_X_shift, PA_SC_VPORT_SCISSOR_0_TL__TL_X_mask);
+	SETfield(r700->viewport[id].PA_SC_VPORT_SCISSOR_0_TL.u32All, y1,
+		 PA_SC_VPORT_SCISSOR_0_TL__TL_Y_shift, PA_SC_VPORT_SCISSOR_0_TL__TL_Y_mask);
+	SETfield(r700->viewport[id].PA_SC_VPORT_SCISSOR_0_BR.u32All, x2,
+		 PA_SC_VPORT_SCISSOR_0_BR__BR_X_shift, PA_SC_VPORT_SCISSOR_0_BR__BR_X_mask);
+	SETfield(r700->viewport[id].PA_SC_VPORT_SCISSOR_0_BR.u32All, y2,
+		 PA_SC_VPORT_SCISSOR_0_BR__BR_Y_shift, PA_SC_VPORT_SCISSOR_0_BR__BR_Y_mask);
+
+	/* viewport 0 depth bounds words: 0 and 0x3F800000 */
+	r700->viewport[id].PA_SC_VPORT_ZMIN_0.u32All = 0;
+	r700->viewport[id].PA_SC_VPORT_ZMAX_0.u32All = 0x3F800000;
+	r700->viewport[id].enabled = GL_TRUE;
+}
+
+/**
+ * Fill in the SQ (sequencer) configuration registers kept in
+ * r700->sq_config.
+ *
+ * The GPR, thread and stack-entry budgets are chosen per chip family;
+ * families without an explicit case share the RV610/RV620/RS780/RS880
+ * values.  VC_ENABLE is cleared for RV610, RV620, RS780, RS880 and RV710
+ * and set for every other family.
+ *
+ * Each shader stage gets its own priority (PS = 0, VS = 1, GS = 2,
+ * ES = 3).  Previously ps_prio was written into all four priority fields
+ * and the other three values were assigned but never used.
+ */
+static void r700InitSQConfig(GLcontext * ctx)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+    int ps_prio;
+    int vs_prio;
+    int gs_prio;
+    int es_prio;
+    int num_ps_gprs;
+    int num_vs_gprs;
+    int num_gs_gprs;
+    int num_es_gprs;
+    int num_temp_gprs;
+    int num_ps_threads;
+    int num_vs_threads;
+    int num_gs_threads;
+    int num_es_threads;
+    int num_ps_stack_entries;
+    int num_vs_stack_entries;
+    int num_gs_stack_entries;
+    int num_es_stack_entries;
+
+    R600_STATECHANGE(context, sq);
+
+    /* per-stage priorities */
+    ps_prio = 0;
+    vs_prio = 1;
+    gs_prio = 2;
+    es_prio = 3;
+
+    /* per-family resource budgets */
+    switch (context->radeon.radeonScreen->chip_family) {
+    case CHIP_FAMILY_R600:
+	    num_ps_gprs = 192;
+	    num_vs_gprs = 56;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 136;
+	    num_vs_threads = 48;
+	    num_gs_threads = 4;
+	    num_es_threads = 4;
+	    num_ps_stack_entries = 128;
+	    num_vs_stack_entries = 128;
+	    num_gs_stack_entries = 0;
+	    num_es_stack_entries = 0;
+	    break;
+    case CHIP_FAMILY_RV630:
+    case CHIP_FAMILY_RV635:
+	    num_ps_gprs = 84;
+	    num_vs_gprs = 36;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 144;
+	    num_vs_threads = 40;
+	    num_gs_threads = 4;
+	    num_es_threads = 4;
+	    num_ps_stack_entries = 40;
+	    num_vs_stack_entries = 40;
+	    num_gs_stack_entries = 32;
+	    num_es_stack_entries = 16;
+	    break;
+    case CHIP_FAMILY_RV610:
+    case CHIP_FAMILY_RV620:
+    case CHIP_FAMILY_RS780:
+    case CHIP_FAMILY_RS880:
+    default:
+	    /* also the fallback for families not listed here */
+	    num_ps_gprs = 84;
+	    num_vs_gprs = 36;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 136;
+	    num_vs_threads = 48;
+	    num_gs_threads = 4;
+	    num_es_threads = 4;
+	    num_ps_stack_entries = 40;
+	    num_vs_stack_entries = 40;
+	    num_gs_stack_entries = 32;
+	    num_es_stack_entries = 16;
+	    break;
+    case CHIP_FAMILY_RV670:
+	    num_ps_gprs = 144;
+	    num_vs_gprs = 40;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 136;
+	    num_vs_threads = 48;
+	    num_gs_threads = 4;
+	    num_es_threads = 4;
+	    num_ps_stack_entries = 40;
+	    num_vs_stack_entries = 40;
+	    num_gs_stack_entries = 32;
+	    num_es_stack_entries = 16;
+	    break;
+    case CHIP_FAMILY_RV770:
+	    num_ps_gprs = 192;
+	    num_vs_gprs = 56;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 188;
+	    num_vs_threads = 60;
+	    num_gs_threads = 0;
+	    num_es_threads = 0;
+	    num_ps_stack_entries = 256;
+	    num_vs_stack_entries = 256;
+	    num_gs_stack_entries = 0;
+	    num_es_stack_entries = 0;
+	    break;
+    case CHIP_FAMILY_RV730:
+    case CHIP_FAMILY_RV740:
+	    num_ps_gprs = 84;
+	    num_vs_gprs = 36;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 188;
+	    num_vs_threads = 60;
+	    num_gs_threads = 0;
+	    num_es_threads = 0;
+	    num_ps_stack_entries = 128;
+	    num_vs_stack_entries = 128;
+	    num_gs_stack_entries = 0;
+	    num_es_stack_entries = 0;
+	    break;
+    case CHIP_FAMILY_RV710:
+	    num_ps_gprs = 192;
+	    num_vs_gprs = 56;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 144;
+	    num_vs_threads = 48;
+	    num_gs_threads = 0;
+	    num_es_threads = 0;
+	    num_ps_stack_entries = 128;
+	    num_vs_stack_entries = 128;
+	    num_gs_stack_entries = 0;
+	    num_es_stack_entries = 0;
+	    break;
+    }
+
+    /* SQ_CONFIG: VC_ENABLE is left off for the families listed below */
+    r700->sq_config.SQ_CONFIG.u32All = 0;
+    if ((context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV610) ||
+        (context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV620) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RS780) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RS880) ||
+        (context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV710))
+	    CLEARbit(r700->sq_config.SQ_CONFIG.u32All, VC_ENABLE_bit);
+    else
+	    SETbit(r700->sq_config.SQ_CONFIG.u32All, VC_ENABLE_bit);
+    SETbit(r700->sq_config.SQ_CONFIG.u32All, DX9_CONSTS_bit);
+    SETbit(r700->sq_config.SQ_CONFIG.u32All, ALU_INST_PREFER_VECTOR_bit);
+    /* each stage gets its own priority value */
+    SETfield(r700->sq_config.SQ_CONFIG.u32All, ps_prio, PS_PRIO_shift, PS_PRIO_mask);
+    SETfield(r700->sq_config.SQ_CONFIG.u32All, vs_prio, VS_PRIO_shift, VS_PRIO_mask);
+    SETfield(r700->sq_config.SQ_CONFIG.u32All, gs_prio, GS_PRIO_shift, GS_PRIO_mask);
+    SETfield(r700->sq_config.SQ_CONFIG.u32All, es_prio, ES_PRIO_shift, ES_PRIO_mask);
+
+    /* GPR budgets: PS / VS / clause temporaries */
+    r700->sq_config.SQ_GPR_RESOURCE_MGMT_1.u32All = 0;
+    SETfield(r700->sq_config.SQ_GPR_RESOURCE_MGMT_1.u32All, num_ps_gprs, NUM_PS_GPRS_shift, NUM_PS_GPRS_mask);
+    SETfield(r700->sq_config.SQ_GPR_RESOURCE_MGMT_1.u32All, num_vs_gprs, NUM_VS_GPRS_shift, NUM_VS_GPRS_mask);
+    SETfield(r700->sq_config.SQ_GPR_RESOURCE_MGMT_1.u32All, num_temp_gprs,
+	     NUM_CLAUSE_TEMP_GPRS_shift, NUM_CLAUSE_TEMP_GPRS_mask);
+
+    /* GPR budgets: GS / ES */
+    r700->sq_config.SQ_GPR_RESOURCE_MGMT_2.u32All = 0;
+    SETfield(r700->sq_config.SQ_GPR_RESOURCE_MGMT_2.u32All, num_gs_gprs, NUM_GS_GPRS_shift, NUM_GS_GPRS_mask);
+    SETfield(r700->sq_config.SQ_GPR_RESOURCE_MGMT_2.u32All, num_es_gprs, NUM_ES_GPRS_shift, NUM_ES_GPRS_mask);
+
+    /* thread budgets */
+    r700->sq_config.SQ_THREAD_RESOURCE_MGMT.u32All = 0;
+    SETfield(r700->sq_config.SQ_THREAD_RESOURCE_MGMT.u32All, num_ps_threads,
+	     NUM_PS_THREADS_shift, NUM_PS_THREADS_mask);
+    SETfield(r700->sq_config.SQ_THREAD_RESOURCE_MGMT.u32All, num_vs_threads,
+	     NUM_VS_THREADS_shift, NUM_VS_THREADS_mask);
+    SETfield(r700->sq_config.SQ_THREAD_RESOURCE_MGMT.u32All, num_gs_threads,
+	     NUM_GS_THREADS_shift, NUM_GS_THREADS_mask);
+    SETfield(r700->sq_config.SQ_THREAD_RESOURCE_MGMT.u32All, num_es_threads,
+	     NUM_ES_THREADS_shift, NUM_ES_THREADS_mask);
+
+    /* stack-entry budgets: PS / VS */
+    r700->sq_config.SQ_STACK_RESOURCE_MGMT_1.u32All = 0;
+    SETfield(r700->sq_config.SQ_STACK_RESOURCE_MGMT_1.u32All, num_ps_stack_entries,
+	     NUM_PS_STACK_ENTRIES_shift, NUM_PS_STACK_ENTRIES_mask);
+    SETfield(r700->sq_config.SQ_STACK_RESOURCE_MGMT_1.u32All, num_vs_stack_entries,
+	     NUM_VS_STACK_ENTRIES_shift, NUM_VS_STACK_ENTRIES_mask);
+
+    /* stack-entry budgets: GS / ES */
+    r700->sq_config.SQ_STACK_RESOURCE_MGMT_2.u32All = 0;
+    SETfield(r700->sq_config.SQ_STACK_RESOURCE_MGMT_2.u32All, num_gs_stack_entries,
+	     NUM_GS_STACK_ENTRIES_shift, NUM_GS_STACK_ENTRIES_mask);
+    SETfield(r700->sq_config.SQ_STACK_RESOURCE_MGMT_2.u32All, num_es_stack_entries,
+	     NUM_ES_STACK_ENTRIES_shift, NUM_ES_STACK_ENTRIES_mask);
+
+}
+
+/**
+ * Calculate initial hardware state and register state functions.
+ * Assumes that the command buffer and state atoms have been
+ * initialized already.
+ */
+void r700InitState(GLcontext * ctx) //-------------------
+{
+    context_t *context = R700_CONTEXT(ctx);
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+    int id = 0;
+
+    radeon_firevertices(&context->radeon);
+
+    r700->TA_CNTL_AUX.u32All = 0;
+    SETfield(r700->TA_CNTL_AUX.u32All, 28, TD_FIFO_CREDIT_shift, TD_FIFO_CREDIT_mask);
+    r700->VC_ENHANCE.u32All = 0;
+    r700->DB_WATERMARKS.u32All = 0;
+    SETfield(r700->DB_WATERMARKS.u32All, 4, DEPTH_FREE_shift, DEPTH_FREE_mask);
+    SETfield(r700->DB_WATERMARKS.u32All, 16, DEPTH_FLUSH_shift, DEPTH_FLUSH_mask);
+    SETfield(r700->DB_WATERMARKS.u32All, 0, FORCE_SUMMARIZE_shift, FORCE_SUMMARIZE_mask);
+    SETfield(r700->DB_WATERMARKS.u32All, 4, DEPTH_PENDING_FREE_shift, DEPTH_PENDING_FREE_mask);
+    r700->SQ_DYN_GPR_CNTL_PS_FLUSH_REQ.u32All = 0;
+    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770) {
+	    SETfield(r700->TA_CNTL_AUX.u32All, 3, GRADIENT_CREDIT_shift, GRADIENT_CREDIT_mask);
+	    r700->DB_DEBUG.u32All = 0x82000000;
+	    SETfield(r700->DB_WATERMARKS.u32All, 16, DEPTH_CACHELINE_FREE_shift, DEPTH_CACHELINE_FREE_mask);
+    } else {
+	    SETfield(r700->TA_CNTL_AUX.u32All, 2, GRADIENT_CREDIT_shift, GRADIENT_CREDIT_mask);
+	    SETfield(r700->DB_WATERMARKS.u32All, 4, DEPTH_CACHELINE_FREE_shift, DEPTH_CACHELINE_FREE_mask);
+	    SETbit(r700->SQ_DYN_GPR_CNTL_PS_FLUSH_REQ.u32All, VS_PC_LIMIT_ENABLE_bit);
+    }
+
+    /* Turn off vgt reuse */
+    r700->VGT_REUSE_OFF.u32All = 0;
+    SETbit(r700->VGT_REUSE_OFF.u32All, REUSE_OFF_bit);
+
+    /* Specify offsetting and clamp values for vertices */
+    r700->VGT_MAX_VTX_INDX.u32All      = 0xFFFFFF;
+    r700->VGT_MIN_VTX_INDX.u32All      = 0;
+    r700->VGT_INDX_OFFSET.u32All    = 0;
+
+    /* default shader connections. */
+    r700->SPI_VS_OUT_ID_0.u32All  = 0x03020100;
+    r700->SPI_VS_OUT_ID_1.u32All  = 0x07060504;
+    r700->SPI_VS_OUT_ID_2.u32All  = 0x0b0a0908;
+    r700->SPI_VS_OUT_ID_3.u32All  = 0x0f0e0d0c;
+
+    r700->SPI_THREAD_GROUPING.u32All = 0;
+    if (context->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV770)
+	    SETfield(r700->SPI_THREAD_GROUPING.u32All, 1, PS_GROUPING_shift, PS_GROUPING_mask);
+
+    /* 4 clip rectangles */ /* TODO : set these clip rects according to context->currentDraw->numClipRects */
+    r700->PA_SC_CLIPRECT_RULE.u32All = 0;
+    SETfield(r700->PA_SC_CLIPRECT_RULE.u32All, CLIP_RULE_mask, CLIP_RULE_shift, CLIP_RULE_mask);
+
+    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)
+	    r700->PA_SC_EDGERULE.u32All = 0;
+    else
+	    r700->PA_SC_EDGERULE.u32All = 0xAAAAAAAA;
+
+    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770) {
+	    r700->PA_SC_MODE_CNTL.u32All = 0;
+	    SETbit(r700->PA_SC_MODE_CNTL.u32All, WALK_ORDER_ENABLE_bit);
+	    SETbit(r700->PA_SC_MODE_CNTL.u32All, FORCE_EOV_CNTDWN_ENABLE_bit);
+    } else {
+	    r700->PA_SC_MODE_CNTL.u32All = 0x00500000;
+	    SETbit(r700->PA_SC_MODE_CNTL.u32All, FORCE_EOV_REZ_ENABLE_bit);
+	    SETbit(r700->PA_SC_MODE_CNTL.u32All, FORCE_EOV_CNTDWN_ENABLE_bit);
+    }
+
+    /* Do scale XY and Z by 1/W0. */
+    r700->bEnablePerspective = GL_TRUE;
+    CLEARbit(r700->PA_CL_VTE_CNTL.u32All, VTX_XY_FMT_bit);
+    CLEARbit(r700->PA_CL_VTE_CNTL.u32All, VTX_Z_FMT_bit);
+    SETbit(r700->PA_CL_VTE_CNTL.u32All, VTX_W0_FMT_bit);
+
+    /* Enable viewport scaling for all three axis */
+    SETbit(r700->PA_CL_VTE_CNTL.u32All, VPORT_X_SCALE_ENA_bit);
+    SETbit(r700->PA_CL_VTE_CNTL.u32All, VPORT_X_OFFSET_ENA_bit);
+    SETbit(r700->PA_CL_VTE_CNTL.u32All, VPORT_Y_SCALE_ENA_bit);
+    SETbit(r700->PA_CL_VTE_CNTL.u32All, VPORT_Y_OFFSET_ENA_bit);
+    SETbit(r700->PA_CL_VTE_CNTL.u32All, VPORT_Z_SCALE_ENA_bit);
+    SETbit(r700->PA_CL_VTE_CNTL.u32All, VPORT_Z_OFFSET_ENA_bit);
+
+    /* GL uses last vtx for flat shading components */
+    SETbit(r700->PA_SU_SC_MODE_CNTL.u32All, PROVOKING_VTX_LAST_bit);
+
+    /* Set up vertex control */
+    r700->PA_SU_VTX_CNTL.u32All = 0;
+    CLEARfield(r700->PA_SU_VTX_CNTL.u32All, QUANT_MODE_mask);
+    SETbit(r700->PA_SU_VTX_CNTL.u32All, PIX_CENTER_bit);
+    SETfield(r700->PA_SU_VTX_CNTL.u32All, X_ROUND_TO_EVEN,
+             PA_SU_VTX_CNTL__ROUND_MODE_shift, PA_SU_VTX_CNTL__ROUND_MODE_mask);
+
+    /* to 1.0 = no guard band */
+    r700->PA_CL_GB_VERT_CLIP_ADJ.u32All  = 0x3F800000;  /* 1.0 */
+    r700->PA_CL_GB_VERT_DISC_ADJ.u32All  = 0x3F800000;
+    r700->PA_CL_GB_HORZ_CLIP_ADJ.u32All  = 0x3F800000;
+    r700->PA_CL_GB_HORZ_DISC_ADJ.u32All  = 0x3F800000;
+
+    /* Enable all samples for multi-sample anti-aliasing */
+    r700->PA_SC_AA_MASK.u32All = 0xFFFFFFFF;
+    /* Turn off AA */
+    r700->PA_SC_AA_CONFIG.u32All = 0;
+
+    r700->SX_MISC.u32All = 0;
+
+    r700InitSQConfig(ctx);
+
+    r700ColorMask(ctx,
+		  ctx->Color.ColorMask[RCOMP],
+		  ctx->Color.ColorMask[GCOMP],
+		  ctx->Color.ColorMask[BCOMP],
+		  ctx->Color.ColorMask[ACOMP]);
+
+    r700Enable(ctx, GL_DEPTH_TEST, ctx->Depth.Test);
+    r700DepthMask(ctx, ctx->Depth.Mask);
+    r700DepthFunc(ctx, ctx->Depth.Func);
+    SETbit(r700->DB_SHADER_CONTROL.u32All, DUAL_EXPORT_ENABLE_bit);
+
+    r700->DB_DEPTH_CLEAR.u32All     = 0x3F800000;
+
+    r700->DB_RENDER_CONTROL.u32All  = 0;
+    SETbit(r700->DB_RENDER_CONTROL.u32All, STENCIL_COMPRESS_DISABLE_bit);
+    SETbit(r700->DB_RENDER_CONTROL.u32All, DEPTH_COMPRESS_DISABLE_bit);
+    r700->DB_RENDER_OVERRIDE.u32All = 0;
+    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)
+	    SETbit(r700->DB_RENDER_OVERRIDE.u32All, FORCE_SHADER_Z_ORDER_bit);
+    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIZ_ENABLE_shift, FORCE_HIZ_ENABLE_mask);
+    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIS_ENABLE0_shift, FORCE_HIS_ENABLE0_mask);
+    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIS_ENABLE1_shift, FORCE_HIS_ENABLE1_mask);
+
+    r700->DB_ALPHA_TO_MASK.u32All = 0;
+    SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET0_shift, ALPHA_TO_MASK_OFFSET0_mask);
+    SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET1_shift, ALPHA_TO_MASK_OFFSET1_mask);
+    SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET2_shift, ALPHA_TO_MASK_OFFSET2_mask);
+    SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET3_shift, ALPHA_TO_MASK_OFFSET3_mask);
+
+    /* stencil */
+    r700Enable(ctx, GL_STENCIL_TEST, ctx->Stencil._Enabled);
+    r700StencilMaskSeparate(ctx, 0, ctx->Stencil.WriteMask[0]);
+    r700StencilFuncSeparate(ctx, 0, ctx->Stencil.Function[0],
+			    ctx->Stencil.Ref[0], ctx->Stencil.ValueMask[0]);
+    r700StencilOpSeparate(ctx, 0, ctx->Stencil.FailFunc[0],
+			  ctx->Stencil.ZFailFunc[0],
+			  ctx->Stencil.ZPassFunc[0]);
+
+    r700UpdateCulling(ctx);
+
+    r700SetBlendState(ctx);
+    r700SetLogicOpState(ctx);
+
+    r700AlphaFunc(ctx, ctx->Color.AlphaFunc, ctx->Color.AlphaRef);
+    r700Enable(ctx, GL_ALPHA_TEST, ctx->Color.AlphaEnabled);
+
+    r700PointSize(ctx, 1.0);
+
+    CLEARfield(r700->PA_SU_POINT_MINMAX.u32All, MIN_SIZE_mask);
+    SETfield(r700->PA_SU_POINT_MINMAX.u32All, 0x8000, MAX_SIZE_shift, MAX_SIZE_mask);
+
+    r700LineWidth(ctx, 1.0);
+
+    r700->PA_SC_LINE_CNTL.u32All = 0;
+    CLEARbit(r700->PA_SC_LINE_CNTL.u32All, EXPAND_LINE_WIDTH_bit);
+    SETbit(r700->PA_SC_LINE_CNTL.u32All, LAST_PIXEL_bit);
+
+    r700ShadeModel(ctx, ctx->Light.ShadeModel);
+    r700PolygonMode(ctx, GL_FRONT, ctx->Polygon.FrontMode);
+    r700PolygonMode(ctx, GL_BACK, ctx->Polygon.BackMode);
+    r700PolygonOffset(ctx, ctx->Polygon.OffsetFactor,
+		      ctx->Polygon.OffsetUnits);
+    r700Enable(ctx, GL_POLYGON_OFFSET_POINT, ctx->Polygon.OffsetPoint);
+    r700Enable(ctx, GL_POLYGON_OFFSET_LINE, ctx->Polygon.OffsetLine);
+    r700Enable(ctx, GL_POLYGON_OFFSET_FILL, ctx->Polygon.OffsetFill);
+
+    /* CB */
+    r700BlendColor(ctx, ctx->Color.BlendColor);
+
+    r700->CB_CLEAR_RED_R6XX.f32All = 1.0; //r6xx only
+    r700->CB_CLEAR_GREEN_R6XX.f32All = 0.0; //r6xx only
+    r700->CB_CLEAR_BLUE_R6XX.f32All = 1.0; //r6xx only
+    r700->CB_CLEAR_ALPHA_R6XX.f32All = 1.0; //r6xx only
+    r700->CB_FOG_RED_R6XX.u32All = 0; //r6xx only
+    r700->CB_FOG_GREEN_R6XX.u32All = 0; //r6xx only
+    r700->CB_FOG_BLUE_R6XX.u32All = 0; //r6xx only
+
+    /* Disable color compares */
+    SETfield(r700->CB_CLRCMP_CONTROL.u32All, CLRCMP_DRAW_ALWAYS,
+             CLRCMP_FCN_SRC_shift, CLRCMP_FCN_SRC_mask);
+    SETfield(r700->CB_CLRCMP_CONTROL.u32All, CLRCMP_DRAW_ALWAYS,
+             CLRCMP_FCN_DST_shift, CLRCMP_FCN_DST_mask);
+    SETfield(r700->CB_CLRCMP_CONTROL.u32All, CLRCMP_SEL_SRC,
+             CLRCMP_FCN_SEL_shift, CLRCMP_FCN_SEL_mask);
+
+    /* Zero out source */
+    r700->CB_CLRCMP_SRC.u32All = 0x00000000;
+
+    /* Put a compare color in for error checking */
+    r700->CB_CLRCMP_DST.u32All = 0x000000FF;
+
+    /* Set up color compare mask */
+    r700->CB_CLRCMP_MSK.u32All = 0xFFFFFFFF;
+
+    /* screen/window/view */
+    SETfield(r700->CB_TARGET_MASK.u32All, 0xF, (4 * id), TARGET0_ENABLE_mask);
+
+    context->radeon.hw.all_dirty = GL_TRUE;
+
+}
+
+/* Point the driver function table at the r700 state callbacks, plus the
+ * radeon scissor and draw/read buffer handlers. */
+void r700InitStateFuncs(struct dd_function_table *functions)
+{
+	struct dd_function_table *const tbl = functions;
+
+	tbl->UpdateState = r700InvalidateState;
+	tbl->Enable = r700Enable;
+	tbl->RenderMode = r700RenderMode;
+
+	/* Alpha test, blending, colour mask, logic op, fog */
+	tbl->AlphaFunc = r700AlphaFunc;
+	tbl->BlendColor = r700BlendColor;
+	tbl->BlendEquationSeparate = r700BlendEquationSeparate;
+	tbl->BlendFuncSeparate = r700BlendFuncSeparate;
+	tbl->ColorMask = r700ColorMask;
+	tbl->LogicOpcode = r700LogicOpcode;
+	tbl->Fogfv = r700Fogfv;
+
+	/* Depth and stencil */
+	tbl->DepthFunc = r700DepthFunc;
+	tbl->DepthMask = r700DepthMask;
+	tbl->DepthRange = r700DepthRange;
+	tbl->StencilFuncSeparate = r700StencilFuncSeparate;
+	tbl->StencilMaskSeparate = r700StencilMaskSeparate;
+	tbl->StencilOpSeparate = r700StencilOpSeparate;
+
+	/* Points, lines, polygons and clipping */
+	tbl->CullFace = r700CullFace;
+	tbl->FrontFace = r700FrontFace;
+	tbl->ShadeModel = r700ShadeModel;
+	tbl->PolygonMode = r700PolygonMode;
+	tbl->PolygonOffset = r700PolygonOffset;
+	tbl->PointSize = r700PointSize;
+	tbl->PointParameterfv = r700PointParameter;	/* ARB_point_parameters */
+	tbl->LineWidth = r700LineWidth;
+	tbl->LineStipple = r700LineStipple;
+	tbl->ClipPlane = r700ClipPlane;
+
+	/* Viewport, scissor and buffer selection */
+	tbl->Viewport = r700Viewport;
+	tbl->Scissor = radeonScissor;
+	tbl->DrawBuffer = radeonDrawBuffer;
+	tbl->ReadBuffer = radeonReadBuffer;
+}
+
diff --git a/src/mesa/drivers/dri/r600/r700_state.h b/src/mesa/drivers/dri/r600/r700_state.h
new file mode 100644
index 0000000000..0f53d5b4c5
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_state.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+#ifndef _R700_STATE_H
+#define _R700_STATE_H
+
+#include "main/mtypes.h"
+
+#include "r600_context.h"
+
+#include "r700_chip.h"
+
+extern void r700UpdateStateParameters(GLcontext * ctx, GLuint new_state);
+extern void r700UpdateShaders (GLcontext * ctx);
+
+extern void r700UpdateViewportOffset(GLcontext * ctx);
+/* State initialisation; r700InitStateFuncs fills the driver function table. */
+extern void r700InitState (GLcontext * ctx);
+extern void r700InitStateFuncs (struct dd_function_table *functions);
+
+extern void r700SetScissor(context_t *context);
+
+#endif	/* _R700_STATE_H */
diff --git a/src/mesa/drivers/dri/r600/r700_vertprog.c b/src/mesa/drivers/dri/r600/r700_vertprog.c
new file mode 100644
index 0000000000..8c2b0071df
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_vertprog.c
@@ -0,0 +1,474 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "main/imports.h"
+#include "main/mtypes.h"
+
+#include "tnl/t_context.h"
+#include "shader/program.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_statevars.h"
+
+#include "radeon_debug.h"
+#include "r600_context.h"
+#include "r600_cmdbuf.h"
+
+#include "r700_debug.h"
+#include "r700_vertprog.h"
+
+/*
+ * Hand out an export slot to every vertex result the program writes.
+ *
+ *   pAsm     assembler state; ucVP_OutputMap[] receives the slot numbers
+ *   mesa_vp  program whose Base.OutputsWritten mask is examined
+ *   unStart  number given to the first slot handed out
+ *
+ * Slots are numbered consecutively from unStart.  Map entries of results
+ * that are not written are left untouched.
+ *
+ * Returns the number of slots handed out.
+ */
+unsigned int Map_Vertex_Output(r700_AssemblerBase       *pAsm,
+			       struct gl_vertex_program *mesa_vp,
+			       unsigned int unStart)
+{
+	/*
+	 * Results outside the texcoord range, listed in the order their
+	 * slots are handed out.
+	 *
+	 * !!!!!!! THE ORDER MATCH FS INPUT
+	 */
+	static const unsigned int aunFixedResults[] = {
+		VERT_RESULT_HPOS,
+		VERT_RESULT_COL0,
+		VERT_RESULT_COL1,
+		/* TODO : dealing back face. */
+		VERT_RESULT_BFC0,
+		VERT_RESULT_BFC1,
+		/* TODO : dealing fog. */
+		VERT_RESULT_FOGC,
+		/* TODO : dealing point size. */
+		VERT_RESULT_PSIZ
+	};
+	const unsigned int unNumFixed =
+		sizeof(aunFixedResults) / sizeof(aunFixedResults[0]);
+	unsigned int unNext = unStart;
+	unsigned int unResult;
+	unsigned int unBit;
+	unsigned int k;
+
+	for (k = 0; k < unNumFixed; k++)
+	{
+		unResult = aunFixedResults[k];
+		unBit    = 1 << unResult;
+
+		if (mesa_vp->Base.OutputsWritten & unBit)
+		{
+			pAsm->ucVP_OutputMap[unResult] = unNext++;
+		}
+	}
+
+	/* Texture coordinate sets 0..7 come after the fixed results. */
+	for (k = 0; k < 8; k++)
+	{
+		unResult = VERT_RESULT_TEX0 + k;
+		unBit    = 1 << unResult;
+
+		if (mesa_vp->Base.OutputsWritten & unBit)
+		{
+			pAsm->ucVP_OutputMap[unResult] = unNext++;
+		}
+	}
+
+	return unNext - unStart;
+}
+
+/* Give each attribute the program reads a consecutive slot, the first being
+ * unStart, and store it in pAsm->ucVP_AttributeMap.  Returns the slot count. */
+unsigned int Map_Vertex_Input(r700_AssemblerBase       *pAsm,
+			      struct gl_vertex_program *mesa_vp,
+			      unsigned int unStart)
+{
+	unsigned int unNext = unStart;
+	unsigned int unMask;
+	int          attr;
+	for (attr = 0; attr < VERT_ATTRIB_MAX; attr++) {
+		unMask = 1 << attr;
+		if (!(mesa_vp->Base.InputsRead & unMask))
+			continue;	/* attribute not read: no slot */
+		pAsm->ucVP_AttributeMap[attr] = unNext++;
+	}
+	return unNext - unStart;
+}
+
+/*
+ * Emit a vertex fetch instruction for each attribute the program reads.
+ * Always returns GL_TRUE.  NOTE(review): any result of the call to
+ * assemble_vfetch_instruction() is ignored here - confirm intent.
+ */
+GLboolean Process_Vertex_Program_Vfetch_Instructions(
+						struct r700_vertex_program *vp,
+						struct gl_vertex_program   *mesa_vp)
+{
+	r700_AssemblerBase *pAsm = &vp->r700AsmCode;
+	VTX_FETCH_METHOD    fetchMethod;
+	unsigned int        unMask;
+	int                 attr;
+
+	fetchMethod.mega_fetch_remainder = 0;
+	fetchMethod.bEnableMini          = GL_FALSE;
+	for (attr = 0; attr < VERT_ATTRIB_MAX; attr++) {
+		unMask = 1 << attr;
+		if (!(mesa_vp->Base.InputsRead & unMask))
+			continue;
+		assemble_vfetch_instruction(pAsm, attr, pAsm->ucVP_AttributeMap[attr],
+					    vp->aos_desc[attr].size,
+					    vp->aos_desc[attr].type, &fetchMethod);
+	}
+	return GL_TRUE;
+}
+
+/*
+ * Lay out the GPRs of a vertex program and record the layout in
+ * vp->r700AsmCode: R0 (index into the vertex buffer), then the vfetch
+ * targets for the inputs, then the exports, then the temporaries;
+ * uFirstHelpReg is the first register after those.  Also emits the VFETCH
+ * instructions and allocates pucOutMask, one zeroed byte per export.
+ * If VFETCH generation reports an error, it is logged and the function
+ * returns with the export and temporary fields not yet written.
+ */
+void Map_Vertex_Program(struct r700_vertex_program *vp,
+						struct gl_vertex_program   *mesa_vp)
+{
+	r700_AssemblerBase *pAsm = &vp->r700AsmCode;
+	unsigned int        unNumInputs;
+	GLuint              k;
+
+	/* R0 is taken, so the vfetch targets start at register 1. */
+	pAsm->number_used_registers           = 1;
+	pAsm->starting_vfetch_register_number = pAsm->number_used_registers;
+
+	unNumInputs = Map_Vertex_Input(pAsm, mesa_vp, pAsm->number_used_registers);
+	pAsm->number_used_registers += unNumInputs;
+
+	if (Process_Vertex_Program_Vfetch_Instructions(vp, mesa_vp) != GL_TRUE) {
+		radeon_error("Calling Process_Vertex_Program_Vfetch_Instructions return error. \n");
+		return;
+	}
+
+	/* Exports come straight after the inputs. */
+	pAsm->starting_export_register_number = pAsm->number_used_registers;
+	pAsm->number_of_exports =
+		Map_Vertex_Output(pAsm, mesa_vp, pAsm->number_used_registers);
+	pAsm->number_used_registers += pAsm->number_of_exports;
+
+	pAsm->pucOutMask = (unsigned char*) MALLOC(pAsm->number_of_exports);
+	for (k = 0; k < pAsm->number_of_exports; k++)
+		pAsm->pucOutMask[k] = 0x0;
+
+	/* Temporaries take whichever of the two counts is larger. */
+	pAsm->starting_temp_register_number = pAsm->number_used_registers;
+	if (mesa_vp->Base.NumNativeTemporaries < mesa_vp->Base.NumTemporaries) {
+		/* fix func t_vp uses NumTemporaries */
+		pAsm->number_used_registers += mesa_vp->Base.NumTemporaries;
+	} else {
+		/* arb uses NumNativeTemporaries */
+		pAsm->number_used_registers += mesa_vp->Base.NumNativeTemporaries;
+	}
+
+	pAsm->uFirstHelpReg = pAsm->number_used_registers;
+}
+
+/*
+ * For every instruction, record which instruction last wrote each
+ * PROGRAM_TEMPORARY source (-1 if none) in vp->r700AsmCode.pInstDeps.  The
+ * destination is recorded before the sources are looked up, so reading and
+ * writing one temporary in the same instruction yields a self-dependency.
+ * Returns GL_TRUE, or GL_FALSE when an allocation fails.
+ */
+GLboolean Find_Instruction_Dependencies_vp(struct r700_vertex_program *vp,
+					                	struct gl_vertex_program   *mesa_vp)
+{
+    const GLuint numTemps = mesa_vp->Base.NumTemporaries;
+    const GLuint numInsts = mesa_vp->Base.NumInstructions;
+    struct prog_instruction *pILInst;
+    InstDeps *pInstDeps;
+    GLint    *puiTEMPwrites;    /* last writer of each temporary, or -1 */
+    GLuint    i, j;
+
+    puiTEMPwrites = (GLint*) MALLOC(sizeof(GLint) * numTemps);
+    pInstDeps     = (InstDeps*) MALLOC(sizeof(InstDeps) * numInsts);
+    /* A zero-sized request may legitimately yield NULL; only fail otherwise. */
+    if ((numTemps && !puiTEMPwrites) || (numInsts && !pInstDeps)) {
+        if (puiTEMPwrites) FREE(puiTEMPwrites);
+        if (pInstDeps)     FREE(pInstDeps);
+        return GL_FALSE;
+    }
+
+    for (i = 0; i < numTemps; i++)
+        puiTEMPwrites[i] = -1;
+
+    for (i = 0; i < numInsts; i++) {
+        pILInst = &(mesa_vp->Base.Instructions[i]);
+        pInstDeps[i].nDstDep = -1;
+
+        /* Destination first: it becomes the temporary's last writer. */
+        if (pILInst->DstReg.File == PROGRAM_TEMPORARY)
+            puiTEMPwrites[pILInst->DstReg.Index] = i;
+
+        for (j = 0; j < 3; j++) {
+            if (pILInst->SrcReg[j].File == PROGRAM_TEMPORARY)
+                pInstDeps[i].nSrcDeps[j] = puiTEMPwrites[pILInst->SrcReg[j].Index];
+            else
+                pInstDeps[i].nSrcDeps[j] = -1;
+        }
+    }
+
+    vp->r700AsmCode.pInstDeps = pInstDeps;
+    FREE(puiTEMPwrites);
+    return GL_TRUE;
+}
+
+/*
+ * Build the hardware vertex program for mesa_vp: clone it, capture from
+ * tnl->vb the size of every attribute it reads, then map registers,
+ * assemble and set up the exports.  Returns NULL if an allocation or a
+ * translation step fails.
+ * NOTE(review): failures after the clone return without releasing vp or
+ * the clone - confirm the right teardown and add it.
+ */
+struct r700_vertex_program* r700TranslateVertexShader(GLcontext *ctx,
+						struct gl_vertex_program *mesa_vp)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	TNLcontext *tnl = TNL_CONTEXT(ctx);
+	struct vertex_buffer *vb = &tnl->vb;
+	struct r700_vertex_program *vp;
+	unsigned int unBit;
+	unsigned int i;
+
+	vp = _mesa_calloc(sizeof(*vp));
+	if (!vp)
+		return NULL;
+	vp->mesa_program = (struct gl_vertex_program *)_mesa_clone_program(ctx, &mesa_vp->Base);
+	if (!vp->mesa_program) {
+		_mesa_free(vp);
+		return NULL;
+	}
+
+	for (i = 0; i < VERT_ATTRIB_MAX; i++) {
+		unBit = 1 << i;
+		if (!(vp->mesa_program->Base.InputsRead & unBit))
+			continue;
+		/* when emit array, data is packed, so the stride is derived
+		 * from the size rather than vb->AttribPtr[i]->stride. */
+		vp->aos_desc[i].size   = vb->AttribPtr[i]->size;
+		vp->aos_desc[i].stride = vb->AttribPtr[i]->size * sizeof(GLfloat);
+		vp->aos_desc[i].type   = GL_FLOAT;
+	}
+
+	if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)
+		vp->r700AsmCode.bR6xx = 1;
+	Init_r700_AssemblerBase(SPT_VP, &(vp->r700AsmCode), &(vp->r700Shader));
+	Map_Vertex_Program(vp, mesa_vp);
+
+	if (GL_FALSE == Find_Instruction_Dependencies_vp(vp, mesa_vp))
+		return NULL;
+	if (GL_FALSE == AssembleInstr(mesa_vp->Base.NumInstructions,
+				      &(mesa_vp->Base.Instructions[0]),
+				      &(vp->r700AsmCode)))
+		return NULL;
+	if (GL_FALSE == Process_Vertex_Exports(&(vp->r700AsmCode), mesa_vp->Base.OutputsWritten))
+		return NULL;
+
+	/* nRegs is number_used_registers - 1 (0 when none are used). */
+	vp->r700Shader.nRegs = (vp->r700AsmCode.number_used_registers == 0) ? 0
+			     : (vp->r700AsmCode.number_used_registers - 1);
+	vp->r700Shader.nParamExports = vp->r700AsmCode.number_of_exports;
+	vp->translated = GL_TRUE;
+	return vp;
+}
+
+/*
+ * Make context->selected_vp the translation of the current vertex program
+ * that fits the current vertex buffer.
+ *
+ * A cached translation fits when, for every attribute the program reads,
+ * the size it was built for equals vb->AttribPtr[i]->size.  If none fits,
+ * the program is translated again and the result is put at the head of the
+ * cache.  When that translation fails an error is logged and selected_vp is
+ * left unchanged.
+ */
+void r700SelectVertexShader(GLcontext *ctx)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    struct r700_vertex_program_cont *vpc;
+    struct r700_vertex_program *vp;
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+    struct vertex_buffer *vb = &tnl->vb;
+    unsigned int unBit;
+    unsigned int i;
+    GLboolean match;
+
+    vpc = (struct r700_vertex_program_cont *)ctx->VertexProgram._Current;
+
+    for (vp = vpc->progs; vp; vp = vp->next) {
+        match = GL_TRUE;
+        for (i = 0; i < VERT_ATTRIB_MAX; i++) {
+            unBit = 1 << i;
+            if (!(vpc->mesa_program.Base.InputsRead & unBit))
+                continue;
+            /* Stop only on a mismatch; a matching attribute must not end
+             * the scan of the remaining attributes. */
+            if (vp->aos_desc[i].size != vb->AttribPtr[i]->size) {
+                match = GL_FALSE;
+                break;
+            }
+        }
+        if (match) {
+            context->selected_vp = vp;
+            return;
+        }
+    }
+
+    vp = r700TranslateVertexShader(ctx, &(vpc->mesa_program) );
+    if (!vp) {
+        radeon_error("Failed to translate vertex shader. \n");
+        return;
+    }
+    vp->next = vpc->progs;
+    vpc->progs = vp;
+    context->selected_vp = vp;
+}
+
+/* Return the shader buffer object (vp->shaderbo) of the vertex program in
+ * context->selected_vp.
+ * Returns NULL when no vertex program is selected. */
+void * r700GetActiveVpShaderBo(GLcontext * ctx)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    struct r700_vertex_program *selected = context->selected_vp;
+
+    return selected ? selected->shaderbo : NULL;
+}
+
+/*
+ * Program the hardware state for the selected vertex program: emit the
+ * shader on first use, set SQ_PGM_RESOURCES_VS and the SPI export and
+ * interpolant counts, and copy the program parameters into vs.consts.
+ * Returns GL_FALSE when no vertex program is selected or the program has
+ * more than R700_MAX_DX9_CONSTS parameters; GL_TRUE otherwise.
+ */
+GLboolean r700SetupVertexProgram(GLcontext * ctx)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+    struct r700_vertex_program *vp = context->selected_vp;
+    struct gl_program_parameter_list *paramList;
+    unsigned int ui;
+
+    /* r700SelectVertexShader() leaves selected_vp untouched when
+     * translation fails, so it may still be NULL here. */
+    if (!vp)
+        return GL_FALSE;
+
+    if(GL_FALSE == vp->loaded) {
+        if(vp->r700Shader.bNeedsAssembly == GL_TRUE)
+            Assemble( &(vp->r700Shader) );
+
+        /* Load vp to gpu */
+        r600EmitShader(ctx,
+                       &(vp->shaderbo),
+                       (GLvoid *)(vp->r700Shader.pProgram),
+                       vp->r700Shader.uShaderBinaryDWORDSize,
+                       "VS");
+
+        vp->loaded = GL_TRUE;
+    }
+
+    DumpHwBinary(DUMP_VERTEX_SHADER, (GLvoid *)(vp->r700Shader.pProgram),
+                 vp->r700Shader.uShaderBinaryDWORDSize);
+
+    /* TODO : enable (context->chipobj.MemUse)(context, vp->shadercode.buf->id); after MemUse fixed */
+
+    R600_STATECHANGE(context, vs);
+    R600_STATECHANGE(context, fs); /* hack */
+
+    r700->vs.SQ_PGM_RESOURCES_VS.u32All = 0;
+    SETbit(r700->vs.SQ_PGM_RESOURCES_VS.u32All, PGM_RESOURCES__PRIME_CACHE_ON_DRAW_bit);
+
+    r700->vs.SQ_PGM_START_VS.u32All = 0; /* set from buffer object. */
+
+    SETfield(r700->vs.SQ_PGM_RESOURCES_VS.u32All, vp->r700Shader.nRegs + 1,
+             NUM_GPRS_shift, NUM_GPRS_mask);
+
+    if(vp->r700Shader.uStackSize) { /* we don't use branch for now, it should be zero. */
+        SETfield(r700->vs.SQ_PGM_RESOURCES_VS.u32All, vp->r700Shader.uStackSize,
+                 STACK_SIZE_shift, STACK_SIZE_mask);
+    }
+
+    R600_STATECHANGE(context, spi);
+
+    SETfield(r700->SPI_VS_OUT_CONFIG.u32All,
+             vp->r700Shader.nParamExports ? (vp->r700Shader.nParamExports - 1) : 0,
+             VS_EXPORT_COUNT_shift, VS_EXPORT_COUNT_mask);
+    SETfield(r700->SPI_PS_IN_CONTROL_0.u32All, vp->r700Shader.nParamExports,
+             NUM_INTERP_shift, NUM_INTERP_mask);
+
+    /*
+    SETbit(r700->SPI_PS_IN_CONTROL_0.u32All, PERSP_GRADIENT_ENA_bit);
+    CLEARbit(r700->SPI_PS_IN_CONTROL_0.u32All, LINEAR_GRADIENT_ENA_bit);
+    */
+
+    /* send out shader constants. */
+    paramList = vp->mesa_program->Base.Parameters;
+
+    if(NULL != paramList) {
+        _mesa_load_state_parameters(ctx, paramList);
+        if (paramList->NumParameters > R700_MAX_DX9_CONSTS)
+            return GL_FALSE;
+
+        R600_STATECHANGE(context, vs_consts);
+
+        r700->vs.num_consts = paramList->NumParameters;
+        for(ui=0; ui<paramList->NumParameters; ui++) {
+            r700->vs.consts[ui][0].f32All = paramList->ParameterValues[ui][0];
+            r700->vs.consts[ui][1].f32All = paramList->ParameterValues[ui][1];
+            r700->vs.consts[ui][2].f32All = paramList->ParameterValues[ui][2];
+            r700->vs.consts[ui][3].f32All = paramList->ParameterValues[ui][3];
+        }
+    } else
+        r700->vs.num_consts = 0;
+
+    return GL_TRUE;
+}
diff --git a/src/mesa/drivers/dri/r600/r700_vertprog.h b/src/mesa/drivers/dri/r600/r700_vertprog.h
new file mode 100644
index 0000000000..c48764c43b
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_vertprog.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+
+#ifndef _R700_VERTPROG_H_
+#define _R700_VERTPROG_H_
+
+#include "main/glheader.h"
+#include "main/mtypes.h" 
+
+#include "r700_shader.h"
+#include "r700_assembler.h"
+
+typedef struct ArrayDesc //TEMP
+{
+	GLint size;   //number of data elements
+	GLenum type;  //data element type
+	GLsizei stride; //packed stride in bytes, see r700TranslateVertexShader
+} ArrayDesc;
+/* One translation of a vertex program for one set of attribute sizes. */
+struct r700_vertex_program 
+{
+    struct gl_vertex_program *mesa_program; /* clone made by r700TranslateVertexShader; NOTE(review): former "Must be first" remark looks stale for a pointer member */
+
+    struct r700_vertex_program *next; /* next translation in r700_vertex_program_cont.progs */
+
+    r700_AssemblerBase r700AsmCode;
+    R700_Shader        r700Shader;
+
+    GLboolean translated; /* set when r700TranslateVertexShader succeeds */
+    GLboolean loaded;     /* set once r700SetupVertexProgram has emitted the shader */
+    GLboolean needUpdateVF;
+	
+    void * shaderbo; /* passed to r600EmitShader by r700SetupVertexProgram */
+
+    ArrayDesc              aos_desc[VERT_ATTRIB_MAX]; /* per attribute; sizes compared by r700SelectVertexShader */
+};
+/* ctx->VertexProgram._Current is cast to this struct, so mesa_program must stay first. */
+struct r700_vertex_program_cont
+{
+    struct gl_vertex_program mesa_program; /* Must be first */
+
+    struct r700_vertex_program *progs; /* list of translations; new ones are inserted at the head */
+};
+
+/* Internal helpers of r700TranslateVertexShader (declared here, not static) */
+unsigned int Map_Vertex_Output(r700_AssemblerBase       *pAsm, 
+			       struct gl_vertex_program *mesa_vp,
+			       unsigned int unStart);
+unsigned int Map_Vertex_Input(r700_AssemblerBase       *pAsm, 
+			      struct gl_vertex_program *mesa_vp,
+			      unsigned int unStart);
+GLboolean Process_Vertex_Program_Vfetch_Instructions(
+	struct r700_vertex_program *vp,
+	struct gl_vertex_program   *mesa_vp);
+void Map_Vertex_Program(struct r700_vertex_program *vp,
+			struct gl_vertex_program   *mesa_vp);
+GLboolean Find_Instruction_Dependencies_vp(struct r700_vertex_program *vp,
+					   struct gl_vertex_program   *mesa_vp);
+
+struct r700_vertex_program* r700TranslateVertexShader(GLcontext *ctx,
+				    struct gl_vertex_program   *mesa_vp);
+
+/* Interface */
+extern void r700SelectVertexShader(GLcontext *ctx);
+
+extern GLboolean r700SetupVertexProgram(GLcontext * ctx);
+
+extern void *    r700GetActiveVpShaderBo(GLcontext * ctx);
+
+#endif /* _R700_VERTPROG_H_ */
diff --git a/src/mesa/drivers/dri/r600/radeon_bo_legacy.c b/src/mesa/drivers/dri/r600/radeon_bo_legacy.c
new file mode 120000
index 0000000000..79ad050e6b
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_bo_legacy.c
@@ -0,0 +1 @@
+../radeon/radeon_bo_legacy.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_bo_legacy.h b/src/mesa/drivers/dri/r600/radeon_bo_legacy.h
new file mode 120000
index 0000000000..83b0f7ffab
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_bo_legacy.h
@@ -0,0 +1 @@
+../radeon/radeon_bo_legacy.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_bocs_wrapper.h b/src/mesa/drivers/dri/r600/radeon_bocs_wrapper.h
new file mode 120000
index 0000000000..ca894b2443
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_bocs_wrapper.h
@@ -0,0 +1 @@
+../radeon/radeon_bocs_wrapper.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_chipset.h b/src/mesa/drivers/dri/r600/radeon_chipset.h
new file mode 120000
index 0000000000..eba99001ff
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_chipset.h
@@ -0,0 +1 @@
+../radeon/radeon_chipset.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_cmdbuf.h b/src/mesa/drivers/dri/r600/radeon_cmdbuf.h
new file mode 120000
index 0000000000..a799e1dc6d
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_cmdbuf.h
@@ -0,0 +1 @@
+../radeon/radeon_cmdbuf.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_common.c b/src/mesa/drivers/dri/r600/radeon_common.c
new file mode 120000
index 0000000000..67b19ba940
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_common.c
@@ -0,0 +1 @@
+../radeon/radeon_common.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_common.h b/src/mesa/drivers/dri/r600/radeon_common.h
new file mode 120000
index 0000000000..5bcb696a9f
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_common.h
@@ -0,0 +1 @@
+../radeon/radeon_common.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_common_context.c b/src/mesa/drivers/dri/r600/radeon_common_context.c
new file mode 120000
index 0000000000..86800f3819
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_common_context.c
@@ -0,0 +1 @@
+../radeon/radeon_common_context.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_common_context.h b/src/mesa/drivers/dri/r600/radeon_common_context.h
new file mode 120000
index 0000000000..4d66312550
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_common_context.h
@@ -0,0 +1 @@
+../radeon/radeon_common_context.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_cs_legacy.c b/src/mesa/drivers/dri/r600/radeon_cs_legacy.c
new file mode 120000
index 0000000000..006720f8a4
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_cs_legacy.c
@@ -0,0 +1 @@
+../radeon/radeon_cs_legacy.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_cs_legacy.h b/src/mesa/drivers/dri/r600/radeon_cs_legacy.h
new file mode 120000
index 0000000000..a5f95e0a3d
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_cs_legacy.h
@@ -0,0 +1 @@
+../radeon/radeon_cs_legacy.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_cs_space_drm.c b/src/mesa/drivers/dri/r600/radeon_cs_space_drm.c
new file mode 120000
index 0000000000..c248ea7d1a
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_cs_space_drm.c
@@ -0,0 +1 @@
+../radeon/radeon_cs_space_drm.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_debug.c b/src/mesa/drivers/dri/r600/radeon_debug.c
new file mode 120000
index 0000000000..c98c2e074c
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_debug.c
@@ -0,0 +1 @@
+../radeon/radeon_debug.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_debug.h b/src/mesa/drivers/dri/r600/radeon_debug.h
new file mode 120000
index 0000000000..bd8aa28e89
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_debug.h
@@ -0,0 +1 @@
+../radeon/radeon_debug.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_dma.c b/src/mesa/drivers/dri/r600/radeon_dma.c
new file mode 120000
index 0000000000..43be000625
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_dma.c
@@ -0,0 +1 @@
+../radeon/radeon_dma.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_dma.h b/src/mesa/drivers/dri/r600/radeon_dma.h
new file mode 120000
index 0000000000..82e50634e3
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_dma.h
@@ -0,0 +1 @@
+../radeon/radeon_dma.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_fbo.c b/src/mesa/drivers/dri/r600/radeon_fbo.c
new file mode 120000
index 0000000000..0d738d8d78
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_fbo.c
@@ -0,0 +1 @@
+../radeon/radeon_fbo.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_lock.c b/src/mesa/drivers/dri/r600/radeon_lock.c
new file mode 120000
index 0000000000..af4108a8e3
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_lock.c
@@ -0,0 +1 @@
+../radeon/radeon_lock.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_lock.h b/src/mesa/drivers/dri/r600/radeon_lock.h
new file mode 120000
index 0000000000..64bdf94ee7
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_lock.h
@@ -0,0 +1 @@
+../radeon/radeon_lock.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_mipmap_tree.c b/src/mesa/drivers/dri/r600/radeon_mipmap_tree.c
new file mode 120000
index 0000000000..31c0cfbe94
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_mipmap_tree.c
@@ -0,0 +1 @@
+../radeon/radeon_mipmap_tree.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_mipmap_tree.h b/src/mesa/drivers/dri/r600/radeon_mipmap_tree.h
new file mode 120000
index 0000000000..254d50cf8c
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_mipmap_tree.h
@@ -0,0 +1 @@
+../radeon/radeon_mipmap_tree.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_queryobj.c b/src/mesa/drivers/dri/r600/radeon_queryobj.c
new file mode 120000
index 0000000000..1d6ebc1c48
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_queryobj.c
@@ -0,0 +1 @@
+../radeon/radeon_queryobj.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_queryobj.h b/src/mesa/drivers/dri/r600/radeon_queryobj.h
new file mode 120000
index 0000000000..8f6f842b0a
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_queryobj.h
@@ -0,0 +1 @@
+../radeon/radeon_queryobj.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_screen.c b/src/mesa/drivers/dri/r600/radeon_screen.c
new file mode 120000
index 0000000000..86161118dd
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_screen.c
@@ -0,0 +1 @@
+../radeon/radeon_screen.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_screen.h b/src/mesa/drivers/dri/r600/radeon_screen.h
new file mode 120000
index 0000000000..23bb6bd459
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_screen.h
@@ -0,0 +1 @@
+../radeon/radeon_screen.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_span.c b/src/mesa/drivers/dri/r600/radeon_span.c
new file mode 120000
index 0000000000..232868c4c9
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_span.c
@@ -0,0 +1 @@
+../radeon/radeon_span.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_span.h b/src/mesa/drivers/dri/r600/radeon_span.h
new file mode 120000
index 0000000000..f9d634508c
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_span.h
@@ -0,0 +1 @@
+../radeon/radeon_span.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_texture.c b/src/mesa/drivers/dri/r600/radeon_texture.c
new file mode 120000
index 0000000000..a822710915
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_texture.c
@@ -0,0 +1 @@
+../radeon/radeon_texture.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/radeon_texture.h b/src/mesa/drivers/dri/r600/radeon_texture.h
new file mode 120000
index 0000000000..17fac3d5ea
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/radeon_texture.h
@@ -0,0 +1 @@
+../radeon/radeon_texture.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/server/radeon.h b/src/mesa/drivers/dri/r600/server/radeon.h
new file mode 120000
index 0000000000..81274a54f1
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/server/radeon.h
@@ -0,0 +1 @@
+../../radeon/server/radeon.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/server/radeon_dri.c b/src/mesa/drivers/dri/r600/server/radeon_dri.c
new file mode 120000
index 0000000000..d05847d650
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/server/radeon_dri.c
@@ -0,0 +1 @@
+../../radeon/server/radeon_dri.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/server/radeon_dri.h b/src/mesa/drivers/dri/r600/server/radeon_dri.h
new file mode 120000
index 0000000000..27c591d3c9
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/server/radeon_dri.h
@@ -0,0 +1 @@
+../../radeon/server/radeon_dri.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/server/radeon_egl.c b/src/mesa/drivers/dri/r600/server/radeon_egl.c
new file mode 120000
index 0000000000..d7735a7643
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/server/radeon_egl.c
@@ -0,0 +1 @@
+../../radeon/server/radeon_egl.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/server/radeon_macros.h b/src/mesa/drivers/dri/r600/server/radeon_macros.h
new file mode 120000
index 0000000000..c56cd735b8
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/server/radeon_macros.h
@@ -0,0 +1 @@
+../../radeon/server/radeon_macros.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/server/radeon_reg.h b/src/mesa/drivers/dri/r600/server/radeon_reg.h
new file mode 120000
index 0000000000..e2349dcb68
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/server/radeon_reg.h
@@ -0,0 +1 @@
+../../radeon/server/radeon_reg.h
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/r600/sq_micro_reg.h b/src/mesa/drivers/dri/r600/sq_micro_reg.h
new file mode 100644
index 0000000000..bfd21cef62
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/sq_micro_reg.h
@@ -0,0 +1,2008 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Contacts:
+ *   Richard Li <RichardZ.Li@amd.com>, <richardradeon@gmail.com>
+ */
+
+#if !defined (_SQ_MICRO_REG_H)
+#define _SQ_MICRO_REG_H
+
+#if defined(LITTLEENDIAN_CPU)   /* accepted; takes precedence if both macros are defined */
+#elif defined(BIGENDIAN_CPU)    /* accepted; the bit-field structs below list their members in reverse order */
+#else                           /* neither macro is defined: stop the build */
+#error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined"
+#endif
+
+/*
+ * SQ_ALU_SRC_GPR_BASE value (NOTE(review): the BASE/SIZE pairs presumably delimit ranges of ALU source-select codes -- confirm against the R600 ISA guide)
+ */
+
+#define SQ_ALU_SRC_GPR_BASE            0x00000000
+
+/*
+ * SQ_ALU_SRC_GPR_SIZE value (0x80 == 128)
+ */
+
+#define SQ_ALU_SRC_GPR_SIZE            0x00000080
+
+/*
+ * SQ_ALU_SRC_KCACHE0_BASE value (0x80 == SQ_ALU_SRC_GPR_BASE + SQ_ALU_SRC_GPR_SIZE)
+ */
+
+#define SQ_ALU_SRC_KCACHE0_BASE        0x00000080
+
+/*
+ * SQ_ALU_SRC_KCACHE0_SIZE value (0x20 == 32)
+ */
+
+#define SQ_ALU_SRC_KCACHE0_SIZE        0x00000020
+
+/*
+ * SQ_ALU_SRC_KCACHE1_BASE value (0xa0 == SQ_ALU_SRC_KCACHE0_BASE + SQ_ALU_SRC_KCACHE0_SIZE)
+ */
+
+#define SQ_ALU_SRC_KCACHE1_BASE        0x000000a0
+
+/*
+ * SQ_ALU_SRC_KCACHE1_SIZE value (0x20 == 32)
+ */
+
+#define SQ_ALU_SRC_KCACHE1_SIZE        0x00000020
+
+/*
+ * SQ_ALU_SRC_CFILE_BASE value (SQ_ALU_SRC_KCACHE1_BASE + SQ_ALU_SRC_KCACHE1_SIZE is only 0xc0, so 0xc0..0xff is not covered by the pairs above)
+ */
+
+#define SQ_ALU_SRC_CFILE_BASE          0x00000100
+
+/*
+ * SQ_ALU_SRC_CFILE_SIZE value (0x100 == 256)
+ */
+
+#define SQ_ALU_SRC_CFILE_SIZE          0x00000100
+
+/*
+ * SQ_SP_OP_REDUC_BEGIN value (paired with SQ_SP_OP_REDUC_END)
+ */
+
+#define SQ_SP_OP_REDUC_BEGIN           0x00000050
+
+/*
+ * SQ_SP_OP_REDUC_END value (NOTE(review): presumably the inclusive end of the 0x50..0x53 range -- confirm)
+ */
+
+#define SQ_SP_OP_REDUC_END             0x00000053
+
+/*
+ * SQ_SP_OP_TRANS_BEGIN value (paired with SQ_SP_OP_TRANS_END)
+ */
+
+#define SQ_SP_OP_TRANS_BEGIN           0x00000060
+
+/*
+ * SQ_SP_OP_TRANS_END value (NOTE(review): presumably the inclusive end of the 0x60..0x7f range -- confirm)
+ */
+
+#define SQ_SP_OP_TRANS_END             0x0000007f
+
+/*
+ * SQ_CF_WORD0 struct: one field, ADDR, spanning the whole 32-bit word (_SIZE = width in bits, _SHIFT = position of the lowest bit, _MASK = field bits in place)
+ */
+
+#define SQ_CF_WORD0_ADDR_SIZE          32
+
+#define SQ_CF_WORD0_ADDR_SHIFT         0
+
+#define SQ_CF_WORD0_ADDR_MASK          0xffffffff
+
+#define SQ_CF_WORD0_MASK \
+     (SQ_CF_WORD0_ADDR_MASK)
+
+#define SQ_CF_WORD0_DEFAULT            0xcdcdcdcd
+/* GET: extract ADDR from a raw SQ_CF_WORD0 value, right-aligned; the argument is parenthesized so any expression may be passed. */
+#define SQ_CF_WORD0_GET_ADDR(sq_cf_word0) \
+     (((sq_cf_word0) & SQ_CF_WORD0_ADDR_MASK) >> SQ_CF_WORD0_ADDR_SHIFT)
+/* SET: replace ADDR in the lvalue sq_cf_word0_reg (evaluated twice); addr is converted to unsigned int before shifting and masked to the field. */
+#define SQ_CF_WORD0_SET_ADDR(sq_cf_word0_reg, addr) \
+     ((sq_cf_word0_reg) = ((sq_cf_word0_reg) & ~SQ_CF_WORD0_ADDR_MASK) | (((unsigned int)(addr) << SQ_CF_WORD0_ADDR_SHIFT) & SQ_CF_WORD0_ADDR_MASK))
+
+#if		defined(LITTLEENDIAN_CPU)
+     /* Bit-field view of SQ_CF_WORD0: a single addr member of SQ_CF_WORD0_ADDR_SIZE bits. */
+     typedef struct _sq_cf_word0_t {
+          unsigned int addr                           : SQ_CF_WORD0_ADDR_SIZE;
+     } sq_cf_word0_t;
+
+#elif		defined(BIGENDIAN_CPU)
+     /* With only one member, this variant is identical to the little-endian one. */
+     typedef struct _sq_cf_word0_t {
+          unsigned int addr                           : SQ_CF_WORD0_ADDR_SIZE;
+     } sq_cf_word0_t;
+
+#endif
+/* Overlay of the same storage: val is the raw 32-bit word, f is the bit-field view. */
+typedef union {
+     unsigned int val : 32;
+     sq_cf_word0_t f;
+} sq_cf_word0_u;
+
+
+/*
+ * SQ_CF_WORD1 struct: field widths in bits (_SIZE), position of each field's lowest bit (_SHIFT) and in-place masks (_MASK); bit 20 belongs to no field
+ */
+
+#define SQ_CF_WORD1_POP_COUNT_SIZE     3
+#define SQ_CF_WORD1_CF_CONST_SIZE      5
+#define SQ_CF_WORD1_COND_SIZE          2
+#define SQ_CF_WORD1_COUNT_SIZE         3
+#define SQ_CF_WORD1_CALL_COUNT_SIZE    6
+#define SQ_CF_WORD1_COUNT_3_SIZE       1
+#define SQ_CF_WORD1_END_OF_PROGRAM_SIZE 1
+#define SQ_CF_WORD1_VALID_PIXEL_MODE_SIZE 1
+#define SQ_CF_WORD1_CF_INST_SIZE       7
+#define SQ_CF_WORD1_WHOLE_QUAD_MODE_SIZE 1
+#define SQ_CF_WORD1_BARRIER_SIZE       1
+
+#define SQ_CF_WORD1_POP_COUNT_SHIFT    0
+#define SQ_CF_WORD1_CF_CONST_SHIFT     3
+#define SQ_CF_WORD1_COND_SHIFT         8
+#define SQ_CF_WORD1_COUNT_SHIFT        10
+#define SQ_CF_WORD1_CALL_COUNT_SHIFT   13
+#define SQ_CF_WORD1_COUNT_3_SHIFT      19
+#define SQ_CF_WORD1_END_OF_PROGRAM_SHIFT 21
+#define SQ_CF_WORD1_VALID_PIXEL_MODE_SHIFT 22
+#define SQ_CF_WORD1_CF_INST_SHIFT      23
+#define SQ_CF_WORD1_WHOLE_QUAD_MODE_SHIFT 30
+#define SQ_CF_WORD1_BARRIER_SHIFT      31
+
+#define SQ_CF_WORD1_POP_COUNT_MASK     0x00000007
+#define SQ_CF_WORD1_CF_CONST_MASK      0x000000f8
+#define SQ_CF_WORD1_COND_MASK          0x00000300
+#define SQ_CF_WORD1_COUNT_MASK         0x00001c00
+#define SQ_CF_WORD1_CALL_COUNT_MASK    0x0007e000
+#define SQ_CF_WORD1_COUNT_3_MASK       0x00080000
+#define SQ_CF_WORD1_END_OF_PROGRAM_MASK 0x00200000
+#define SQ_CF_WORD1_VALID_PIXEL_MODE_MASK 0x00400000
+#define SQ_CF_WORD1_CF_INST_MASK       0x3f800000
+#define SQ_CF_WORD1_WHOLE_QUAD_MODE_MASK 0x40000000
+#define SQ_CF_WORD1_BARRIER_MASK       0x80000000
+
+#define SQ_CF_WORD1_MASK \
+     (SQ_CF_WORD1_POP_COUNT_MASK | \
+      SQ_CF_WORD1_CF_CONST_MASK | \
+      SQ_CF_WORD1_COND_MASK | \
+      SQ_CF_WORD1_COUNT_MASK | \
+      SQ_CF_WORD1_CALL_COUNT_MASK | \
+      SQ_CF_WORD1_COUNT_3_MASK | \
+      SQ_CF_WORD1_END_OF_PROGRAM_MASK | \
+      SQ_CF_WORD1_VALID_PIXEL_MODE_MASK | \
+      SQ_CF_WORD1_CF_INST_MASK | \
+      SQ_CF_WORD1_WHOLE_QUAD_MODE_MASK | \
+      SQ_CF_WORD1_BARRIER_MASK)
+
+#define SQ_CF_WORD1_DEFAULT            0xcdcdcdcd
+/* GET accessors: extract one field from a raw SQ_CF_WORD1 value, right-aligned; the argument is parenthesized so any expression may be passed. */
+#define SQ_CF_WORD1_GET_POP_COUNT(sq_cf_word1) \
+     (((sq_cf_word1) & SQ_CF_WORD1_POP_COUNT_MASK) >> SQ_CF_WORD1_POP_COUNT_SHIFT)
+#define SQ_CF_WORD1_GET_CF_CONST(sq_cf_word1) \
+     (((sq_cf_word1) & SQ_CF_WORD1_CF_CONST_MASK) >> SQ_CF_WORD1_CF_CONST_SHIFT)
+#define SQ_CF_WORD1_GET_COND(sq_cf_word1) \
+     (((sq_cf_word1) & SQ_CF_WORD1_COND_MASK) >> SQ_CF_WORD1_COND_SHIFT)
+#define SQ_CF_WORD1_GET_COUNT(sq_cf_word1) \
+     (((sq_cf_word1) & SQ_CF_WORD1_COUNT_MASK) >> SQ_CF_WORD1_COUNT_SHIFT)
+#define SQ_CF_WORD1_GET_CALL_COUNT(sq_cf_word1) \
+     (((sq_cf_word1) & SQ_CF_WORD1_CALL_COUNT_MASK) >> SQ_CF_WORD1_CALL_COUNT_SHIFT)
+#define SQ_CF_WORD1_GET_COUNT_3(sq_cf_word1) \
+     (((sq_cf_word1) & SQ_CF_WORD1_COUNT_3_MASK) >> SQ_CF_WORD1_COUNT_3_SHIFT)
+#define SQ_CF_WORD1_GET_END_OF_PROGRAM(sq_cf_word1) \
+     (((sq_cf_word1) & SQ_CF_WORD1_END_OF_PROGRAM_MASK) >> SQ_CF_WORD1_END_OF_PROGRAM_SHIFT)
+#define SQ_CF_WORD1_GET_VALID_PIXEL_MODE(sq_cf_word1) \
+     (((sq_cf_word1) & SQ_CF_WORD1_VALID_PIXEL_MODE_MASK) >> SQ_CF_WORD1_VALID_PIXEL_MODE_SHIFT)
+#define SQ_CF_WORD1_GET_CF_INST(sq_cf_word1) \
+     (((sq_cf_word1) & SQ_CF_WORD1_CF_INST_MASK) >> SQ_CF_WORD1_CF_INST_SHIFT)
+#define SQ_CF_WORD1_GET_WHOLE_QUAD_MODE(sq_cf_word1) \
+     (((sq_cf_word1) & SQ_CF_WORD1_WHOLE_QUAD_MODE_MASK) >> SQ_CF_WORD1_WHOLE_QUAD_MODE_SHIFT)
+#define SQ_CF_WORD1_GET_BARRIER(sq_cf_word1) \
+     (((sq_cf_word1) & SQ_CF_WORD1_BARRIER_MASK) >> SQ_CF_WORD1_BARRIER_SHIFT)
+/* SET accessors: replace one field of the lvalue sq_cf_word1_reg (evaluated twice); the value is converted to unsigned int before shifting (so reaching bit 31 is well defined) and masked to the field. */
+#define SQ_CF_WORD1_SET_POP_COUNT(sq_cf_word1_reg, pop_count) \
+     ((sq_cf_word1_reg) = ((sq_cf_word1_reg) & ~SQ_CF_WORD1_POP_COUNT_MASK) | (((unsigned int)(pop_count) << SQ_CF_WORD1_POP_COUNT_SHIFT) & SQ_CF_WORD1_POP_COUNT_MASK))
+#define SQ_CF_WORD1_SET_CF_CONST(sq_cf_word1_reg, cf_const) \
+     ((sq_cf_word1_reg) = ((sq_cf_word1_reg) & ~SQ_CF_WORD1_CF_CONST_MASK) | (((unsigned int)(cf_const) << SQ_CF_WORD1_CF_CONST_SHIFT) & SQ_CF_WORD1_CF_CONST_MASK))
+#define SQ_CF_WORD1_SET_COND(sq_cf_word1_reg, cond) \
+     ((sq_cf_word1_reg) = ((sq_cf_word1_reg) & ~SQ_CF_WORD1_COND_MASK) | (((unsigned int)(cond) << SQ_CF_WORD1_COND_SHIFT) & SQ_CF_WORD1_COND_MASK))
+#define SQ_CF_WORD1_SET_COUNT(sq_cf_word1_reg, count) \
+     ((sq_cf_word1_reg) = ((sq_cf_word1_reg) & ~SQ_CF_WORD1_COUNT_MASK) | (((unsigned int)(count) << SQ_CF_WORD1_COUNT_SHIFT) & SQ_CF_WORD1_COUNT_MASK))
+#define SQ_CF_WORD1_SET_CALL_COUNT(sq_cf_word1_reg, call_count) \
+     ((sq_cf_word1_reg) = ((sq_cf_word1_reg) & ~SQ_CF_WORD1_CALL_COUNT_MASK) | (((unsigned int)(call_count) << SQ_CF_WORD1_CALL_COUNT_SHIFT) & SQ_CF_WORD1_CALL_COUNT_MASK))
+#define SQ_CF_WORD1_SET_COUNT_3(sq_cf_word1_reg, count_3) \
+     ((sq_cf_word1_reg) = ((sq_cf_word1_reg) & ~SQ_CF_WORD1_COUNT_3_MASK) | (((unsigned int)(count_3) << SQ_CF_WORD1_COUNT_3_SHIFT) & SQ_CF_WORD1_COUNT_3_MASK))
+#define SQ_CF_WORD1_SET_END_OF_PROGRAM(sq_cf_word1_reg, end_of_program) \
+     ((sq_cf_word1_reg) = ((sq_cf_word1_reg) & ~SQ_CF_WORD1_END_OF_PROGRAM_MASK) | (((unsigned int)(end_of_program) << SQ_CF_WORD1_END_OF_PROGRAM_SHIFT) & SQ_CF_WORD1_END_OF_PROGRAM_MASK))
+#define SQ_CF_WORD1_SET_VALID_PIXEL_MODE(sq_cf_word1_reg, valid_pixel_mode) \
+     ((sq_cf_word1_reg) = ((sq_cf_word1_reg) & ~SQ_CF_WORD1_VALID_PIXEL_MODE_MASK) | (((unsigned int)(valid_pixel_mode) << SQ_CF_WORD1_VALID_PIXEL_MODE_SHIFT) & SQ_CF_WORD1_VALID_PIXEL_MODE_MASK))
+#define SQ_CF_WORD1_SET_CF_INST(sq_cf_word1_reg, cf_inst) \
+     ((sq_cf_word1_reg) = ((sq_cf_word1_reg) & ~SQ_CF_WORD1_CF_INST_MASK) | (((unsigned int)(cf_inst) << SQ_CF_WORD1_CF_INST_SHIFT) & SQ_CF_WORD1_CF_INST_MASK))
+#define SQ_CF_WORD1_SET_WHOLE_QUAD_MODE(sq_cf_word1_reg, whole_quad_mode) \
+     ((sq_cf_word1_reg) = ((sq_cf_word1_reg) & ~SQ_CF_WORD1_WHOLE_QUAD_MODE_MASK) | (((unsigned int)(whole_quad_mode) << SQ_CF_WORD1_WHOLE_QUAD_MODE_SHIFT) & SQ_CF_WORD1_WHOLE_QUAD_MODE_MASK))
+#define SQ_CF_WORD1_SET_BARRIER(sq_cf_word1_reg, barrier) \
+     ((sq_cf_word1_reg) = ((sq_cf_word1_reg) & ~SQ_CF_WORD1_BARRIER_MASK) | (((unsigned int)(barrier) << SQ_CF_WORD1_BARRIER_SHIFT) & SQ_CF_WORD1_BARRIER_MASK))
+
+#if		defined(LITTLEENDIAN_CPU)
+     /* Bit-field view of SQ_CF_WORD1; members are listed in order of increasing _SHIFT. */
+     typedef struct _sq_cf_word1_t {
+          unsigned int pop_count                      : SQ_CF_WORD1_POP_COUNT_SIZE;
+          unsigned int cf_const                       : SQ_CF_WORD1_CF_CONST_SIZE;
+          unsigned int cond                           : SQ_CF_WORD1_COND_SIZE;
+          unsigned int count                          : SQ_CF_WORD1_COUNT_SIZE;
+          unsigned int call_count                     : SQ_CF_WORD1_CALL_COUNT_SIZE;
+          unsigned int count_3                        : SQ_CF_WORD1_COUNT_3_SIZE;
+          unsigned int                                : 1; /* unnamed padding for bit 20, which no _MASK covers */
+          unsigned int end_of_program                 : SQ_CF_WORD1_END_OF_PROGRAM_SIZE;
+          unsigned int valid_pixel_mode               : SQ_CF_WORD1_VALID_PIXEL_MODE_SIZE;
+          unsigned int cf_inst                        : SQ_CF_WORD1_CF_INST_SIZE;
+          unsigned int whole_quad_mode                : SQ_CF_WORD1_WHOLE_QUAD_MODE_SIZE;
+          unsigned int barrier                        : SQ_CF_WORD1_BARRIER_SIZE;
+     } sq_cf_word1_t;
+
+#elif		defined(BIGENDIAN_CPU)
+     /* Same members as the little-endian variant, declared in reverse order. */
+     typedef struct _sq_cf_word1_t {
+          unsigned int barrier                        : SQ_CF_WORD1_BARRIER_SIZE;
+          unsigned int whole_quad_mode                : SQ_CF_WORD1_WHOLE_QUAD_MODE_SIZE;
+          unsigned int cf_inst                        : SQ_CF_WORD1_CF_INST_SIZE;
+          unsigned int valid_pixel_mode               : SQ_CF_WORD1_VALID_PIXEL_MODE_SIZE;
+          unsigned int end_of_program                 : SQ_CF_WORD1_END_OF_PROGRAM_SIZE;
+          unsigned int                                : 1; /* unnamed padding for bit 20, which no _MASK covers */
+          unsigned int count_3                        : SQ_CF_WORD1_COUNT_3_SIZE;
+          unsigned int call_count                     : SQ_CF_WORD1_CALL_COUNT_SIZE;
+          unsigned int count                          : SQ_CF_WORD1_COUNT_SIZE;
+          unsigned int cond                           : SQ_CF_WORD1_COND_SIZE;
+          unsigned int cf_const                       : SQ_CF_WORD1_CF_CONST_SIZE;
+          unsigned int pop_count                      : SQ_CF_WORD1_POP_COUNT_SIZE;
+     } sq_cf_word1_t;
+
+#endif
+/* Overlay of the same storage: val is the raw 32-bit word, f is the bit-field view. */
+typedef union {
+     unsigned int val : 32;
+     sq_cf_word1_t f;
+} sq_cf_word1_u;
+
+
+/*
+ * SQ_CF_ALU_WORD0 struct: field widths in bits (_SIZE), position of each field's lowest bit (_SHIFT) and in-place masks (_MASK); the four fields cover all 32 bits
+ */
+
+#define SQ_CF_ALU_WORD0_ADDR_SIZE      22
+#define SQ_CF_ALU_WORD0_KCACHE_BANK0_SIZE 4
+#define SQ_CF_ALU_WORD0_KCACHE_BANK1_SIZE 4
+#define SQ_CF_ALU_WORD0_KCACHE_MODE0_SIZE 2
+
+#define SQ_CF_ALU_WORD0_ADDR_SHIFT     0
+#define SQ_CF_ALU_WORD0_KCACHE_BANK0_SHIFT 22
+#define SQ_CF_ALU_WORD0_KCACHE_BANK1_SHIFT 26
+#define SQ_CF_ALU_WORD0_KCACHE_MODE0_SHIFT 30
+
+#define SQ_CF_ALU_WORD0_ADDR_MASK      0x003fffff
+#define SQ_CF_ALU_WORD0_KCACHE_BANK0_MASK 0x03c00000
+#define SQ_CF_ALU_WORD0_KCACHE_BANK1_MASK 0x3c000000
+#define SQ_CF_ALU_WORD0_KCACHE_MODE0_MASK 0xc0000000
+
+#define SQ_CF_ALU_WORD0_MASK \
+     (SQ_CF_ALU_WORD0_ADDR_MASK | \
+      SQ_CF_ALU_WORD0_KCACHE_BANK0_MASK | \
+      SQ_CF_ALU_WORD0_KCACHE_BANK1_MASK | \
+      SQ_CF_ALU_WORD0_KCACHE_MODE0_MASK)
+
+#define SQ_CF_ALU_WORD0_DEFAULT        0xcdcdcdcd
+/* GET accessors: extract one field from a raw SQ_CF_ALU_WORD0 value, right-aligned; the argument is parenthesized so any expression may be passed. */
+#define SQ_CF_ALU_WORD0_GET_ADDR(sq_cf_alu_word0) \
+     (((sq_cf_alu_word0) & SQ_CF_ALU_WORD0_ADDR_MASK) >> SQ_CF_ALU_WORD0_ADDR_SHIFT)
+#define SQ_CF_ALU_WORD0_GET_KCACHE_BANK0(sq_cf_alu_word0) \
+     (((sq_cf_alu_word0) & SQ_CF_ALU_WORD0_KCACHE_BANK0_MASK) >> SQ_CF_ALU_WORD0_KCACHE_BANK0_SHIFT)
+#define SQ_CF_ALU_WORD0_GET_KCACHE_BANK1(sq_cf_alu_word0) \
+     (((sq_cf_alu_word0) & SQ_CF_ALU_WORD0_KCACHE_BANK1_MASK) >> SQ_CF_ALU_WORD0_KCACHE_BANK1_SHIFT)
+#define SQ_CF_ALU_WORD0_GET_KCACHE_MODE0(sq_cf_alu_word0) \
+     (((sq_cf_alu_word0) & SQ_CF_ALU_WORD0_KCACHE_MODE0_MASK) >> SQ_CF_ALU_WORD0_KCACHE_MODE0_SHIFT)
+/* SET accessors: replace one field of the lvalue sq_cf_alu_word0_reg (evaluated twice); the value is converted to unsigned int before shifting (so reaching bits 30..31 is well defined) and masked to the field. */
+#define SQ_CF_ALU_WORD0_SET_ADDR(sq_cf_alu_word0_reg, addr) \
+     ((sq_cf_alu_word0_reg) = ((sq_cf_alu_word0_reg) & ~SQ_CF_ALU_WORD0_ADDR_MASK) | (((unsigned int)(addr) << SQ_CF_ALU_WORD0_ADDR_SHIFT) & SQ_CF_ALU_WORD0_ADDR_MASK))
+#define SQ_CF_ALU_WORD0_SET_KCACHE_BANK0(sq_cf_alu_word0_reg, kcache_bank0) \
+     ((sq_cf_alu_word0_reg) = ((sq_cf_alu_word0_reg) & ~SQ_CF_ALU_WORD0_KCACHE_BANK0_MASK) | (((unsigned int)(kcache_bank0) << SQ_CF_ALU_WORD0_KCACHE_BANK0_SHIFT) & SQ_CF_ALU_WORD0_KCACHE_BANK0_MASK))
+#define SQ_CF_ALU_WORD0_SET_KCACHE_BANK1(sq_cf_alu_word0_reg, kcache_bank1) \
+     ((sq_cf_alu_word0_reg) = ((sq_cf_alu_word0_reg) & ~SQ_CF_ALU_WORD0_KCACHE_BANK1_MASK) | (((unsigned int)(kcache_bank1) << SQ_CF_ALU_WORD0_KCACHE_BANK1_SHIFT) & SQ_CF_ALU_WORD0_KCACHE_BANK1_MASK))
+#define SQ_CF_ALU_WORD0_SET_KCACHE_MODE0(sq_cf_alu_word0_reg, kcache_mode0) \
+     ((sq_cf_alu_word0_reg) = ((sq_cf_alu_word0_reg) & ~SQ_CF_ALU_WORD0_KCACHE_MODE0_MASK) | (((unsigned int)(kcache_mode0) << SQ_CF_ALU_WORD0_KCACHE_MODE0_SHIFT) & SQ_CF_ALU_WORD0_KCACHE_MODE0_MASK))
+
+#if		defined(LITTLEENDIAN_CPU)
+     /* Bit-field view of SQ_CF_ALU_WORD0; members are listed in order of increasing _SHIFT. */
+     typedef struct _sq_cf_alu_word0_t {
+          unsigned int addr                           : SQ_CF_ALU_WORD0_ADDR_SIZE;
+          unsigned int kcache_bank0                   : SQ_CF_ALU_WORD0_KCACHE_BANK0_SIZE;
+          unsigned int kcache_bank1                   : SQ_CF_ALU_WORD0_KCACHE_BANK1_SIZE;
+          unsigned int kcache_mode0                   : SQ_CF_ALU_WORD0_KCACHE_MODE0_SIZE;
+     } sq_cf_alu_word0_t;
+
+#elif		defined(BIGENDIAN_CPU)
+     /* Same members as the little-endian variant, declared in reverse order. */
+     typedef struct _sq_cf_alu_word0_t {
+          unsigned int kcache_mode0                   : SQ_CF_ALU_WORD0_KCACHE_MODE0_SIZE;
+          unsigned int kcache_bank1                   : SQ_CF_ALU_WORD0_KCACHE_BANK1_SIZE;
+          unsigned int kcache_bank0                   : SQ_CF_ALU_WORD0_KCACHE_BANK0_SIZE;
+          unsigned int addr                           : SQ_CF_ALU_WORD0_ADDR_SIZE;
+     } sq_cf_alu_word0_t;
+
+#endif
+/* Overlay of the same storage: val is the raw 32-bit word, f is the bit-field view. */
+typedef union {
+     unsigned int val : 32;
+     sq_cf_alu_word0_t f;
+} sq_cf_alu_word0_u;
+
+
+/*
+ * SQ_CF_ALU_WORD1 struct: field widths in bits (_SIZE), position of each field's lowest bit (_SHIFT) and in-place masks (_MASK); the eight fields cover all 32 bits
+ */
+
+#define SQ_CF_ALU_WORD1_KCACHE_MODE1_SIZE 2
+#define SQ_CF_ALU_WORD1_KCACHE_ADDR0_SIZE 8
+#define SQ_CF_ALU_WORD1_KCACHE_ADDR1_SIZE 8
+#define SQ_CF_ALU_WORD1_COUNT_SIZE     7
+#define SQ_CF_ALU_WORD1_ALT_CONST_SIZE 1
+#define SQ_CF_ALU_WORD1_CF_INST_SIZE   4
+#define SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_SIZE 1
+#define SQ_CF_ALU_WORD1_BARRIER_SIZE   1
+
+#define SQ_CF_ALU_WORD1_KCACHE_MODE1_SHIFT 0
+#define SQ_CF_ALU_WORD1_KCACHE_ADDR0_SHIFT 2
+#define SQ_CF_ALU_WORD1_KCACHE_ADDR1_SHIFT 10
+#define SQ_CF_ALU_WORD1_COUNT_SHIFT    18
+#define SQ_CF_ALU_WORD1_ALT_CONST_SHIFT 25
+#define SQ_CF_ALU_WORD1_CF_INST_SHIFT  26
+#define SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_SHIFT 30
+#define SQ_CF_ALU_WORD1_BARRIER_SHIFT  31
+
+#define SQ_CF_ALU_WORD1_KCACHE_MODE1_MASK 0x00000003
+#define SQ_CF_ALU_WORD1_KCACHE_ADDR0_MASK 0x000003fc
+#define SQ_CF_ALU_WORD1_KCACHE_ADDR1_MASK 0x0003fc00
+#define SQ_CF_ALU_WORD1_COUNT_MASK     0x01fc0000
+#define SQ_CF_ALU_WORD1_ALT_CONST_MASK 0x02000000
+#define SQ_CF_ALU_WORD1_CF_INST_MASK   0x3c000000
+#define SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_MASK 0x40000000
+#define SQ_CF_ALU_WORD1_BARRIER_MASK   0x80000000
+
+#define SQ_CF_ALU_WORD1_MASK \
+     (SQ_CF_ALU_WORD1_KCACHE_MODE1_MASK | \
+      SQ_CF_ALU_WORD1_KCACHE_ADDR0_MASK | \
+      SQ_CF_ALU_WORD1_KCACHE_ADDR1_MASK | \
+      SQ_CF_ALU_WORD1_COUNT_MASK | \
+      SQ_CF_ALU_WORD1_ALT_CONST_MASK | \
+      SQ_CF_ALU_WORD1_CF_INST_MASK | \
+      SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_MASK | \
+      SQ_CF_ALU_WORD1_BARRIER_MASK)
+
+#define SQ_CF_ALU_WORD1_DEFAULT        0xcdcdcdcd
+/* GET accessors: extract one field from a raw SQ_CF_ALU_WORD1 value, right-aligned; the argument is parenthesized so any expression may be passed. */
+#define SQ_CF_ALU_WORD1_GET_KCACHE_MODE1(sq_cf_alu_word1) \
+     (((sq_cf_alu_word1) & SQ_CF_ALU_WORD1_KCACHE_MODE1_MASK) >> SQ_CF_ALU_WORD1_KCACHE_MODE1_SHIFT)
+#define SQ_CF_ALU_WORD1_GET_KCACHE_ADDR0(sq_cf_alu_word1) \
+     (((sq_cf_alu_word1) & SQ_CF_ALU_WORD1_KCACHE_ADDR0_MASK) >> SQ_CF_ALU_WORD1_KCACHE_ADDR0_SHIFT)
+#define SQ_CF_ALU_WORD1_GET_KCACHE_ADDR1(sq_cf_alu_word1) \
+     (((sq_cf_alu_word1) & SQ_CF_ALU_WORD1_KCACHE_ADDR1_MASK) >> SQ_CF_ALU_WORD1_KCACHE_ADDR1_SHIFT)
+#define SQ_CF_ALU_WORD1_GET_COUNT(sq_cf_alu_word1) \
+     (((sq_cf_alu_word1) & SQ_CF_ALU_WORD1_COUNT_MASK) >> SQ_CF_ALU_WORD1_COUNT_SHIFT)
+#define SQ_CF_ALU_WORD1_GET_ALT_CONST(sq_cf_alu_word1) \
+     (((sq_cf_alu_word1) & SQ_CF_ALU_WORD1_ALT_CONST_MASK) >> SQ_CF_ALU_WORD1_ALT_CONST_SHIFT)
+#define SQ_CF_ALU_WORD1_GET_CF_INST(sq_cf_alu_word1) \
+     (((sq_cf_alu_word1) & SQ_CF_ALU_WORD1_CF_INST_MASK) >> SQ_CF_ALU_WORD1_CF_INST_SHIFT)
+#define SQ_CF_ALU_WORD1_GET_WHOLE_QUAD_MODE(sq_cf_alu_word1) \
+     (((sq_cf_alu_word1) & SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_MASK) >> SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_SHIFT)
+#define SQ_CF_ALU_WORD1_GET_BARRIER(sq_cf_alu_word1) \
+     (((sq_cf_alu_word1) & SQ_CF_ALU_WORD1_BARRIER_MASK) >> SQ_CF_ALU_WORD1_BARRIER_SHIFT)
+/* SET accessors: replace one field of the lvalue sq_cf_alu_word1_reg (evaluated twice); the value is converted to unsigned int before shifting (so reaching bit 31 is well defined) and masked to the field. */
+#define SQ_CF_ALU_WORD1_SET_KCACHE_MODE1(sq_cf_alu_word1_reg, kcache_mode1) \
+     ((sq_cf_alu_word1_reg) = ((sq_cf_alu_word1_reg) & ~SQ_CF_ALU_WORD1_KCACHE_MODE1_MASK) | (((unsigned int)(kcache_mode1) << SQ_CF_ALU_WORD1_KCACHE_MODE1_SHIFT) & SQ_CF_ALU_WORD1_KCACHE_MODE1_MASK))
+#define SQ_CF_ALU_WORD1_SET_KCACHE_ADDR0(sq_cf_alu_word1_reg, kcache_addr0) \
+     ((sq_cf_alu_word1_reg) = ((sq_cf_alu_word1_reg) & ~SQ_CF_ALU_WORD1_KCACHE_ADDR0_MASK) | (((unsigned int)(kcache_addr0) << SQ_CF_ALU_WORD1_KCACHE_ADDR0_SHIFT) & SQ_CF_ALU_WORD1_KCACHE_ADDR0_MASK))
+#define SQ_CF_ALU_WORD1_SET_KCACHE_ADDR1(sq_cf_alu_word1_reg, kcache_addr1) \
+     ((sq_cf_alu_word1_reg) = ((sq_cf_alu_word1_reg) & ~SQ_CF_ALU_WORD1_KCACHE_ADDR1_MASK) | (((unsigned int)(kcache_addr1) << SQ_CF_ALU_WORD1_KCACHE_ADDR1_SHIFT) & SQ_CF_ALU_WORD1_KCACHE_ADDR1_MASK))
+#define SQ_CF_ALU_WORD1_SET_COUNT(sq_cf_alu_word1_reg, count) \
+     ((sq_cf_alu_word1_reg) = ((sq_cf_alu_word1_reg) & ~SQ_CF_ALU_WORD1_COUNT_MASK) | (((unsigned int)(count) << SQ_CF_ALU_WORD1_COUNT_SHIFT) & SQ_CF_ALU_WORD1_COUNT_MASK))
+#define SQ_CF_ALU_WORD1_SET_ALT_CONST(sq_cf_alu_word1_reg, alt_const) \
+     ((sq_cf_alu_word1_reg) = ((sq_cf_alu_word1_reg) & ~SQ_CF_ALU_WORD1_ALT_CONST_MASK) | (((unsigned int)(alt_const) << SQ_CF_ALU_WORD1_ALT_CONST_SHIFT) & SQ_CF_ALU_WORD1_ALT_CONST_MASK))
+#define SQ_CF_ALU_WORD1_SET_CF_INST(sq_cf_alu_word1_reg, cf_inst) \
+     ((sq_cf_alu_word1_reg) = ((sq_cf_alu_word1_reg) & ~SQ_CF_ALU_WORD1_CF_INST_MASK) | (((unsigned int)(cf_inst) << SQ_CF_ALU_WORD1_CF_INST_SHIFT) & SQ_CF_ALU_WORD1_CF_INST_MASK))
+#define SQ_CF_ALU_WORD1_SET_WHOLE_QUAD_MODE(sq_cf_alu_word1_reg, whole_quad_mode) \
+     ((sq_cf_alu_word1_reg) = ((sq_cf_alu_word1_reg) & ~SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_MASK) | (((unsigned int)(whole_quad_mode) << SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_SHIFT) & SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_MASK))
+#define SQ_CF_ALU_WORD1_SET_BARRIER(sq_cf_alu_word1_reg, barrier) \
+     ((sq_cf_alu_word1_reg) = ((sq_cf_alu_word1_reg) & ~SQ_CF_ALU_WORD1_BARRIER_MASK) | (((unsigned int)(barrier) << SQ_CF_ALU_WORD1_BARRIER_SHIFT) & SQ_CF_ALU_WORD1_BARRIER_MASK))
+
+#if		defined(LITTLEENDIAN_CPU)
+     /* Bit-field view of SQ_CF_ALU_WORD1; members are listed in order of increasing _SHIFT. */
+     typedef struct _sq_cf_alu_word1_t {
+          unsigned int kcache_mode1                   : SQ_CF_ALU_WORD1_KCACHE_MODE1_SIZE;
+          unsigned int kcache_addr0                   : SQ_CF_ALU_WORD1_KCACHE_ADDR0_SIZE;
+          unsigned int kcache_addr1                   : SQ_CF_ALU_WORD1_KCACHE_ADDR1_SIZE;
+          unsigned int count                          : SQ_CF_ALU_WORD1_COUNT_SIZE;
+          unsigned int alt_const                      : SQ_CF_ALU_WORD1_ALT_CONST_SIZE;
+          unsigned int cf_inst                        : SQ_CF_ALU_WORD1_CF_INST_SIZE;
+          unsigned int whole_quad_mode                : SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_SIZE;
+          unsigned int barrier                        : SQ_CF_ALU_WORD1_BARRIER_SIZE;
+     } sq_cf_alu_word1_t;
+
+#elif		defined(BIGENDIAN_CPU)
+     /* Same members as the little-endian variant, declared in reverse order. */
+     typedef struct _sq_cf_alu_word1_t {
+          unsigned int barrier                        : SQ_CF_ALU_WORD1_BARRIER_SIZE;
+          unsigned int whole_quad_mode                : SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_SIZE;
+          unsigned int cf_inst                        : SQ_CF_ALU_WORD1_CF_INST_SIZE;
+          unsigned int alt_const                      : SQ_CF_ALU_WORD1_ALT_CONST_SIZE;
+          unsigned int count                          : SQ_CF_ALU_WORD1_COUNT_SIZE;
+          unsigned int kcache_addr1                   : SQ_CF_ALU_WORD1_KCACHE_ADDR1_SIZE;
+          unsigned int kcache_addr0                   : SQ_CF_ALU_WORD1_KCACHE_ADDR0_SIZE;
+          unsigned int kcache_mode1                   : SQ_CF_ALU_WORD1_KCACHE_MODE1_SIZE;
+     } sq_cf_alu_word1_t;
+
+#endif
+/* Overlay of the same storage: val is the raw 32-bit word, f is the bit-field view. */
+typedef union {
+     unsigned int val : 32;
+     sq_cf_alu_word1_t f;
+} sq_cf_alu_word1_u;
+
+
+/*
+ * SQ_CF_ALLOC_EXPORT_WORD0 struct: field widths in bits (_SIZE), position of each field's lowest bit (_SHIFT) and in-place masks (_MASK); the six fields cover all 32 bits
+ */
+
+#define SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_SIZE 13
+#define SQ_CF_ALLOC_EXPORT_WORD0_TYPE_SIZE 2
+#define SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_SIZE 7
+#define SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_SIZE 1
+#define SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_SIZE 7
+#define SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_SIZE 2
+
+#define SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_SHIFT 0
+#define SQ_CF_ALLOC_EXPORT_WORD0_TYPE_SHIFT 13
+#define SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_SHIFT 15
+#define SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_SHIFT 22
+#define SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_SHIFT 23
+#define SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_SHIFT 30
+
+#define SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_MASK 0x00001fff
+#define SQ_CF_ALLOC_EXPORT_WORD0_TYPE_MASK 0x00006000
+#define SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_MASK 0x003f8000
+#define SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_MASK 0x00400000
+#define SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_MASK 0x3f800000
+#define SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_MASK 0xc0000000
+
+#define SQ_CF_ALLOC_EXPORT_WORD0_MASK \
+     (SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD0_TYPE_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_MASK)
+
+#define SQ_CF_ALLOC_EXPORT_WORD0_DEFAULT 0xcdcdcdcd
+/* GET accessors: extract one field from a raw SQ_CF_ALLOC_EXPORT_WORD0 value, right-aligned; the argument is parenthesized so any expression may be passed. */
+#define SQ_CF_ALLOC_EXPORT_WORD0_GET_ARRAY_BASE(sq_cf_alloc_export_word0) \
+     (((sq_cf_alloc_export_word0) & SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_MASK) >> SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD0_GET_TYPE(sq_cf_alloc_export_word0) \
+     (((sq_cf_alloc_export_word0) & SQ_CF_ALLOC_EXPORT_WORD0_TYPE_MASK) >> SQ_CF_ALLOC_EXPORT_WORD0_TYPE_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD0_GET_RW_GPR(sq_cf_alloc_export_word0) \
+     (((sq_cf_alloc_export_word0) & SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_MASK) >> SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD0_GET_RW_REL(sq_cf_alloc_export_word0) \
+     (((sq_cf_alloc_export_word0) & SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_MASK) >> SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD0_GET_INDEX_GPR(sq_cf_alloc_export_word0) \
+     (((sq_cf_alloc_export_word0) & SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_MASK) >> SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD0_GET_ELEM_SIZE(sq_cf_alloc_export_word0) \
+     (((sq_cf_alloc_export_word0) & SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_MASK) >> SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_SHIFT)
+/* SET accessors: replace one field of the lvalue sq_cf_alloc_export_word0_reg (evaluated twice); the value is converted to unsigned int before shifting (so reaching bits 30..31 is well defined) and masked to the field. */
+#define SQ_CF_ALLOC_EXPORT_WORD0_SET_ARRAY_BASE(sq_cf_alloc_export_word0_reg, array_base) \
+     ((sq_cf_alloc_export_word0_reg) = ((sq_cf_alloc_export_word0_reg) & ~SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_MASK) | (((unsigned int)(array_base) << SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_SHIFT) & SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_MASK))
+#define SQ_CF_ALLOC_EXPORT_WORD0_SET_TYPE(sq_cf_alloc_export_word0_reg, type) \
+     ((sq_cf_alloc_export_word0_reg) = ((sq_cf_alloc_export_word0_reg) & ~SQ_CF_ALLOC_EXPORT_WORD0_TYPE_MASK) | (((unsigned int)(type) << SQ_CF_ALLOC_EXPORT_WORD0_TYPE_SHIFT) & SQ_CF_ALLOC_EXPORT_WORD0_TYPE_MASK))
+#define SQ_CF_ALLOC_EXPORT_WORD0_SET_RW_GPR(sq_cf_alloc_export_word0_reg, rw_gpr) \
+     ((sq_cf_alloc_export_word0_reg) = ((sq_cf_alloc_export_word0_reg) & ~SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_MASK) | (((unsigned int)(rw_gpr) << SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_SHIFT) & SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_MASK))
+#define SQ_CF_ALLOC_EXPORT_WORD0_SET_RW_REL(sq_cf_alloc_export_word0_reg, rw_rel) \
+     ((sq_cf_alloc_export_word0_reg) = ((sq_cf_alloc_export_word0_reg) & ~SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_MASK) | (((unsigned int)(rw_rel) << SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_SHIFT) & SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_MASK))
+#define SQ_CF_ALLOC_EXPORT_WORD0_SET_INDEX_GPR(sq_cf_alloc_export_word0_reg, index_gpr) \
+     ((sq_cf_alloc_export_word0_reg) = ((sq_cf_alloc_export_word0_reg) & ~SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_MASK) | (((unsigned int)(index_gpr) << SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_SHIFT) & SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_MASK))
+#define SQ_CF_ALLOC_EXPORT_WORD0_SET_ELEM_SIZE(sq_cf_alloc_export_word0_reg, elem_size) \
+     ((sq_cf_alloc_export_word0_reg) = ((sq_cf_alloc_export_word0_reg) & ~SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_MASK) | (((unsigned int)(elem_size) << SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_SHIFT) & SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_MASK))
+
+#if		defined(LITTLEENDIAN_CPU)
+     /* Bit-field view of SQ_CF_ALLOC_EXPORT_WORD0; members are listed in order of increasing _SHIFT. */
+     typedef struct _sq_cf_alloc_export_word0_t {
+          unsigned int array_base                     : SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_SIZE;
+          unsigned int type                           : SQ_CF_ALLOC_EXPORT_WORD0_TYPE_SIZE;
+          unsigned int rw_gpr                         : SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_SIZE;
+          unsigned int rw_rel                         : SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_SIZE;
+          unsigned int index_gpr                      : SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_SIZE;
+          unsigned int elem_size                      : SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_SIZE;
+     } sq_cf_alloc_export_word0_t;
+
+#elif		defined(BIGENDIAN_CPU)
+     /* Same members as the little-endian variant, declared in reverse order. */
+     typedef struct _sq_cf_alloc_export_word0_t {
+          unsigned int elem_size                      : SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_SIZE;
+          unsigned int index_gpr                      : SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_SIZE;
+          unsigned int rw_rel                         : SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_SIZE;
+          unsigned int rw_gpr                         : SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_SIZE;
+          unsigned int type                           : SQ_CF_ALLOC_EXPORT_WORD0_TYPE_SIZE;
+          unsigned int array_base                     : SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_SIZE;
+     } sq_cf_alloc_export_word0_t;
+
+#endif
+/* Overlay of the same storage: val is the raw 32-bit word, f is the bit-field view. */
+typedef union {
+     unsigned int val : 32;
+     sq_cf_alloc_export_word0_t f;
+} sq_cf_alloc_export_word0_u;
+
+
+/*
+ * SQ_CF_ALLOC_EXPORT_WORD1 struct: field widths in bits (_SIZE), position of each field's lowest bit (_SHIFT) and in-place masks (_MASK); these fields describe bits 17..31 only
+ */
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_SIZE 4
+#define SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_SIZE 1
+#define SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_SIZE 1
+#define SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_SIZE 7
+#define SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_SIZE 1
+#define SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_SIZE 1
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_SHIFT 17
+#define SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_SHIFT 21
+#define SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_SHIFT 22
+#define SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_SHIFT 23
+#define SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_SHIFT 30
+#define SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_SHIFT 31
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_MASK 0x001e0000
+#define SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_MASK 0x00200000
+#define SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_MASK 0x00400000
+#define SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_MASK 0x3f800000
+#define SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_MASK 0x40000000
+#define SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_MASK 0x80000000
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_MASK \
+     (SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_MASK)
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_DEFAULT 0xcdcc0000 /* == 0xcdcdcdcd & SQ_CF_ALLOC_EXPORT_WORD1_MASK (0xfffe0000) */
+/* GET accessors: extract one field from a raw SQ_CF_ALLOC_EXPORT_WORD1 value, right-aligned; the argument is parenthesized so any expression may be passed. */
+#define SQ_CF_ALLOC_EXPORT_WORD1_GET_BURST_COUNT(sq_cf_alloc_export_word1) \
+     (((sq_cf_alloc_export_word1) & SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_GET_END_OF_PROGRAM(sq_cf_alloc_export_word1) \
+     (((sq_cf_alloc_export_word1) & SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_GET_VALID_PIXEL_MODE(sq_cf_alloc_export_word1) \
+     (((sq_cf_alloc_export_word1) & SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_GET_CF_INST(sq_cf_alloc_export_word1) \
+     (((sq_cf_alloc_export_word1) & SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_GET_WHOLE_QUAD_MODE(sq_cf_alloc_export_word1) \
+     (((sq_cf_alloc_export_word1) & SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_GET_BARRIER(sq_cf_alloc_export_word1) \
+     (((sq_cf_alloc_export_word1) & SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_SHIFT)
+/* SET accessors: replace one field of the lvalue sq_cf_alloc_export_word1_reg (evaluated twice); the value is converted to unsigned int before shifting and masked to the field. */
+#define SQ_CF_ALLOC_EXPORT_WORD1_SET_BURST_COUNT(sq_cf_alloc_export_word1_reg, burst_count) \
+     ((sq_cf_alloc_export_word1_reg) = ((sq_cf_alloc_export_word1_reg) & ~SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_MASK) | (((unsigned int)(burst_count) << SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_SHIFT) & SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_MASK))
+#define SQ_CF_ALLOC_EXPORT_WORD1_SET_END_OF_PROGRAM(sq_cf_alloc_export_word1_reg, end_of_program) \
+     ((sq_cf_alloc_export_word1_reg) = ((sq_cf_alloc_export_word1_reg) & ~SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_MASK) | (((unsigned int)(end_of_program) << SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_SHIFT) & SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_MASK))
+#define SQ_CF_ALLOC_EXPORT_WORD1_SET_VALID_PIXEL_MODE(sq_cf_alloc_export_word1_reg, valid_pixel_mode) \
+     ((sq_cf_alloc_export_word1_reg) = ((sq_cf_alloc_export_word1_reg) & ~SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_MASK) | (((unsigned int)(valid_pixel_mode) << SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_SHIFT) & SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_MASK))
+#define SQ_CF_ALLOC_EXPORT_WORD1_SET_CF_INST(sq_cf_alloc_export_word1_reg, cf_inst) \
+     ((sq_cf_alloc_export_word1_reg) = ((sq_cf_alloc_export_word1_reg) & ~SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_MASK) | (((unsigned int)(cf_inst) << SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_SHIFT) & SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_MASK))
+#define SQ_CF_ALLOC_EXPORT_WORD1_SET_WHOLE_QUAD_MODE(sq_cf_alloc_export_word1_reg, whole_quad_mode) \
+     ((sq_cf_alloc_export_word1_reg) = ((sq_cf_alloc_export_word1_reg) & ~SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_MASK) | (((unsigned int)(whole_quad_mode) << SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_SHIFT) & SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_MASK))
+#define SQ_CF_ALLOC_EXPORT_WORD1_SET_BARRIER(sq_cf_alloc_export_word1_reg, barrier) \
+     sq_cf_alloc_export_word1_reg = (sq_cf_alloc_export_word1_reg & ~SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_MASK) | (barrier << SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+     /* Bit-field overlay of SQ_CF_ALLOC_EXPORT_WORD1; members follow ascending *_SHIFT order (assumes the compiler allocates bit-fields LSB-first -- TODO confirm). */
+     typedef struct _sq_cf_alloc_export_word1_t {
+          unsigned int                                : 17; /* unnamed padding; 17 == BURST_COUNT_SHIFT, the lowest named field */
+          unsigned int burst_count                    : SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_SIZE;
+          unsigned int end_of_program                 : SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_SIZE;
+          unsigned int valid_pixel_mode               : SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_SIZE;
+          unsigned int cf_inst                        : SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_SIZE;
+          unsigned int whole_quad_mode                : SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_SIZE;
+          unsigned int barrier                        : SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_SIZE;
+     } sq_cf_alloc_export_word1_t;
+
+#elif		defined(BIGENDIAN_CPU)
+     /* Same members declared in the opposite order for BIGENDIAN_CPU builds. */
+     typedef struct _sq_cf_alloc_export_word1_t {
+          unsigned int barrier                        : SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_SIZE;
+          unsigned int whole_quad_mode                : SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_SIZE;
+          unsigned int cf_inst                        : SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_SIZE;
+          unsigned int valid_pixel_mode               : SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_SIZE;
+          unsigned int end_of_program                 : SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_SIZE;
+          unsigned int burst_count                    : SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_SIZE;
+          unsigned int                                : 17; /* unnamed padding, same 17 bits as in the other branch */
+     } sq_cf_alloc_export_word1_t;
+     /* No #else branch: when neither endian macro is defined the type is left undeclared. */
+#endif
+
+typedef union { /* One SQ_CF_ALLOC_EXPORT_WORD1 word, accessible raw or per field. */
+     unsigned int val : 32; /* raw word, declared as a 32-bit bit-field */
+     sq_cf_alloc_export_word1_t f; /* named-field view of the same storage */
+} sq_cf_alloc_export_word1_u;
+
+
+/* SQ_CF_ALLOC_EXPORT_WORD1_BUF struct: per-field SIZE (bit width), SHIFT (lowest bit index) and MASK constants, plus accessors.
+ * GET_<FIELD>(w) yields ((w) & MASK) >> SHIFT; SET_<FIELD>(reg, v) assigns lvalue reg with FIELD replaced by (v) << SHIFT.
+ * SET does not mask v (it must already fit the field) and evaluates reg twice. DEFAULT equals 0xcdcdcdcd & MASK. */
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_SIZE 12
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_SIZE 4
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_SHIFT 0
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_SHIFT 12
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_MASK 0x00000fff
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_MASK 0x0000f000
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_MASK \
+     (SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_MASK)
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_DEFAULT 0x0000cdcd
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_GET_ARRAY_SIZE(sq_cf_alloc_export_word1_buf) \
+     (((sq_cf_alloc_export_word1_buf) & SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_GET_COMP_MASK(sq_cf_alloc_export_word1_buf) \
+     (((sq_cf_alloc_export_word1_buf) & SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_SHIFT)
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_SET_ARRAY_SIZE(sq_cf_alloc_export_word1_buf_reg, array_size) \
+     ((sq_cf_alloc_export_word1_buf_reg) = ((sq_cf_alloc_export_word1_buf_reg) & ~SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_MASK) | ((array_size) << SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_SHIFT))
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_SET_COMP_MASK(sq_cf_alloc_export_word1_buf_reg, comp_mask) \
+     ((sq_cf_alloc_export_word1_buf_reg) = ((sq_cf_alloc_export_word1_buf_reg) & ~SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_MASK) | ((comp_mask) << SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_SHIFT))
+
+#if		defined(LITTLEENDIAN_CPU)
+     /* Bit-field overlay of SQ_CF_ALLOC_EXPORT_WORD1_BUF; members follow ascending *_SHIFT order (assumes the compiler allocates bit-fields LSB-first -- TODO confirm). */
+     typedef struct _sq_cf_alloc_export_word1_buf_t {
+          unsigned int array_size                     : SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_SIZE;
+          unsigned int comp_mask                      : SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_SIZE;
+          unsigned int                                : 16; /* unnamed padding: 12 + 4 named bits + 16 = 32 */
+     } sq_cf_alloc_export_word1_buf_t;
+
+#elif		defined(BIGENDIAN_CPU)
+     /* Same members declared in the opposite order for BIGENDIAN_CPU builds. */
+     typedef struct _sq_cf_alloc_export_word1_buf_t {
+          unsigned int                                : 16; /* unnamed padding, same 16 bits as in the other branch */
+          unsigned int comp_mask                      : SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_SIZE;
+          unsigned int array_size                     : SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_SIZE;
+     } sq_cf_alloc_export_word1_buf_t;
+     /* No #else branch: when neither endian macro is defined the type is left undeclared. */
+#endif
+
+typedef union { /* One SQ_CF_ALLOC_EXPORT_WORD1_BUF word, accessible raw or per field. */
+     unsigned int val : 32; /* raw word, declared as a 32-bit bit-field */
+     sq_cf_alloc_export_word1_buf_t f; /* named-field view of the same storage */
+} sq_cf_alloc_export_word1_buf_u;
+
+
+/* SQ_CF_ALLOC_EXPORT_WORD1_SWIZ struct: per-field SIZE (bit width), SHIFT (lowest bit index) and MASK constants, plus accessors.
+ * GET_<FIELD>(w) yields ((w) & MASK) >> SHIFT; SET_<FIELD>(reg, v) assigns lvalue reg with FIELD replaced by (v) << SHIFT.
+ * SET does not mask v (it must already fit the field) and evaluates reg twice. DEFAULT equals 0xcdcdcdcd & MASK. */
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_SIZE 3
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_SIZE 3
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_SIZE 3
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_SIZE 3
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_SHIFT 0
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_SHIFT 3
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_SHIFT 6
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_SHIFT 9
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_MASK 0x00000007
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_MASK 0x00000038
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_MASK 0x000001c0
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_MASK 0x00000e00
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_MASK \
+     (SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_MASK)
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_DEFAULT 0x00000dcd
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_GET_SEL_X(sq_cf_alloc_export_word1_swiz) \
+     (((sq_cf_alloc_export_word1_swiz) & SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_GET_SEL_Y(sq_cf_alloc_export_word1_swiz) \
+     (((sq_cf_alloc_export_word1_swiz) & SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_GET_SEL_Z(sq_cf_alloc_export_word1_swiz) \
+     (((sq_cf_alloc_export_word1_swiz) & SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_GET_SEL_W(sq_cf_alloc_export_word1_swiz) \
+     (((sq_cf_alloc_export_word1_swiz) & SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_SHIFT)
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SET_SEL_X(sq_cf_alloc_export_word1_swiz_reg, sel_x) \
+     ((sq_cf_alloc_export_word1_swiz_reg) = ((sq_cf_alloc_export_word1_swiz_reg) & ~SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_MASK) | ((sel_x) << SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_SHIFT))
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SET_SEL_Y(sq_cf_alloc_export_word1_swiz_reg, sel_y) \
+     ((sq_cf_alloc_export_word1_swiz_reg) = ((sq_cf_alloc_export_word1_swiz_reg) & ~SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_MASK) | ((sel_y) << SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_SHIFT))
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SET_SEL_Z(sq_cf_alloc_export_word1_swiz_reg, sel_z) \
+     ((sq_cf_alloc_export_word1_swiz_reg) = ((sq_cf_alloc_export_word1_swiz_reg) & ~SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_MASK) | ((sel_z) << SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_SHIFT))
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SET_SEL_W(sq_cf_alloc_export_word1_swiz_reg, sel_w) \
+     ((sq_cf_alloc_export_word1_swiz_reg) = ((sq_cf_alloc_export_word1_swiz_reg) & ~SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_MASK) | ((sel_w) << SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_SHIFT))
+
+#if		defined(LITTLEENDIAN_CPU)
+     /* Bit-field overlay of SQ_CF_ALLOC_EXPORT_WORD1_SWIZ; members follow ascending *_SHIFT order (assumes the compiler allocates bit-fields LSB-first -- TODO confirm). */
+     typedef struct _sq_cf_alloc_export_word1_swiz_t {
+          unsigned int sel_x                          : SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_SIZE;
+          unsigned int sel_y                          : SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_SIZE;
+          unsigned int sel_z                          : SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_SIZE;
+          unsigned int sel_w                          : SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_SIZE;
+          unsigned int                                : 20; /* unnamed padding: 4 x 3 named bits + 20 = 32 */
+     } sq_cf_alloc_export_word1_swiz_t;
+
+#elif		defined(BIGENDIAN_CPU)
+     /* Same members declared in the opposite order for BIGENDIAN_CPU builds. */
+     typedef struct _sq_cf_alloc_export_word1_swiz_t {
+          unsigned int                                : 20; /* unnamed padding, same 20 bits as in the other branch */
+          unsigned int sel_w                          : SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_SIZE;
+          unsigned int sel_z                          : SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_SIZE;
+          unsigned int sel_y                          : SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_SIZE;
+          unsigned int sel_x                          : SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_SIZE;
+     } sq_cf_alloc_export_word1_swiz_t;
+     /* No #else branch: when neither endian macro is defined the type is left undeclared. */
+#endif
+
+typedef union { /* One SQ_CF_ALLOC_EXPORT_WORD1_SWIZ word, accessible raw or per field. */
+     unsigned int val : 32; /* raw word, declared as a 32-bit bit-field */
+     sq_cf_alloc_export_word1_swiz_t f; /* named-field view of the same storage */
+} sq_cf_alloc_export_word1_swiz_u;
+
+
+/* SQ_ALU_WORD0 struct: per-field SIZE (bit width), SHIFT (lowest bit index) and MASK constants, plus accessors.
+ * GET_<FIELD>(w) yields ((w) & MASK) >> SHIFT; SET_<FIELD>(reg, v) assigns lvalue reg with FIELD replaced by (v) << SHIFT.
+ * SET does not mask v (it must already fit the field) and evaluates reg twice. DEFAULT equals 0xcdcdcdcd & MASK. */
+
+#define SQ_ALU_WORD0_SRC0_SEL_SIZE     9
+#define SQ_ALU_WORD0_SRC0_REL_SIZE     1
+#define SQ_ALU_WORD0_SRC0_CHAN_SIZE    2
+#define SQ_ALU_WORD0_SRC0_NEG_SIZE     1
+#define SQ_ALU_WORD0_SRC1_SEL_SIZE     9
+#define SQ_ALU_WORD0_SRC1_REL_SIZE     1
+#define SQ_ALU_WORD0_SRC1_CHAN_SIZE    2
+#define SQ_ALU_WORD0_SRC1_NEG_SIZE     1
+#define SQ_ALU_WORD0_INDEX_MODE_SIZE   3
+#define SQ_ALU_WORD0_PRED_SEL_SIZE     2
+#define SQ_ALU_WORD0_LAST_SIZE         1
+
+#define SQ_ALU_WORD0_SRC0_SEL_SHIFT    0
+#define SQ_ALU_WORD0_SRC0_REL_SHIFT    9
+#define SQ_ALU_WORD0_SRC0_CHAN_SHIFT   10
+#define SQ_ALU_WORD0_SRC0_NEG_SHIFT    12
+#define SQ_ALU_WORD0_SRC1_SEL_SHIFT    13
+#define SQ_ALU_WORD0_SRC1_REL_SHIFT    22
+#define SQ_ALU_WORD0_SRC1_CHAN_SHIFT   23
+#define SQ_ALU_WORD0_SRC1_NEG_SHIFT    25
+#define SQ_ALU_WORD0_INDEX_MODE_SHIFT  26
+#define SQ_ALU_WORD0_PRED_SEL_SHIFT    29
+#define SQ_ALU_WORD0_LAST_SHIFT        31
+
+#define SQ_ALU_WORD0_SRC0_SEL_MASK     0x000001ff
+#define SQ_ALU_WORD0_SRC0_REL_MASK     0x00000200
+#define SQ_ALU_WORD0_SRC0_CHAN_MASK    0x00000c00
+#define SQ_ALU_WORD0_SRC0_NEG_MASK     0x00001000
+#define SQ_ALU_WORD0_SRC1_SEL_MASK     0x003fe000
+#define SQ_ALU_WORD0_SRC1_REL_MASK     0x00400000
+#define SQ_ALU_WORD0_SRC1_CHAN_MASK    0x01800000
+#define SQ_ALU_WORD0_SRC1_NEG_MASK     0x02000000
+#define SQ_ALU_WORD0_INDEX_MODE_MASK   0x1c000000
+#define SQ_ALU_WORD0_PRED_SEL_MASK     0x60000000
+#define SQ_ALU_WORD0_LAST_MASK         0x80000000
+
+#define SQ_ALU_WORD0_MASK \
+     (SQ_ALU_WORD0_SRC0_SEL_MASK | \
+      SQ_ALU_WORD0_SRC0_REL_MASK | \
+      SQ_ALU_WORD0_SRC0_CHAN_MASK | \
+      SQ_ALU_WORD0_SRC0_NEG_MASK | \
+      SQ_ALU_WORD0_SRC1_SEL_MASK | \
+      SQ_ALU_WORD0_SRC1_REL_MASK | \
+      SQ_ALU_WORD0_SRC1_CHAN_MASK | \
+      SQ_ALU_WORD0_SRC1_NEG_MASK | \
+      SQ_ALU_WORD0_INDEX_MODE_MASK | \
+      SQ_ALU_WORD0_PRED_SEL_MASK | \
+      SQ_ALU_WORD0_LAST_MASK)
+
+#define SQ_ALU_WORD0_DEFAULT           0xcdcdcdcd
+
+#define SQ_ALU_WORD0_GET_SRC0_SEL(sq_alu_word0) \
+     (((sq_alu_word0) & SQ_ALU_WORD0_SRC0_SEL_MASK) >> SQ_ALU_WORD0_SRC0_SEL_SHIFT)
+#define SQ_ALU_WORD0_GET_SRC0_REL(sq_alu_word0) \
+     (((sq_alu_word0) & SQ_ALU_WORD0_SRC0_REL_MASK) >> SQ_ALU_WORD0_SRC0_REL_SHIFT)
+#define SQ_ALU_WORD0_GET_SRC0_CHAN(sq_alu_word0) \
+     (((sq_alu_word0) & SQ_ALU_WORD0_SRC0_CHAN_MASK) >> SQ_ALU_WORD0_SRC0_CHAN_SHIFT)
+#define SQ_ALU_WORD0_GET_SRC0_NEG(sq_alu_word0) \
+     (((sq_alu_word0) & SQ_ALU_WORD0_SRC0_NEG_MASK) >> SQ_ALU_WORD0_SRC0_NEG_SHIFT)
+#define SQ_ALU_WORD0_GET_SRC1_SEL(sq_alu_word0) \
+     (((sq_alu_word0) & SQ_ALU_WORD0_SRC1_SEL_MASK) >> SQ_ALU_WORD0_SRC1_SEL_SHIFT)
+#define SQ_ALU_WORD0_GET_SRC1_REL(sq_alu_word0) \
+     (((sq_alu_word0) & SQ_ALU_WORD0_SRC1_REL_MASK) >> SQ_ALU_WORD0_SRC1_REL_SHIFT)
+#define SQ_ALU_WORD0_GET_SRC1_CHAN(sq_alu_word0) \
+     (((sq_alu_word0) & SQ_ALU_WORD0_SRC1_CHAN_MASK) >> SQ_ALU_WORD0_SRC1_CHAN_SHIFT)
+#define SQ_ALU_WORD0_GET_SRC1_NEG(sq_alu_word0) \
+     (((sq_alu_word0) & SQ_ALU_WORD0_SRC1_NEG_MASK) >> SQ_ALU_WORD0_SRC1_NEG_SHIFT)
+#define SQ_ALU_WORD0_GET_INDEX_MODE(sq_alu_word0) \
+     (((sq_alu_word0) & SQ_ALU_WORD0_INDEX_MODE_MASK) >> SQ_ALU_WORD0_INDEX_MODE_SHIFT)
+#define SQ_ALU_WORD0_GET_PRED_SEL(sq_alu_word0) \
+     (((sq_alu_word0) & SQ_ALU_WORD0_PRED_SEL_MASK) >> SQ_ALU_WORD0_PRED_SEL_SHIFT)
+#define SQ_ALU_WORD0_GET_LAST(sq_alu_word0) \
+     (((sq_alu_word0) & SQ_ALU_WORD0_LAST_MASK) >> SQ_ALU_WORD0_LAST_SHIFT)
+
+#define SQ_ALU_WORD0_SET_SRC0_SEL(sq_alu_word0_reg, src0_sel) \
+     ((sq_alu_word0_reg) = ((sq_alu_word0_reg) & ~SQ_ALU_WORD0_SRC0_SEL_MASK) | ((src0_sel) << SQ_ALU_WORD0_SRC0_SEL_SHIFT))
+#define SQ_ALU_WORD0_SET_SRC0_REL(sq_alu_word0_reg, src0_rel) \
+     ((sq_alu_word0_reg) = ((sq_alu_word0_reg) & ~SQ_ALU_WORD0_SRC0_REL_MASK) | ((src0_rel) << SQ_ALU_WORD0_SRC0_REL_SHIFT))
+#define SQ_ALU_WORD0_SET_SRC0_CHAN(sq_alu_word0_reg, src0_chan) \
+     ((sq_alu_word0_reg) = ((sq_alu_word0_reg) & ~SQ_ALU_WORD0_SRC0_CHAN_MASK) | ((src0_chan) << SQ_ALU_WORD0_SRC0_CHAN_SHIFT))
+#define SQ_ALU_WORD0_SET_SRC0_NEG(sq_alu_word0_reg, src0_neg) \
+     ((sq_alu_word0_reg) = ((sq_alu_word0_reg) & ~SQ_ALU_WORD0_SRC0_NEG_MASK) | ((src0_neg) << SQ_ALU_WORD0_SRC0_NEG_SHIFT))
+#define SQ_ALU_WORD0_SET_SRC1_SEL(sq_alu_word0_reg, src1_sel) \
+     ((sq_alu_word0_reg) = ((sq_alu_word0_reg) & ~SQ_ALU_WORD0_SRC1_SEL_MASK) | ((src1_sel) << SQ_ALU_WORD0_SRC1_SEL_SHIFT))
+#define SQ_ALU_WORD0_SET_SRC1_REL(sq_alu_word0_reg, src1_rel) \
+     ((sq_alu_word0_reg) = ((sq_alu_word0_reg) & ~SQ_ALU_WORD0_SRC1_REL_MASK) | ((src1_rel) << SQ_ALU_WORD0_SRC1_REL_SHIFT))
+#define SQ_ALU_WORD0_SET_SRC1_CHAN(sq_alu_word0_reg, src1_chan) \
+     ((sq_alu_word0_reg) = ((sq_alu_word0_reg) & ~SQ_ALU_WORD0_SRC1_CHAN_MASK) | ((src1_chan) << SQ_ALU_WORD0_SRC1_CHAN_SHIFT))
+#define SQ_ALU_WORD0_SET_SRC1_NEG(sq_alu_word0_reg, src1_neg) \
+     ((sq_alu_word0_reg) = ((sq_alu_word0_reg) & ~SQ_ALU_WORD0_SRC1_NEG_MASK) | ((src1_neg) << SQ_ALU_WORD0_SRC1_NEG_SHIFT))
+#define SQ_ALU_WORD0_SET_INDEX_MODE(sq_alu_word0_reg, index_mode) \
+     ((sq_alu_word0_reg) = ((sq_alu_word0_reg) & ~SQ_ALU_WORD0_INDEX_MODE_MASK) | ((index_mode) << SQ_ALU_WORD0_INDEX_MODE_SHIFT))
+#define SQ_ALU_WORD0_SET_PRED_SEL(sq_alu_word0_reg, pred_sel) \
+     ((sq_alu_word0_reg) = ((sq_alu_word0_reg) & ~SQ_ALU_WORD0_PRED_SEL_MASK) | ((pred_sel) << SQ_ALU_WORD0_PRED_SEL_SHIFT))
+#define SQ_ALU_WORD0_SET_LAST(sq_alu_word0_reg, last) \
+     ((sq_alu_word0_reg) = ((sq_alu_word0_reg) & ~SQ_ALU_WORD0_LAST_MASK) | ((last) << SQ_ALU_WORD0_LAST_SHIFT))
+
+#if		defined(LITTLEENDIAN_CPU)
+     /* Bit-field overlay of SQ_ALU_WORD0; members follow ascending *_SHIFT order and their widths sum to 32 (assumes the compiler allocates bit-fields LSB-first -- TODO confirm). */
+     typedef struct _sq_alu_word0_t {
+          unsigned int src0_sel                       : SQ_ALU_WORD0_SRC0_SEL_SIZE;
+          unsigned int src0_rel                       : SQ_ALU_WORD0_SRC0_REL_SIZE;
+          unsigned int src0_chan                      : SQ_ALU_WORD0_SRC0_CHAN_SIZE;
+          unsigned int src0_neg                       : SQ_ALU_WORD0_SRC0_NEG_SIZE;
+          unsigned int src1_sel                       : SQ_ALU_WORD0_SRC1_SEL_SIZE;
+          unsigned int src1_rel                       : SQ_ALU_WORD0_SRC1_REL_SIZE;
+          unsigned int src1_chan                      : SQ_ALU_WORD0_SRC1_CHAN_SIZE;
+          unsigned int src1_neg                       : SQ_ALU_WORD0_SRC1_NEG_SIZE;
+          unsigned int index_mode                     : SQ_ALU_WORD0_INDEX_MODE_SIZE;
+          unsigned int pred_sel                       : SQ_ALU_WORD0_PRED_SEL_SIZE;
+          unsigned int last                           : SQ_ALU_WORD0_LAST_SIZE;
+     } sq_alu_word0_t;
+
+#elif		defined(BIGENDIAN_CPU)
+     /* Same members declared in the opposite order for BIGENDIAN_CPU builds. */
+     typedef struct _sq_alu_word0_t {
+          unsigned int last                           : SQ_ALU_WORD0_LAST_SIZE;
+          unsigned int pred_sel                       : SQ_ALU_WORD0_PRED_SEL_SIZE;
+          unsigned int index_mode                     : SQ_ALU_WORD0_INDEX_MODE_SIZE;
+          unsigned int src1_neg                       : SQ_ALU_WORD0_SRC1_NEG_SIZE;
+          unsigned int src1_chan                      : SQ_ALU_WORD0_SRC1_CHAN_SIZE;
+          unsigned int src1_rel                       : SQ_ALU_WORD0_SRC1_REL_SIZE;
+          unsigned int src1_sel                       : SQ_ALU_WORD0_SRC1_SEL_SIZE;
+          unsigned int src0_neg                       : SQ_ALU_WORD0_SRC0_NEG_SIZE;
+          unsigned int src0_chan                      : SQ_ALU_WORD0_SRC0_CHAN_SIZE;
+          unsigned int src0_rel                       : SQ_ALU_WORD0_SRC0_REL_SIZE;
+          unsigned int src0_sel                       : SQ_ALU_WORD0_SRC0_SEL_SIZE;
+     } sq_alu_word0_t;
+     /* No #else branch: when neither endian macro is defined the type is left undeclared. */
+#endif
+
+typedef union { /* One SQ_ALU_WORD0 word, accessible raw or per field. */
+     unsigned int val : 32; /* raw word, declared as a 32-bit bit-field */
+     sq_alu_word0_t f; /* named-field view of the same storage */
+} sq_alu_word0_u;
+
+
+/* SQ_ALU_WORD1 struct: per-field SIZE (bit width), SHIFT (lowest bit index) and MASK constants, plus accessors; only bits 15..31 are named here.
+ * GET_<FIELD>(w) yields ((w) & MASK) >> SHIFT; SET_<FIELD>(reg, v) assigns lvalue reg with FIELD replaced by (v) << SHIFT.
+ * SET does not mask v (it must already fit the field) and evaluates reg twice. DEFAULT equals 0xcdcdcdcd & MASK. */
+
+#define SQ_ALU_WORD1_ENCODING_SIZE     3
+#define SQ_ALU_WORD1_BANK_SWIZZLE_SIZE 3
+#define SQ_ALU_WORD1_DST_GPR_SIZE      7
+#define SQ_ALU_WORD1_DST_REL_SIZE      1
+#define SQ_ALU_WORD1_DST_CHAN_SIZE     2
+#define SQ_ALU_WORD1_CLAMP_SIZE        1
+
+#define SQ_ALU_WORD1_ENCODING_SHIFT    15
+#define SQ_ALU_WORD1_BANK_SWIZZLE_SHIFT 18
+#define SQ_ALU_WORD1_DST_GPR_SHIFT     21
+#define SQ_ALU_WORD1_DST_REL_SHIFT     28
+#define SQ_ALU_WORD1_DST_CHAN_SHIFT    29
+#define SQ_ALU_WORD1_CLAMP_SHIFT       31
+
+#define SQ_ALU_WORD1_ENCODING_MASK     0x00038000
+#define SQ_ALU_WORD1_BANK_SWIZZLE_MASK 0x001c0000
+#define SQ_ALU_WORD1_DST_GPR_MASK      0x0fe00000
+#define SQ_ALU_WORD1_DST_REL_MASK      0x10000000
+#define SQ_ALU_WORD1_DST_CHAN_MASK     0x60000000
+#define SQ_ALU_WORD1_CLAMP_MASK        0x80000000
+
+#define SQ_ALU_WORD1_MASK \
+     (SQ_ALU_WORD1_ENCODING_MASK | \
+      SQ_ALU_WORD1_BANK_SWIZZLE_MASK | \
+      SQ_ALU_WORD1_DST_GPR_MASK | \
+      SQ_ALU_WORD1_DST_REL_MASK | \
+      SQ_ALU_WORD1_DST_CHAN_MASK | \
+      SQ_ALU_WORD1_CLAMP_MASK)
+
+#define SQ_ALU_WORD1_DEFAULT           0xcdcd8000
+
+#define SQ_ALU_WORD1_GET_ENCODING(sq_alu_word1) \
+     (((sq_alu_word1) & SQ_ALU_WORD1_ENCODING_MASK) >> SQ_ALU_WORD1_ENCODING_SHIFT)
+#define SQ_ALU_WORD1_GET_BANK_SWIZZLE(sq_alu_word1) \
+     (((sq_alu_word1) & SQ_ALU_WORD1_BANK_SWIZZLE_MASK) >> SQ_ALU_WORD1_BANK_SWIZZLE_SHIFT)
+#define SQ_ALU_WORD1_GET_DST_GPR(sq_alu_word1) \
+     (((sq_alu_word1) & SQ_ALU_WORD1_DST_GPR_MASK) >> SQ_ALU_WORD1_DST_GPR_SHIFT)
+#define SQ_ALU_WORD1_GET_DST_REL(sq_alu_word1) \
+     (((sq_alu_word1) & SQ_ALU_WORD1_DST_REL_MASK) >> SQ_ALU_WORD1_DST_REL_SHIFT)
+#define SQ_ALU_WORD1_GET_DST_CHAN(sq_alu_word1) \
+     (((sq_alu_word1) & SQ_ALU_WORD1_DST_CHAN_MASK) >> SQ_ALU_WORD1_DST_CHAN_SHIFT)
+#define SQ_ALU_WORD1_GET_CLAMP(sq_alu_word1) \
+     (((sq_alu_word1) & SQ_ALU_WORD1_CLAMP_MASK) >> SQ_ALU_WORD1_CLAMP_SHIFT)
+
+#define SQ_ALU_WORD1_SET_ENCODING(sq_alu_word1_reg, encoding) \
+     ((sq_alu_word1_reg) = ((sq_alu_word1_reg) & ~SQ_ALU_WORD1_ENCODING_MASK) | ((encoding) << SQ_ALU_WORD1_ENCODING_SHIFT))
+#define SQ_ALU_WORD1_SET_BANK_SWIZZLE(sq_alu_word1_reg, bank_swizzle) \
+     ((sq_alu_word1_reg) = ((sq_alu_word1_reg) & ~SQ_ALU_WORD1_BANK_SWIZZLE_MASK) | ((bank_swizzle) << SQ_ALU_WORD1_BANK_SWIZZLE_SHIFT))
+#define SQ_ALU_WORD1_SET_DST_GPR(sq_alu_word1_reg, dst_gpr) \
+     ((sq_alu_word1_reg) = ((sq_alu_word1_reg) & ~SQ_ALU_WORD1_DST_GPR_MASK) | ((dst_gpr) << SQ_ALU_WORD1_DST_GPR_SHIFT))
+#define SQ_ALU_WORD1_SET_DST_REL(sq_alu_word1_reg, dst_rel) \
+     ((sq_alu_word1_reg) = ((sq_alu_word1_reg) & ~SQ_ALU_WORD1_DST_REL_MASK) | ((dst_rel) << SQ_ALU_WORD1_DST_REL_SHIFT))
+#define SQ_ALU_WORD1_SET_DST_CHAN(sq_alu_word1_reg, dst_chan) \
+     ((sq_alu_word1_reg) = ((sq_alu_word1_reg) & ~SQ_ALU_WORD1_DST_CHAN_MASK) | ((dst_chan) << SQ_ALU_WORD1_DST_CHAN_SHIFT))
+#define SQ_ALU_WORD1_SET_CLAMP(sq_alu_word1_reg, clamp) \
+     ((sq_alu_word1_reg) = ((sq_alu_word1_reg) & ~SQ_ALU_WORD1_CLAMP_MASK) | ((clamp) << SQ_ALU_WORD1_CLAMP_SHIFT))
+
+#if		defined(LITTLEENDIAN_CPU)
+     /* Bit-field overlay of SQ_ALU_WORD1; members follow ascending *_SHIFT order (assumes the compiler allocates bit-fields LSB-first -- TODO confirm). */
+     typedef struct _sq_alu_word1_t {
+          unsigned int                                : 15; /* unnamed padding; 15 == ENCODING_SHIFT, the lowest named field */
+          unsigned int encoding                       : SQ_ALU_WORD1_ENCODING_SIZE;
+          unsigned int bank_swizzle                   : SQ_ALU_WORD1_BANK_SWIZZLE_SIZE;
+          unsigned int dst_gpr                        : SQ_ALU_WORD1_DST_GPR_SIZE;
+          unsigned int dst_rel                        : SQ_ALU_WORD1_DST_REL_SIZE;
+          unsigned int dst_chan                       : SQ_ALU_WORD1_DST_CHAN_SIZE;
+          unsigned int clamp                          : SQ_ALU_WORD1_CLAMP_SIZE;
+     } sq_alu_word1_t;
+
+#elif		defined(BIGENDIAN_CPU)
+     /* Same members declared in the opposite order for BIGENDIAN_CPU builds. */
+     typedef struct _sq_alu_word1_t {
+          unsigned int clamp                          : SQ_ALU_WORD1_CLAMP_SIZE;
+          unsigned int dst_chan                       : SQ_ALU_WORD1_DST_CHAN_SIZE;
+          unsigned int dst_rel                        : SQ_ALU_WORD1_DST_REL_SIZE;
+          unsigned int dst_gpr                        : SQ_ALU_WORD1_DST_GPR_SIZE;
+          unsigned int bank_swizzle                   : SQ_ALU_WORD1_BANK_SWIZZLE_SIZE;
+          unsigned int encoding                       : SQ_ALU_WORD1_ENCODING_SIZE;
+          unsigned int                                : 15; /* unnamed padding, same 15 bits as in the other branch */
+     } sq_alu_word1_t;
+     /* No #else branch: when neither endian macro is defined the type is left undeclared. */
+#endif
+
+typedef union { /* One SQ_ALU_WORD1 word, accessible raw or per field. */
+     unsigned int val : 32; /* raw word, declared as a 32-bit bit-field */
+     sq_alu_word1_t f; /* named-field view of the same storage */
+} sq_alu_word1_u;
+
+
+/* SQ_ALU_WORD1_OP2_V2 struct: per-field SIZE (bit width), SHIFT (lowest bit index) and MASK constants, plus accessors; only bits 0..17 are named here.
+ * GET_<FIELD>(w) yields ((w) & MASK) >> SHIFT; SET_<FIELD>(reg, v) assigns lvalue reg with FIELD replaced by (v) << SHIFT.
+ * SET does not mask v (it must already fit the field) and evaluates reg twice. DEFAULT equals 0xcdcdcdcd & MASK. */
+
+#define SQ_ALU_WORD1_OP2_V2_SRC0_ABS_SIZE 1
+#define SQ_ALU_WORD1_OP2_V2_SRC1_ABS_SIZE 1
+#define SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_SIZE 1
+#define SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_SIZE 1
+#define SQ_ALU_WORD1_OP2_V2_WRITE_MASK_SIZE 1
+#define SQ_ALU_WORD1_OP2_V2_OMOD_SIZE  2
+#define SQ_ALU_WORD1_OP2_V2_ALU_INST_SIZE 11
+
+#define SQ_ALU_WORD1_OP2_V2_SRC0_ABS_SHIFT 0
+#define SQ_ALU_WORD1_OP2_V2_SRC1_ABS_SHIFT 1
+#define SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_SHIFT 2
+#define SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_SHIFT 3
+#define SQ_ALU_WORD1_OP2_V2_WRITE_MASK_SHIFT 4
+#define SQ_ALU_WORD1_OP2_V2_OMOD_SHIFT 5
+#define SQ_ALU_WORD1_OP2_V2_ALU_INST_SHIFT 7
+
+#define SQ_ALU_WORD1_OP2_V2_SRC0_ABS_MASK 0x00000001
+#define SQ_ALU_WORD1_OP2_V2_SRC1_ABS_MASK 0x00000002
+#define SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_MASK 0x00000004
+#define SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_MASK 0x00000008
+#define SQ_ALU_WORD1_OP2_V2_WRITE_MASK_MASK 0x00000010
+#define SQ_ALU_WORD1_OP2_V2_OMOD_MASK  0x00000060
+#define SQ_ALU_WORD1_OP2_V2_ALU_INST_MASK 0x0003ff80
+
+#define SQ_ALU_WORD1_OP2_V2_MASK \
+     (SQ_ALU_WORD1_OP2_V2_SRC0_ABS_MASK | \
+      SQ_ALU_WORD1_OP2_V2_SRC1_ABS_MASK | \
+      SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_MASK | \
+      SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_MASK | \
+      SQ_ALU_WORD1_OP2_V2_WRITE_MASK_MASK | \
+      SQ_ALU_WORD1_OP2_V2_OMOD_MASK | \
+      SQ_ALU_WORD1_OP2_V2_ALU_INST_MASK)
+
+#define SQ_ALU_WORD1_OP2_V2_DEFAULT    0x0001cdcd
+
+#define SQ_ALU_WORD1_OP2_V2_GET_SRC0_ABS(sq_alu_word1_op2_v2) \
+     (((sq_alu_word1_op2_v2) & SQ_ALU_WORD1_OP2_V2_SRC0_ABS_MASK) >> SQ_ALU_WORD1_OP2_V2_SRC0_ABS_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_GET_SRC1_ABS(sq_alu_word1_op2_v2) \
+     (((sq_alu_word1_op2_v2) & SQ_ALU_WORD1_OP2_V2_SRC1_ABS_MASK) >> SQ_ALU_WORD1_OP2_V2_SRC1_ABS_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_GET_UPDATE_EXECUTE_MASK(sq_alu_word1_op2_v2) \
+     (((sq_alu_word1_op2_v2) & SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_MASK) >> SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_GET_UPDATE_PRED(sq_alu_word1_op2_v2) \
+     (((sq_alu_word1_op2_v2) & SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_MASK) >> SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_GET_WRITE_MASK(sq_alu_word1_op2_v2) \
+     (((sq_alu_word1_op2_v2) & SQ_ALU_WORD1_OP2_V2_WRITE_MASK_MASK) >> SQ_ALU_WORD1_OP2_V2_WRITE_MASK_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_GET_OMOD(sq_alu_word1_op2_v2) \
+     (((sq_alu_word1_op2_v2) & SQ_ALU_WORD1_OP2_V2_OMOD_MASK) >> SQ_ALU_WORD1_OP2_V2_OMOD_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_GET_ALU_INST(sq_alu_word1_op2_v2) \
+     (((sq_alu_word1_op2_v2) & SQ_ALU_WORD1_OP2_V2_ALU_INST_MASK) >> SQ_ALU_WORD1_OP2_V2_ALU_INST_SHIFT)
+
+#define SQ_ALU_WORD1_OP2_V2_SET_SRC0_ABS(sq_alu_word1_op2_v2_reg, src0_abs) \
+     ((sq_alu_word1_op2_v2_reg) = ((sq_alu_word1_op2_v2_reg) & ~SQ_ALU_WORD1_OP2_V2_SRC0_ABS_MASK) | ((src0_abs) << SQ_ALU_WORD1_OP2_V2_SRC0_ABS_SHIFT))
+#define SQ_ALU_WORD1_OP2_V2_SET_SRC1_ABS(sq_alu_word1_op2_v2_reg, src1_abs) \
+     ((sq_alu_word1_op2_v2_reg) = ((sq_alu_word1_op2_v2_reg) & ~SQ_ALU_WORD1_OP2_V2_SRC1_ABS_MASK) | ((src1_abs) << SQ_ALU_WORD1_OP2_V2_SRC1_ABS_SHIFT))
+#define SQ_ALU_WORD1_OP2_V2_SET_UPDATE_EXECUTE_MASK(sq_alu_word1_op2_v2_reg, update_execute_mask) \
+     ((sq_alu_word1_op2_v2_reg) = ((sq_alu_word1_op2_v2_reg) & ~SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_MASK) | ((update_execute_mask) << SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_SHIFT))
+#define SQ_ALU_WORD1_OP2_V2_SET_UPDATE_PRED(sq_alu_word1_op2_v2_reg, update_pred) \
+     ((sq_alu_word1_op2_v2_reg) = ((sq_alu_word1_op2_v2_reg) & ~SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_MASK) | ((update_pred) << SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_SHIFT))
+#define SQ_ALU_WORD1_OP2_V2_SET_WRITE_MASK(sq_alu_word1_op2_v2_reg, write_mask) \
+     ((sq_alu_word1_op2_v2_reg) = ((sq_alu_word1_op2_v2_reg) & ~SQ_ALU_WORD1_OP2_V2_WRITE_MASK_MASK) | ((write_mask) << SQ_ALU_WORD1_OP2_V2_WRITE_MASK_SHIFT))
+#define SQ_ALU_WORD1_OP2_V2_SET_OMOD(sq_alu_word1_op2_v2_reg, omod) \
+     ((sq_alu_word1_op2_v2_reg) = ((sq_alu_word1_op2_v2_reg) & ~SQ_ALU_WORD1_OP2_V2_OMOD_MASK) | ((omod) << SQ_ALU_WORD1_OP2_V2_OMOD_SHIFT))
+#define SQ_ALU_WORD1_OP2_V2_SET_ALU_INST(sq_alu_word1_op2_v2_reg, alu_inst) \
+     ((sq_alu_word1_op2_v2_reg) = ((sq_alu_word1_op2_v2_reg) & ~SQ_ALU_WORD1_OP2_V2_ALU_INST_MASK) | ((alu_inst) << SQ_ALU_WORD1_OP2_V2_ALU_INST_SHIFT))
+
+#if		defined(LITTLEENDIAN_CPU)
+     /* Bit-field overlay of SQ_ALU_WORD1_OP2_V2; members follow ascending *_SHIFT order (assumes the compiler allocates bit-fields LSB-first -- TODO confirm). */
+     typedef struct _sq_alu_word1_op2_v2_t {
+          unsigned int src0_abs                       : SQ_ALU_WORD1_OP2_V2_SRC0_ABS_SIZE;
+          unsigned int src1_abs                       : SQ_ALU_WORD1_OP2_V2_SRC1_ABS_SIZE;
+          unsigned int update_execute_mask            : SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_SIZE;
+          unsigned int update_pred                    : SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_SIZE;
+          unsigned int write_mask                     : SQ_ALU_WORD1_OP2_V2_WRITE_MASK_SIZE;
+          unsigned int omod                           : SQ_ALU_WORD1_OP2_V2_OMOD_SIZE;
+          unsigned int alu_inst                       : SQ_ALU_WORD1_OP2_V2_ALU_INST_SIZE;
+          unsigned int                                : 14; /* unnamed padding: 18 named bits + 14 = 32 */
+     } sq_alu_word1_op2_v2_t;
+
+#elif		defined(BIGENDIAN_CPU)
+     /* Same members declared in the opposite order for BIGENDIAN_CPU builds. */
+     typedef struct _sq_alu_word1_op2_v2_t {
+          unsigned int                                : 14; /* unnamed padding, same 14 bits as in the other branch */
+          unsigned int alu_inst                       : SQ_ALU_WORD1_OP2_V2_ALU_INST_SIZE;
+          unsigned int omod                           : SQ_ALU_WORD1_OP2_V2_OMOD_SIZE;
+          unsigned int write_mask                     : SQ_ALU_WORD1_OP2_V2_WRITE_MASK_SIZE;
+          unsigned int update_pred                    : SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_SIZE;
+          unsigned int update_execute_mask            : SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_SIZE;
+          unsigned int src1_abs                       : SQ_ALU_WORD1_OP2_V2_SRC1_ABS_SIZE;
+          unsigned int src0_abs                       : SQ_ALU_WORD1_OP2_V2_SRC0_ABS_SIZE;
+     } sq_alu_word1_op2_v2_t;
+     /* No #else branch: when neither endian macro is defined the type is left undeclared. */
+#endif
+
+#if		defined(LITTLEENDIAN_CPU)
+     /* Alternate OP2 overlay: struct tag is _sq_alu_word1_op2_r6xx_t but the typedef name is sq_alu_word1_op2_v1_t. Adds a 1-bit fog_export and uses a 10-bit alu_inst. */
+     typedef struct _sq_alu_word1_op2_r6xx_t {
+          unsigned int src0_abs                       : SQ_ALU_WORD1_OP2_V2_SRC0_ABS_SIZE;
+          unsigned int src1_abs                       : SQ_ALU_WORD1_OP2_V2_SRC1_ABS_SIZE;
+          unsigned int update_execute_mask            : SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_SIZE;
+          unsigned int update_pred                    : SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_SIZE;
+          unsigned int write_mask                     : SQ_ALU_WORD1_OP2_V2_WRITE_MASK_SIZE;
+          unsigned int fog_export                     : 1; /* NOTE(review): declared between write_mask and omod, so omod/alu_inst presumably no longer match the OP2_V2 OMOD/ALU_INST SHIFT and MASK macros -- confirm before using those macros on this layout */
+          unsigned int omod                           : SQ_ALU_WORD1_OP2_V2_OMOD_SIZE;
+          unsigned int alu_inst                       : 10; /* literal width; one bit narrower than SQ_ALU_WORD1_OP2_V2_ALU_INST_SIZE (11) */
+          unsigned int                                : 14; /* unnamed padding: 18 named bits + 14 = 32 */
+     } sq_alu_word1_op2_v1_t;
+
+#elif		defined(BIGENDIAN_CPU)
+     /* Same members declared in the opposite order for BIGENDIAN_CPU builds. */
+     typedef struct _sq_alu_word1_op2_r6xx_t {
+          unsigned int                                : 14; /* unnamed padding, same 14 bits as in the other branch */
+          unsigned int alu_inst                       : 10;
+          unsigned int omod                           : SQ_ALU_WORD1_OP2_V2_OMOD_SIZE;
+          unsigned int fog_export                     : 1;
+          unsigned int write_mask                     : SQ_ALU_WORD1_OP2_V2_WRITE_MASK_SIZE;
+          unsigned int update_pred                    : SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_SIZE;
+          unsigned int update_execute_mask            : SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_SIZE;
+          unsigned int src1_abs                       : SQ_ALU_WORD1_OP2_V2_SRC1_ABS_SIZE;
+          unsigned int src0_abs                       : SQ_ALU_WORD1_OP2_V2_SRC0_ABS_SIZE;
+     } sq_alu_word1_op2_v1_t;
+     /* No #else branch: when neither endian macro is defined the type is left undeclared. */
+#endif
+
+typedef union { /* One SQ_ALU_WORD1_OP2 word, accessible raw or through either field layout. */
+     unsigned int val : 32; /* raw word, declared as a 32-bit bit-field */
+     sq_alu_word1_op2_v2_t f; /* field view using the OP2_V2 layout (11-bit alu_inst, no fog_export) */
+     sq_alu_word1_op2_v1_t f6; /* field view using the v1 layout (struct tag _sq_alu_word1_op2_r6xx_t: fog_export bit, 10-bit alu_inst) */
+} sq_alu_word1_op2_v2_u;
+
+
+/* SQ_ALU_WORD1_OP3 struct: per-field SIZE (bit width), SHIFT (lowest bit index) and MASK constants, plus accessors; only bits 0..17 are named here.
+ * GET_<FIELD>(w) yields ((w) & MASK) >> SHIFT; SET_<FIELD>(reg, v) assigns lvalue reg with FIELD replaced by (v) << SHIFT.
+ * SET does not mask v (it must already fit the field) and evaluates reg twice. DEFAULT equals 0xcdcdcdcd & MASK. */
+
+#define SQ_ALU_WORD1_OP3_SRC2_SEL_SIZE 9
+#define SQ_ALU_WORD1_OP3_SRC2_REL_SIZE 1
+#define SQ_ALU_WORD1_OP3_SRC2_CHAN_SIZE 2
+#define SQ_ALU_WORD1_OP3_SRC2_NEG_SIZE 1
+#define SQ_ALU_WORD1_OP3_ALU_INST_SIZE 5
+
+#define SQ_ALU_WORD1_OP3_SRC2_SEL_SHIFT 0
+#define SQ_ALU_WORD1_OP3_SRC2_REL_SHIFT 9
+#define SQ_ALU_WORD1_OP3_SRC2_CHAN_SHIFT 10
+#define SQ_ALU_WORD1_OP3_SRC2_NEG_SHIFT 12
+#define SQ_ALU_WORD1_OP3_ALU_INST_SHIFT 13
+
+#define SQ_ALU_WORD1_OP3_SRC2_SEL_MASK 0x000001ff
+#define SQ_ALU_WORD1_OP3_SRC2_REL_MASK 0x00000200
+#define SQ_ALU_WORD1_OP3_SRC2_CHAN_MASK 0x00000c00
+#define SQ_ALU_WORD1_OP3_SRC2_NEG_MASK 0x00001000
+#define SQ_ALU_WORD1_OP3_ALU_INST_MASK 0x0003e000
+
+#define SQ_ALU_WORD1_OP3_MASK \
+     (SQ_ALU_WORD1_OP3_SRC2_SEL_MASK | \
+      SQ_ALU_WORD1_OP3_SRC2_REL_MASK | \
+      SQ_ALU_WORD1_OP3_SRC2_CHAN_MASK | \
+      SQ_ALU_WORD1_OP3_SRC2_NEG_MASK | \
+      SQ_ALU_WORD1_OP3_ALU_INST_MASK)
+
+#define SQ_ALU_WORD1_OP3_DEFAULT       0x0001cdcd
+
+#define SQ_ALU_WORD1_OP3_GET_SRC2_SEL(sq_alu_word1_op3) \
+     (((sq_alu_word1_op3) & SQ_ALU_WORD1_OP3_SRC2_SEL_MASK) >> SQ_ALU_WORD1_OP3_SRC2_SEL_SHIFT)
+#define SQ_ALU_WORD1_OP3_GET_SRC2_REL(sq_alu_word1_op3) \
+     (((sq_alu_word1_op3) & SQ_ALU_WORD1_OP3_SRC2_REL_MASK) >> SQ_ALU_WORD1_OP3_SRC2_REL_SHIFT)
+#define SQ_ALU_WORD1_OP3_GET_SRC2_CHAN(sq_alu_word1_op3) \
+     (((sq_alu_word1_op3) & SQ_ALU_WORD1_OP3_SRC2_CHAN_MASK) >> SQ_ALU_WORD1_OP3_SRC2_CHAN_SHIFT)
+#define SQ_ALU_WORD1_OP3_GET_SRC2_NEG(sq_alu_word1_op3) \
+     (((sq_alu_word1_op3) & SQ_ALU_WORD1_OP3_SRC2_NEG_MASK) >> SQ_ALU_WORD1_OP3_SRC2_NEG_SHIFT)
+#define SQ_ALU_WORD1_OP3_GET_ALU_INST(sq_alu_word1_op3) \
+     (((sq_alu_word1_op3) & SQ_ALU_WORD1_OP3_ALU_INST_MASK) >> SQ_ALU_WORD1_OP3_ALU_INST_SHIFT)
+
+#define SQ_ALU_WORD1_OP3_SET_SRC2_SEL(sq_alu_word1_op3_reg, src2_sel) \
+     ((sq_alu_word1_op3_reg) = ((sq_alu_word1_op3_reg) & ~SQ_ALU_WORD1_OP3_SRC2_SEL_MASK) | ((src2_sel) << SQ_ALU_WORD1_OP3_SRC2_SEL_SHIFT))
+#define SQ_ALU_WORD1_OP3_SET_SRC2_REL(sq_alu_word1_op3_reg, src2_rel) \
+     ((sq_alu_word1_op3_reg) = ((sq_alu_word1_op3_reg) & ~SQ_ALU_WORD1_OP3_SRC2_REL_MASK) | ((src2_rel) << SQ_ALU_WORD1_OP3_SRC2_REL_SHIFT))
+#define SQ_ALU_WORD1_OP3_SET_SRC2_CHAN(sq_alu_word1_op3_reg, src2_chan) \
+     ((sq_alu_word1_op3_reg) = ((sq_alu_word1_op3_reg) & ~SQ_ALU_WORD1_OP3_SRC2_CHAN_MASK) | ((src2_chan) << SQ_ALU_WORD1_OP3_SRC2_CHAN_SHIFT))
+#define SQ_ALU_WORD1_OP3_SET_SRC2_NEG(sq_alu_word1_op3_reg, src2_neg) \
+     ((sq_alu_word1_op3_reg) = ((sq_alu_word1_op3_reg) & ~SQ_ALU_WORD1_OP3_SRC2_NEG_MASK) | ((src2_neg) << SQ_ALU_WORD1_OP3_SRC2_NEG_SHIFT))
+#define SQ_ALU_WORD1_OP3_SET_ALU_INST(sq_alu_word1_op3_reg, alu_inst) \
+     ((sq_alu_word1_op3_reg) = ((sq_alu_word1_op3_reg) & ~SQ_ALU_WORD1_OP3_ALU_INST_MASK) | ((alu_inst) << SQ_ALU_WORD1_OP3_ALU_INST_SHIFT))
+
+#if		defined(LITTLEENDIAN_CPU)   /* bitfield view of SQ_ALU_WORD1_OP3, members in ascending *_SHIFT order */
+
+     typedef struct _sq_alu_word1_op3_t {   /* NOTE(review): bitfield bit placement is compiler-defined; assumed to match the *_SHIFT values - confirm */
+          unsigned int src2_sel                       : SQ_ALU_WORD1_OP3_SRC2_SEL_SIZE;
+          unsigned int src2_rel                       : SQ_ALU_WORD1_OP3_SRC2_REL_SIZE;
+          unsigned int src2_chan                      : SQ_ALU_WORD1_OP3_SRC2_CHAN_SIZE;
+          unsigned int src2_neg                       : SQ_ALU_WORD1_OP3_SRC2_NEG_SIZE;
+          unsigned int alu_inst                       : SQ_ALU_WORD1_OP3_ALU_INST_SIZE;
+          unsigned int                                : 14;   /* pads the 18 named bits up to 32 */
+     } sq_alu_word1_op3_t;
+
+#elif		defined(BIGENDIAN_CPU)   /* same members, declared in the reverse order */
+
+     typedef struct _sq_alu_word1_op3_t {
+          unsigned int                                : 14;   /* pads the 18 named bits up to 32 */
+          unsigned int alu_inst                       : SQ_ALU_WORD1_OP3_ALU_INST_SIZE;
+          unsigned int src2_neg                       : SQ_ALU_WORD1_OP3_SRC2_NEG_SIZE;
+          unsigned int src2_chan                      : SQ_ALU_WORD1_OP3_SRC2_CHAN_SIZE;
+          unsigned int src2_rel                       : SQ_ALU_WORD1_OP3_SRC2_REL_SIZE;
+          unsigned int src2_sel                       : SQ_ALU_WORD1_OP3_SRC2_SEL_SIZE;
+     } sq_alu_word1_op3_t;
+
+#endif   /* if neither macro is defined, sq_alu_word1_op3_t is never declared and the union below will not compile */
+
+typedef union {
+     unsigned int val : 32;    /* the whole word as one 32-bit value */
+     sq_alu_word1_op3_t f;     /* the same storage seen through the bitfield view */
+} sq_alu_word1_op3_u;
+
+
+/*
+ * SQ_TEX_WORD0 struct
+ */
+
+#define SQ_TEX_WORD0_TEX_INST_SIZE     5   /* *_SIZE: field width in bits */
+#define SQ_TEX_WORD0_BC_FRAC_MODE_SIZE 1
+#define SQ_TEX_WORD0_FETCH_WHOLE_QUAD_SIZE 1
+#define SQ_TEX_WORD0_RESOURCE_ID_SIZE  8
+#define SQ_TEX_WORD0_SRC_GPR_SIZE      7
+#define SQ_TEX_WORD0_SRC_REL_SIZE      1
+#define SQ_TEX_WORD0_ALT_CONST_SIZE    1
+
+#define SQ_TEX_WORD0_TEX_INST_SHIFT    0   /* *_SHIFT: index of the field's lowest bit */
+#define SQ_TEX_WORD0_BC_FRAC_MODE_SHIFT 5
+#define SQ_TEX_WORD0_FETCH_WHOLE_QUAD_SHIFT 7   /* bit 6 is skipped: no field is defined there */
+#define SQ_TEX_WORD0_RESOURCE_ID_SHIFT 8
+#define SQ_TEX_WORD0_SRC_GPR_SHIFT     16
+#define SQ_TEX_WORD0_SRC_REL_SHIFT     23
+#define SQ_TEX_WORD0_ALT_CONST_SHIFT   24
+
+#define SQ_TEX_WORD0_TEX_INST_MASK     0x0000001f   /* *_MASK: *_SIZE one-bits starting at *_SHIFT */
+#define SQ_TEX_WORD0_BC_FRAC_MODE_MASK 0x00000020
+#define SQ_TEX_WORD0_FETCH_WHOLE_QUAD_MASK 0x00000080
+#define SQ_TEX_WORD0_RESOURCE_ID_MASK  0x0000ff00
+#define SQ_TEX_WORD0_SRC_GPR_MASK      0x007f0000
+#define SQ_TEX_WORD0_SRC_REL_MASK      0x00800000
+#define SQ_TEX_WORD0_ALT_CONST_MASK    0x01000000
+
+#define SQ_TEX_WORD0_MASK \
+     (SQ_TEX_WORD0_TEX_INST_MASK | \
+      SQ_TEX_WORD0_BC_FRAC_MODE_MASK | \
+      SQ_TEX_WORD0_FETCH_WHOLE_QUAD_MASK | \
+      SQ_TEX_WORD0_RESOURCE_ID_MASK | \
+      SQ_TEX_WORD0_SRC_GPR_MASK | \
+      SQ_TEX_WORD0_SRC_REL_MASK | \
+      SQ_TEX_WORD0_ALT_CONST_MASK)   /* OR of all field masks = 0x01ffffbf */
+
+#define SQ_TEX_WORD0_DEFAULT           0x01cdcd8d   /* equals 0xcdcdcdcd & SQ_TEX_WORD0_MASK; NOTE(review): presumably a fill pattern - confirm */
+/* GET_<FIELD>(word): mask one field out of a raw SQ_TEX_WORD0 value and shift it down to bit 0; the argument is parenthesized so compound expressions are safe. */
+#define SQ_TEX_WORD0_GET_TEX_INST(sq_tex_word0) \
+     (((sq_tex_word0) & SQ_TEX_WORD0_TEX_INST_MASK) >> SQ_TEX_WORD0_TEX_INST_SHIFT)
+#define SQ_TEX_WORD0_GET_BC_FRAC_MODE(sq_tex_word0) \
+     (((sq_tex_word0) & SQ_TEX_WORD0_BC_FRAC_MODE_MASK) >> SQ_TEX_WORD0_BC_FRAC_MODE_SHIFT)
+#define SQ_TEX_WORD0_GET_FETCH_WHOLE_QUAD(sq_tex_word0) \
+     (((sq_tex_word0) & SQ_TEX_WORD0_FETCH_WHOLE_QUAD_MASK) >> SQ_TEX_WORD0_FETCH_WHOLE_QUAD_SHIFT)
+#define SQ_TEX_WORD0_GET_RESOURCE_ID(sq_tex_word0) \
+     (((sq_tex_word0) & SQ_TEX_WORD0_RESOURCE_ID_MASK) >> SQ_TEX_WORD0_RESOURCE_ID_SHIFT)
+#define SQ_TEX_WORD0_GET_SRC_GPR(sq_tex_word0) \
+     (((sq_tex_word0) & SQ_TEX_WORD0_SRC_GPR_MASK) >> SQ_TEX_WORD0_SRC_GPR_SHIFT)
+#define SQ_TEX_WORD0_GET_SRC_REL(sq_tex_word0) \
+     (((sq_tex_word0) & SQ_TEX_WORD0_SRC_REL_MASK) >> SQ_TEX_WORD0_SRC_REL_SHIFT)
+#define SQ_TEX_WORD0_GET_ALT_CONST(sq_tex_word0) \
+     (((sq_tex_word0) & SQ_TEX_WORD0_ALT_CONST_MASK) >> SQ_TEX_WORD0_ALT_CONST_SHIFT)
+/* SET_<FIELD>(reg, v): replace one field of lvalue reg (evaluated twice). v is cast to unsigned int before the shift and is not range-masked, so oversized values spill into higher fields. */
+#define SQ_TEX_WORD0_SET_TEX_INST(sq_tex_word0_reg, tex_inst) \
+     ((sq_tex_word0_reg) = ((sq_tex_word0_reg) & ~SQ_TEX_WORD0_TEX_INST_MASK) | ((unsigned int)(tex_inst) << SQ_TEX_WORD0_TEX_INST_SHIFT))
+#define SQ_TEX_WORD0_SET_BC_FRAC_MODE(sq_tex_word0_reg, bc_frac_mode) \
+     ((sq_tex_word0_reg) = ((sq_tex_word0_reg) & ~SQ_TEX_WORD0_BC_FRAC_MODE_MASK) | ((unsigned int)(bc_frac_mode) << SQ_TEX_WORD0_BC_FRAC_MODE_SHIFT))
+#define SQ_TEX_WORD0_SET_FETCH_WHOLE_QUAD(sq_tex_word0_reg, fetch_whole_quad) \
+     ((sq_tex_word0_reg) = ((sq_tex_word0_reg) & ~SQ_TEX_WORD0_FETCH_WHOLE_QUAD_MASK) | ((unsigned int)(fetch_whole_quad) << SQ_TEX_WORD0_FETCH_WHOLE_QUAD_SHIFT))
+#define SQ_TEX_WORD0_SET_RESOURCE_ID(sq_tex_word0_reg, resource_id) \
+     ((sq_tex_word0_reg) = ((sq_tex_word0_reg) & ~SQ_TEX_WORD0_RESOURCE_ID_MASK) | ((unsigned int)(resource_id) << SQ_TEX_WORD0_RESOURCE_ID_SHIFT))
+#define SQ_TEX_WORD0_SET_SRC_GPR(sq_tex_word0_reg, src_gpr) \
+     ((sq_tex_word0_reg) = ((sq_tex_word0_reg) & ~SQ_TEX_WORD0_SRC_GPR_MASK) | ((unsigned int)(src_gpr) << SQ_TEX_WORD0_SRC_GPR_SHIFT))
+#define SQ_TEX_WORD0_SET_SRC_REL(sq_tex_word0_reg, src_rel) \
+     ((sq_tex_word0_reg) = ((sq_tex_word0_reg) & ~SQ_TEX_WORD0_SRC_REL_MASK) | ((unsigned int)(src_rel) << SQ_TEX_WORD0_SRC_REL_SHIFT))
+#define SQ_TEX_WORD0_SET_ALT_CONST(sq_tex_word0_reg, alt_const) \
+     ((sq_tex_word0_reg) = ((sq_tex_word0_reg) & ~SQ_TEX_WORD0_ALT_CONST_MASK) | ((unsigned int)(alt_const) << SQ_TEX_WORD0_ALT_CONST_SHIFT))
+
+#if		defined(LITTLEENDIAN_CPU)   /* bitfield view of SQ_TEX_WORD0, members in ascending *_SHIFT order */
+
+     typedef struct _sq_tex_word0_t {   /* NOTE(review): bitfield bit placement is compiler-defined; assumed to match the *_SHIFT values - confirm */
+          unsigned int tex_inst                       : SQ_TEX_WORD0_TEX_INST_SIZE;
+          unsigned int bc_frac_mode                   : SQ_TEX_WORD0_BC_FRAC_MODE_SIZE;
+          unsigned int                                : 1;   /* gap for bit 6, which no *_MASK covers */
+          unsigned int fetch_whole_quad               : SQ_TEX_WORD0_FETCH_WHOLE_QUAD_SIZE;
+          unsigned int resource_id                    : SQ_TEX_WORD0_RESOURCE_ID_SIZE;
+          unsigned int src_gpr                        : SQ_TEX_WORD0_SRC_GPR_SIZE;
+          unsigned int src_rel                        : SQ_TEX_WORD0_SRC_REL_SIZE;
+          unsigned int alt_const                      : SQ_TEX_WORD0_ALT_CONST_SIZE;
+          unsigned int                                : 7;   /* pads the preceding 25 bits up to 32 */
+     } sq_tex_word0_t;
+
+#elif		defined(BIGENDIAN_CPU)   /* same members, declared in the reverse order */
+
+     typedef struct _sq_tex_word0_t {
+          unsigned int                                : 7;   /* padding, mirrors the little-endian tail */
+          unsigned int alt_const                      : SQ_TEX_WORD0_ALT_CONST_SIZE;
+          unsigned int src_rel                        : SQ_TEX_WORD0_SRC_REL_SIZE;
+          unsigned int src_gpr                        : SQ_TEX_WORD0_SRC_GPR_SIZE;
+          unsigned int resource_id                    : SQ_TEX_WORD0_RESOURCE_ID_SIZE;
+          unsigned int fetch_whole_quad               : SQ_TEX_WORD0_FETCH_WHOLE_QUAD_SIZE;
+          unsigned int                                : 1;   /* gap between fetch_whole_quad and bc_frac_mode */
+          unsigned int bc_frac_mode                   : SQ_TEX_WORD0_BC_FRAC_MODE_SIZE;
+          unsigned int tex_inst                       : SQ_TEX_WORD0_TEX_INST_SIZE;
+     } sq_tex_word0_t;
+
+#endif   /* if neither macro is defined, sq_tex_word0_t is never declared and the union below will not compile */
+
+typedef union {
+     unsigned int val : 32;    /* the whole word as one 32-bit value */
+     sq_tex_word0_t f;         /* the same storage seen through the bitfield view */
+} sq_tex_word0_u;
+
+
+/*
+ * SQ_TEX_WORD1 struct
+ */
+
+#define SQ_TEX_WORD1_DST_GPR_SIZE      7   /* *_SIZE: field width in bits */
+#define SQ_TEX_WORD1_DST_REL_SIZE      1
+#define SQ_TEX_WORD1_DST_SEL_X_SIZE    3
+#define SQ_TEX_WORD1_DST_SEL_Y_SIZE    3
+#define SQ_TEX_WORD1_DST_SEL_Z_SIZE    3
+#define SQ_TEX_WORD1_DST_SEL_W_SIZE    3
+#define SQ_TEX_WORD1_LOD_BIAS_SIZE     7
+#define SQ_TEX_WORD1_COORD_TYPE_X_SIZE 1
+#define SQ_TEX_WORD1_COORD_TYPE_Y_SIZE 1
+#define SQ_TEX_WORD1_COORD_TYPE_Z_SIZE 1
+#define SQ_TEX_WORD1_COORD_TYPE_W_SIZE 1
+
+#define SQ_TEX_WORD1_DST_GPR_SHIFT     0   /* *_SHIFT: index of the field's lowest bit */
+#define SQ_TEX_WORD1_DST_REL_SHIFT     7
+#define SQ_TEX_WORD1_DST_SEL_X_SHIFT   9   /* bit 8 is skipped: no field is defined there */
+#define SQ_TEX_WORD1_DST_SEL_Y_SHIFT   12
+#define SQ_TEX_WORD1_DST_SEL_Z_SHIFT   15
+#define SQ_TEX_WORD1_DST_SEL_W_SHIFT   18
+#define SQ_TEX_WORD1_LOD_BIAS_SHIFT    21
+#define SQ_TEX_WORD1_COORD_TYPE_X_SHIFT 28
+#define SQ_TEX_WORD1_COORD_TYPE_Y_SHIFT 29
+#define SQ_TEX_WORD1_COORD_TYPE_Z_SHIFT 30
+#define SQ_TEX_WORD1_COORD_TYPE_W_SHIFT 31
+
+#define SQ_TEX_WORD1_DST_GPR_MASK      0x0000007f   /* *_MASK: *_SIZE one-bits starting at *_SHIFT */
+#define SQ_TEX_WORD1_DST_REL_MASK      0x00000080
+#define SQ_TEX_WORD1_DST_SEL_X_MASK    0x00000e00
+#define SQ_TEX_WORD1_DST_SEL_Y_MASK    0x00007000
+#define SQ_TEX_WORD1_DST_SEL_Z_MASK    0x00038000
+#define SQ_TEX_WORD1_DST_SEL_W_MASK    0x001c0000
+#define SQ_TEX_WORD1_LOD_BIAS_MASK     0x0fe00000
+#define SQ_TEX_WORD1_COORD_TYPE_X_MASK 0x10000000
+#define SQ_TEX_WORD1_COORD_TYPE_Y_MASK 0x20000000
+#define SQ_TEX_WORD1_COORD_TYPE_Z_MASK 0x40000000
+#define SQ_TEX_WORD1_COORD_TYPE_W_MASK 0x80000000
+
+#define SQ_TEX_WORD1_MASK \
+     (SQ_TEX_WORD1_DST_GPR_MASK | \
+      SQ_TEX_WORD1_DST_REL_MASK | \
+      SQ_TEX_WORD1_DST_SEL_X_MASK | \
+      SQ_TEX_WORD1_DST_SEL_Y_MASK | \
+      SQ_TEX_WORD1_DST_SEL_Z_MASK | \
+      SQ_TEX_WORD1_DST_SEL_W_MASK | \
+      SQ_TEX_WORD1_LOD_BIAS_MASK | \
+      SQ_TEX_WORD1_COORD_TYPE_X_MASK | \
+      SQ_TEX_WORD1_COORD_TYPE_Y_MASK | \
+      SQ_TEX_WORD1_COORD_TYPE_Z_MASK | \
+      SQ_TEX_WORD1_COORD_TYPE_W_MASK)   /* OR of all field masks = 0xfffffeff */
+
+#define SQ_TEX_WORD1_DEFAULT           0xcdcdcccd   /* equals 0xcdcdcdcd & SQ_TEX_WORD1_MASK; NOTE(review): presumably a fill pattern - confirm */
+/* GET_<FIELD>(word): mask one field out of a raw SQ_TEX_WORD1 value and shift it down to bit 0; the argument is parenthesized so compound expressions are safe. */
+#define SQ_TEX_WORD1_GET_DST_GPR(sq_tex_word1) \
+     (((sq_tex_word1) & SQ_TEX_WORD1_DST_GPR_MASK) >> SQ_TEX_WORD1_DST_GPR_SHIFT)
+#define SQ_TEX_WORD1_GET_DST_REL(sq_tex_word1) \
+     (((sq_tex_word1) & SQ_TEX_WORD1_DST_REL_MASK) >> SQ_TEX_WORD1_DST_REL_SHIFT)
+#define SQ_TEX_WORD1_GET_DST_SEL_X(sq_tex_word1) \
+     (((sq_tex_word1) & SQ_TEX_WORD1_DST_SEL_X_MASK) >> SQ_TEX_WORD1_DST_SEL_X_SHIFT)
+#define SQ_TEX_WORD1_GET_DST_SEL_Y(sq_tex_word1) \
+     (((sq_tex_word1) & SQ_TEX_WORD1_DST_SEL_Y_MASK) >> SQ_TEX_WORD1_DST_SEL_Y_SHIFT)
+#define SQ_TEX_WORD1_GET_DST_SEL_Z(sq_tex_word1) \
+     (((sq_tex_word1) & SQ_TEX_WORD1_DST_SEL_Z_MASK) >> SQ_TEX_WORD1_DST_SEL_Z_SHIFT)
+#define SQ_TEX_WORD1_GET_DST_SEL_W(sq_tex_word1) \
+     (((sq_tex_word1) & SQ_TEX_WORD1_DST_SEL_W_MASK) >> SQ_TEX_WORD1_DST_SEL_W_SHIFT)
+#define SQ_TEX_WORD1_GET_LOD_BIAS(sq_tex_word1) \
+     (((sq_tex_word1) & SQ_TEX_WORD1_LOD_BIAS_MASK) >> SQ_TEX_WORD1_LOD_BIAS_SHIFT)
+#define SQ_TEX_WORD1_GET_COORD_TYPE_X(sq_tex_word1) \
+     (((sq_tex_word1) & SQ_TEX_WORD1_COORD_TYPE_X_MASK) >> SQ_TEX_WORD1_COORD_TYPE_X_SHIFT)
+#define SQ_TEX_WORD1_GET_COORD_TYPE_Y(sq_tex_word1) \
+     (((sq_tex_word1) & SQ_TEX_WORD1_COORD_TYPE_Y_MASK) >> SQ_TEX_WORD1_COORD_TYPE_Y_SHIFT)
+#define SQ_TEX_WORD1_GET_COORD_TYPE_Z(sq_tex_word1) \
+     (((sq_tex_word1) & SQ_TEX_WORD1_COORD_TYPE_Z_MASK) >> SQ_TEX_WORD1_COORD_TYPE_Z_SHIFT)
+#define SQ_TEX_WORD1_GET_COORD_TYPE_W(sq_tex_word1) \
+     (((sq_tex_word1) & SQ_TEX_WORD1_COORD_TYPE_W_MASK) >> SQ_TEX_WORD1_COORD_TYPE_W_SHIFT)
+/* SET_<FIELD>(reg, v): replace one field of lvalue reg (evaluated twice). v is cast to unsigned int before the shift (so a 1 at bit 31 is well-defined) and is not range-masked, so oversized values spill into higher fields. */
+#define SQ_TEX_WORD1_SET_DST_GPR(sq_tex_word1_reg, dst_gpr) \
+     ((sq_tex_word1_reg) = ((sq_tex_word1_reg) & ~SQ_TEX_WORD1_DST_GPR_MASK) | ((unsigned int)(dst_gpr) << SQ_TEX_WORD1_DST_GPR_SHIFT))
+#define SQ_TEX_WORD1_SET_DST_REL(sq_tex_word1_reg, dst_rel) \
+     ((sq_tex_word1_reg) = ((sq_tex_word1_reg) & ~SQ_TEX_WORD1_DST_REL_MASK) | ((unsigned int)(dst_rel) << SQ_TEX_WORD1_DST_REL_SHIFT))
+#define SQ_TEX_WORD1_SET_DST_SEL_X(sq_tex_word1_reg, dst_sel_x) \
+     ((sq_tex_word1_reg) = ((sq_tex_word1_reg) & ~SQ_TEX_WORD1_DST_SEL_X_MASK) | ((unsigned int)(dst_sel_x) << SQ_TEX_WORD1_DST_SEL_X_SHIFT))
+#define SQ_TEX_WORD1_SET_DST_SEL_Y(sq_tex_word1_reg, dst_sel_y) \
+     ((sq_tex_word1_reg) = ((sq_tex_word1_reg) & ~SQ_TEX_WORD1_DST_SEL_Y_MASK) | ((unsigned int)(dst_sel_y) << SQ_TEX_WORD1_DST_SEL_Y_SHIFT))
+#define SQ_TEX_WORD1_SET_DST_SEL_Z(sq_tex_word1_reg, dst_sel_z) \
+     ((sq_tex_word1_reg) = ((sq_tex_word1_reg) & ~SQ_TEX_WORD1_DST_SEL_Z_MASK) | ((unsigned int)(dst_sel_z) << SQ_TEX_WORD1_DST_SEL_Z_SHIFT))
+#define SQ_TEX_WORD1_SET_DST_SEL_W(sq_tex_word1_reg, dst_sel_w) \
+     ((sq_tex_word1_reg) = ((sq_tex_word1_reg) & ~SQ_TEX_WORD1_DST_SEL_W_MASK) | ((unsigned int)(dst_sel_w) << SQ_TEX_WORD1_DST_SEL_W_SHIFT))
+#define SQ_TEX_WORD1_SET_LOD_BIAS(sq_tex_word1_reg, lod_bias) \
+     ((sq_tex_word1_reg) = ((sq_tex_word1_reg) & ~SQ_TEX_WORD1_LOD_BIAS_MASK) | ((unsigned int)(lod_bias) << SQ_TEX_WORD1_LOD_BIAS_SHIFT))
+#define SQ_TEX_WORD1_SET_COORD_TYPE_X(sq_tex_word1_reg, coord_type_x) \
+     ((sq_tex_word1_reg) = ((sq_tex_word1_reg) & ~SQ_TEX_WORD1_COORD_TYPE_X_MASK) | ((unsigned int)(coord_type_x) << SQ_TEX_WORD1_COORD_TYPE_X_SHIFT))
+#define SQ_TEX_WORD1_SET_COORD_TYPE_Y(sq_tex_word1_reg, coord_type_y) \
+     ((sq_tex_word1_reg) = ((sq_tex_word1_reg) & ~SQ_TEX_WORD1_COORD_TYPE_Y_MASK) | ((unsigned int)(coord_type_y) << SQ_TEX_WORD1_COORD_TYPE_Y_SHIFT))
+#define SQ_TEX_WORD1_SET_COORD_TYPE_Z(sq_tex_word1_reg, coord_type_z) \
+     ((sq_tex_word1_reg) = ((sq_tex_word1_reg) & ~SQ_TEX_WORD1_COORD_TYPE_Z_MASK) | ((unsigned int)(coord_type_z) << SQ_TEX_WORD1_COORD_TYPE_Z_SHIFT))
+#define SQ_TEX_WORD1_SET_COORD_TYPE_W(sq_tex_word1_reg, coord_type_w) \
+     ((sq_tex_word1_reg) = ((sq_tex_word1_reg) & ~SQ_TEX_WORD1_COORD_TYPE_W_MASK) | ((unsigned int)(coord_type_w) << SQ_TEX_WORD1_COORD_TYPE_W_SHIFT))
+
+#if		defined(LITTLEENDIAN_CPU)   /* bitfield view of SQ_TEX_WORD1, members in ascending *_SHIFT order */
+
+     typedef struct _sq_tex_word1_t {   /* NOTE(review): bitfield bit placement is compiler-defined; assumed to match the *_SHIFT values - confirm */
+          unsigned int dst_gpr                        : SQ_TEX_WORD1_DST_GPR_SIZE;
+          unsigned int dst_rel                        : SQ_TEX_WORD1_DST_REL_SIZE;
+          unsigned int                                : 1;   /* gap for bit 8, which no *_MASK covers */
+          unsigned int dst_sel_x                      : SQ_TEX_WORD1_DST_SEL_X_SIZE;
+          unsigned int dst_sel_y                      : SQ_TEX_WORD1_DST_SEL_Y_SIZE;
+          unsigned int dst_sel_z                      : SQ_TEX_WORD1_DST_SEL_Z_SIZE;
+          unsigned int dst_sel_w                      : SQ_TEX_WORD1_DST_SEL_W_SIZE;
+          unsigned int lod_bias                       : SQ_TEX_WORD1_LOD_BIAS_SIZE;
+          unsigned int coord_type_x                   : SQ_TEX_WORD1_COORD_TYPE_X_SIZE;
+          unsigned int coord_type_y                   : SQ_TEX_WORD1_COORD_TYPE_Y_SIZE;
+          unsigned int coord_type_z                   : SQ_TEX_WORD1_COORD_TYPE_Z_SIZE;
+          unsigned int coord_type_w                   : SQ_TEX_WORD1_COORD_TYPE_W_SIZE;
+     } sq_tex_word1_t;
+
+#elif		defined(BIGENDIAN_CPU)   /* same members, declared in the reverse order */
+
+     typedef struct _sq_tex_word1_t {
+          unsigned int coord_type_w                   : SQ_TEX_WORD1_COORD_TYPE_W_SIZE;
+          unsigned int coord_type_z                   : SQ_TEX_WORD1_COORD_TYPE_Z_SIZE;
+          unsigned int coord_type_y                   : SQ_TEX_WORD1_COORD_TYPE_Y_SIZE;
+          unsigned int coord_type_x                   : SQ_TEX_WORD1_COORD_TYPE_X_SIZE;
+          unsigned int lod_bias                       : SQ_TEX_WORD1_LOD_BIAS_SIZE;
+          unsigned int dst_sel_w                      : SQ_TEX_WORD1_DST_SEL_W_SIZE;
+          unsigned int dst_sel_z                      : SQ_TEX_WORD1_DST_SEL_Z_SIZE;
+          unsigned int dst_sel_y                      : SQ_TEX_WORD1_DST_SEL_Y_SIZE;
+          unsigned int dst_sel_x                      : SQ_TEX_WORD1_DST_SEL_X_SIZE;
+          unsigned int                                : 1;   /* gap between dst_sel_x and dst_rel */
+          unsigned int dst_rel                        : SQ_TEX_WORD1_DST_REL_SIZE;
+          unsigned int dst_gpr                        : SQ_TEX_WORD1_DST_GPR_SIZE;
+     } sq_tex_word1_t;
+
+#endif   /* if neither macro is defined, sq_tex_word1_t is never declared and the union below will not compile */
+
+typedef union {
+     unsigned int val : 32;    /* the whole word as one 32-bit value */
+     sq_tex_word1_t f;         /* the same storage seen through the bitfield view */
+} sq_tex_word1_u;
+
+
+/*
+ * SQ_TEX_WORD2 struct
+ */
+
+#define SQ_TEX_WORD2_OFFSET_X_SIZE     5   /* *_SIZE: field width in bits */
+#define SQ_TEX_WORD2_OFFSET_Y_SIZE     5
+#define SQ_TEX_WORD2_OFFSET_Z_SIZE     5
+#define SQ_TEX_WORD2_SAMPLER_ID_SIZE   5
+#define SQ_TEX_WORD2_SRC_SEL_X_SIZE    3
+#define SQ_TEX_WORD2_SRC_SEL_Y_SIZE    3
+#define SQ_TEX_WORD2_SRC_SEL_Z_SIZE    3
+#define SQ_TEX_WORD2_SRC_SEL_W_SIZE    3
+
+#define SQ_TEX_WORD2_OFFSET_X_SHIFT    0   /* *_SHIFT: index of the field's lowest bit */
+#define SQ_TEX_WORD2_OFFSET_Y_SHIFT    5
+#define SQ_TEX_WORD2_OFFSET_Z_SHIFT    10
+#define SQ_TEX_WORD2_SAMPLER_ID_SHIFT  15
+#define SQ_TEX_WORD2_SRC_SEL_X_SHIFT   20
+#define SQ_TEX_WORD2_SRC_SEL_Y_SHIFT   23
+#define SQ_TEX_WORD2_SRC_SEL_Z_SHIFT   26
+#define SQ_TEX_WORD2_SRC_SEL_W_SHIFT   29
+
+#define SQ_TEX_WORD2_OFFSET_X_MASK     0x0000001f   /* *_MASK: *_SIZE one-bits starting at *_SHIFT */
+#define SQ_TEX_WORD2_OFFSET_Y_MASK     0x000003e0
+#define SQ_TEX_WORD2_OFFSET_Z_MASK     0x00007c00
+#define SQ_TEX_WORD2_SAMPLER_ID_MASK   0x000f8000
+#define SQ_TEX_WORD2_SRC_SEL_X_MASK    0x00700000
+#define SQ_TEX_WORD2_SRC_SEL_Y_MASK    0x03800000
+#define SQ_TEX_WORD2_SRC_SEL_Z_MASK    0x1c000000
+#define SQ_TEX_WORD2_SRC_SEL_W_MASK    0xe0000000
+
+#define SQ_TEX_WORD2_MASK \
+     (SQ_TEX_WORD2_OFFSET_X_MASK | \
+      SQ_TEX_WORD2_OFFSET_Y_MASK | \
+      SQ_TEX_WORD2_OFFSET_Z_MASK | \
+      SQ_TEX_WORD2_SAMPLER_ID_MASK | \
+      SQ_TEX_WORD2_SRC_SEL_X_MASK | \
+      SQ_TEX_WORD2_SRC_SEL_Y_MASK | \
+      SQ_TEX_WORD2_SRC_SEL_Z_MASK | \
+      SQ_TEX_WORD2_SRC_SEL_W_MASK)   /* OR of all field masks = 0xffffffff (every bit is assigned) */
+
+#define SQ_TEX_WORD2_DEFAULT           0xcdcdcdcd   /* equals 0xcdcdcdcd & SQ_TEX_WORD2_MASK; NOTE(review): presumably a fill pattern - confirm */
+/* GET_<FIELD>(word): mask one field out of a raw SQ_TEX_WORD2 value and shift it down to bit 0; the argument is parenthesized so compound expressions are safe. */
+#define SQ_TEX_WORD2_GET_OFFSET_X(sq_tex_word2) \
+     (((sq_tex_word2) & SQ_TEX_WORD2_OFFSET_X_MASK) >> SQ_TEX_WORD2_OFFSET_X_SHIFT)
+#define SQ_TEX_WORD2_GET_OFFSET_Y(sq_tex_word2) \
+     (((sq_tex_word2) & SQ_TEX_WORD2_OFFSET_Y_MASK) >> SQ_TEX_WORD2_OFFSET_Y_SHIFT)
+#define SQ_TEX_WORD2_GET_OFFSET_Z(sq_tex_word2) \
+     (((sq_tex_word2) & SQ_TEX_WORD2_OFFSET_Z_MASK) >> SQ_TEX_WORD2_OFFSET_Z_SHIFT)
+#define SQ_TEX_WORD2_GET_SAMPLER_ID(sq_tex_word2) \
+     (((sq_tex_word2) & SQ_TEX_WORD2_SAMPLER_ID_MASK) >> SQ_TEX_WORD2_SAMPLER_ID_SHIFT)
+#define SQ_TEX_WORD2_GET_SRC_SEL_X(sq_tex_word2) \
+     (((sq_tex_word2) & SQ_TEX_WORD2_SRC_SEL_X_MASK) >> SQ_TEX_WORD2_SRC_SEL_X_SHIFT)
+#define SQ_TEX_WORD2_GET_SRC_SEL_Y(sq_tex_word2) \
+     (((sq_tex_word2) & SQ_TEX_WORD2_SRC_SEL_Y_MASK) >> SQ_TEX_WORD2_SRC_SEL_Y_SHIFT)
+#define SQ_TEX_WORD2_GET_SRC_SEL_Z(sq_tex_word2) \
+     (((sq_tex_word2) & SQ_TEX_WORD2_SRC_SEL_Z_MASK) >> SQ_TEX_WORD2_SRC_SEL_Z_SHIFT)
+#define SQ_TEX_WORD2_GET_SRC_SEL_W(sq_tex_word2) \
+     (((sq_tex_word2) & SQ_TEX_WORD2_SRC_SEL_W_MASK) >> SQ_TEX_WORD2_SRC_SEL_W_SHIFT)
+/* SET_<FIELD>(reg, v): replace one field of lvalue reg (evaluated twice). v is cast to unsigned int before the shift (so values reaching bit 31 are well-defined) and is not range-masked, so oversized values spill into higher fields. */
+#define SQ_TEX_WORD2_SET_OFFSET_X(sq_tex_word2_reg, offset_x) \
+     ((sq_tex_word2_reg) = ((sq_tex_word2_reg) & ~SQ_TEX_WORD2_OFFSET_X_MASK) | ((unsigned int)(offset_x) << SQ_TEX_WORD2_OFFSET_X_SHIFT))
+#define SQ_TEX_WORD2_SET_OFFSET_Y(sq_tex_word2_reg, offset_y) \
+     ((sq_tex_word2_reg) = ((sq_tex_word2_reg) & ~SQ_TEX_WORD2_OFFSET_Y_MASK) | ((unsigned int)(offset_y) << SQ_TEX_WORD2_OFFSET_Y_SHIFT))
+#define SQ_TEX_WORD2_SET_OFFSET_Z(sq_tex_word2_reg, offset_z) \
+     ((sq_tex_word2_reg) = ((sq_tex_word2_reg) & ~SQ_TEX_WORD2_OFFSET_Z_MASK) | ((unsigned int)(offset_z) << SQ_TEX_WORD2_OFFSET_Z_SHIFT))
+#define SQ_TEX_WORD2_SET_SAMPLER_ID(sq_tex_word2_reg, sampler_id) \
+     ((sq_tex_word2_reg) = ((sq_tex_word2_reg) & ~SQ_TEX_WORD2_SAMPLER_ID_MASK) | ((unsigned int)(sampler_id) << SQ_TEX_WORD2_SAMPLER_ID_SHIFT))
+#define SQ_TEX_WORD2_SET_SRC_SEL_X(sq_tex_word2_reg, src_sel_x) \
+     ((sq_tex_word2_reg) = ((sq_tex_word2_reg) & ~SQ_TEX_WORD2_SRC_SEL_X_MASK) | ((unsigned int)(src_sel_x) << SQ_TEX_WORD2_SRC_SEL_X_SHIFT))
+#define SQ_TEX_WORD2_SET_SRC_SEL_Y(sq_tex_word2_reg, src_sel_y) \
+     ((sq_tex_word2_reg) = ((sq_tex_word2_reg) & ~SQ_TEX_WORD2_SRC_SEL_Y_MASK) | ((unsigned int)(src_sel_y) << SQ_TEX_WORD2_SRC_SEL_Y_SHIFT))
+#define SQ_TEX_WORD2_SET_SRC_SEL_Z(sq_tex_word2_reg, src_sel_z) \
+     ((sq_tex_word2_reg) = ((sq_tex_word2_reg) & ~SQ_TEX_WORD2_SRC_SEL_Z_MASK) | ((unsigned int)(src_sel_z) << SQ_TEX_WORD2_SRC_SEL_Z_SHIFT))
+#define SQ_TEX_WORD2_SET_SRC_SEL_W(sq_tex_word2_reg, src_sel_w) \
+     ((sq_tex_word2_reg) = ((sq_tex_word2_reg) & ~SQ_TEX_WORD2_SRC_SEL_W_MASK) | ((unsigned int)(src_sel_w) << SQ_TEX_WORD2_SRC_SEL_W_SHIFT))
+
+#if		defined(LITTLEENDIAN_CPU)   /* bitfield view of SQ_TEX_WORD2, members in ascending *_SHIFT order */
+
+     typedef struct _sq_tex_word2_t {   /* NOTE(review): bitfield bit placement is compiler-defined; assumed to match the *_SHIFT values - confirm */
+          unsigned int offset_x                       : SQ_TEX_WORD2_OFFSET_X_SIZE;
+          unsigned int offset_y                       : SQ_TEX_WORD2_OFFSET_Y_SIZE;
+          unsigned int offset_z                       : SQ_TEX_WORD2_OFFSET_Z_SIZE;
+          unsigned int sampler_id                     : SQ_TEX_WORD2_SAMPLER_ID_SIZE;
+          unsigned int src_sel_x                      : SQ_TEX_WORD2_SRC_SEL_X_SIZE;
+          unsigned int src_sel_y                      : SQ_TEX_WORD2_SRC_SEL_Y_SIZE;
+          unsigned int src_sel_z                      : SQ_TEX_WORD2_SRC_SEL_Z_SIZE;
+          unsigned int src_sel_w                      : SQ_TEX_WORD2_SRC_SEL_W_SIZE;
+     } sq_tex_word2_t;
+
+#elif		defined(BIGENDIAN_CPU)   /* same members, declared in the reverse order */
+
+     typedef struct _sq_tex_word2_t {
+          unsigned int src_sel_w                      : SQ_TEX_WORD2_SRC_SEL_W_SIZE;
+          unsigned int src_sel_z                      : SQ_TEX_WORD2_SRC_SEL_Z_SIZE;
+          unsigned int src_sel_y                      : SQ_TEX_WORD2_SRC_SEL_Y_SIZE;
+          unsigned int src_sel_x                      : SQ_TEX_WORD2_SRC_SEL_X_SIZE;
+          unsigned int sampler_id                     : SQ_TEX_WORD2_SAMPLER_ID_SIZE;
+          unsigned int offset_z                       : SQ_TEX_WORD2_OFFSET_Z_SIZE;
+          unsigned int offset_y                       : SQ_TEX_WORD2_OFFSET_Y_SIZE;
+          unsigned int offset_x                       : SQ_TEX_WORD2_OFFSET_X_SIZE;
+     } sq_tex_word2_t;
+
+#endif   /* if neither macro is defined, sq_tex_word2_t is never declared and the union below will not compile */
+
+typedef union {
+     unsigned int val : 32;    /* the whole word as one 32-bit value */
+     sq_tex_word2_t f;         /* the same storage seen through the bitfield view */
+} sq_tex_word2_u;
+
+
+/*
+ * SQ_VTX_WORD0 struct
+ */
+
+#define SQ_VTX_WORD0_VTX_INST_SIZE     5   /* *_SIZE: field width in bits */
+#define SQ_VTX_WORD0_FETCH_TYPE_SIZE   2
+#define SQ_VTX_WORD0_FETCH_WHOLE_QUAD_SIZE 1
+#define SQ_VTX_WORD0_BUFFER_ID_SIZE    8
+#define SQ_VTX_WORD0_SRC_GPR_SIZE      7
+#define SQ_VTX_WORD0_SRC_REL_SIZE      1
+#define SQ_VTX_WORD0_SRC_SEL_X_SIZE    2
+#define SQ_VTX_WORD0_MEGA_FETCH_COUNT_SIZE 6
+
+#define SQ_VTX_WORD0_VTX_INST_SHIFT    0   /* *_SHIFT: index of the field's lowest bit */
+#define SQ_VTX_WORD0_FETCH_TYPE_SHIFT  5
+#define SQ_VTX_WORD0_FETCH_WHOLE_QUAD_SHIFT 7
+#define SQ_VTX_WORD0_BUFFER_ID_SHIFT   8
+#define SQ_VTX_WORD0_SRC_GPR_SHIFT     16
+#define SQ_VTX_WORD0_SRC_REL_SHIFT     23
+#define SQ_VTX_WORD0_SRC_SEL_X_SHIFT   24
+#define SQ_VTX_WORD0_MEGA_FETCH_COUNT_SHIFT 26
+
+#define SQ_VTX_WORD0_VTX_INST_MASK     0x0000001f   /* *_MASK: *_SIZE one-bits starting at *_SHIFT */
+#define SQ_VTX_WORD0_FETCH_TYPE_MASK   0x00000060
+#define SQ_VTX_WORD0_FETCH_WHOLE_QUAD_MASK 0x00000080
+#define SQ_VTX_WORD0_BUFFER_ID_MASK    0x0000ff00
+#define SQ_VTX_WORD0_SRC_GPR_MASK      0x007f0000
+#define SQ_VTX_WORD0_SRC_REL_MASK      0x00800000
+#define SQ_VTX_WORD0_SRC_SEL_X_MASK    0x03000000
+#define SQ_VTX_WORD0_MEGA_FETCH_COUNT_MASK 0xfc000000
+
+#define SQ_VTX_WORD0_MASK \
+     (SQ_VTX_WORD0_VTX_INST_MASK | \
+      SQ_VTX_WORD0_FETCH_TYPE_MASK | \
+      SQ_VTX_WORD0_FETCH_WHOLE_QUAD_MASK | \
+      SQ_VTX_WORD0_BUFFER_ID_MASK | \
+      SQ_VTX_WORD0_SRC_GPR_MASK | \
+      SQ_VTX_WORD0_SRC_REL_MASK | \
+      SQ_VTX_WORD0_SRC_SEL_X_MASK | \
+      SQ_VTX_WORD0_MEGA_FETCH_COUNT_MASK)   /* OR of all field masks = 0xffffffff (every bit is assigned) */
+
+#define SQ_VTX_WORD0_DEFAULT           0xcdcdcdcd   /* equals 0xcdcdcdcd & SQ_VTX_WORD0_MASK; NOTE(review): presumably a fill pattern - confirm */
+/* GET_<FIELD>(word): mask one field out of a raw SQ_VTX_WORD0 value and shift it down to bit 0; the argument is parenthesized so compound expressions are safe. */
+#define SQ_VTX_WORD0_GET_VTX_INST(sq_vtx_word0) \
+     (((sq_vtx_word0) & SQ_VTX_WORD0_VTX_INST_MASK) >> SQ_VTX_WORD0_VTX_INST_SHIFT)
+#define SQ_VTX_WORD0_GET_FETCH_TYPE(sq_vtx_word0) \
+     (((sq_vtx_word0) & SQ_VTX_WORD0_FETCH_TYPE_MASK) >> SQ_VTX_WORD0_FETCH_TYPE_SHIFT)
+#define SQ_VTX_WORD0_GET_FETCH_WHOLE_QUAD(sq_vtx_word0) \
+     (((sq_vtx_word0) & SQ_VTX_WORD0_FETCH_WHOLE_QUAD_MASK) >> SQ_VTX_WORD0_FETCH_WHOLE_QUAD_SHIFT)
+#define SQ_VTX_WORD0_GET_BUFFER_ID(sq_vtx_word0) \
+     (((sq_vtx_word0) & SQ_VTX_WORD0_BUFFER_ID_MASK) >> SQ_VTX_WORD0_BUFFER_ID_SHIFT)
+#define SQ_VTX_WORD0_GET_SRC_GPR(sq_vtx_word0) \
+     (((sq_vtx_word0) & SQ_VTX_WORD0_SRC_GPR_MASK) >> SQ_VTX_WORD0_SRC_GPR_SHIFT)
+#define SQ_VTX_WORD0_GET_SRC_REL(sq_vtx_word0) \
+     (((sq_vtx_word0) & SQ_VTX_WORD0_SRC_REL_MASK) >> SQ_VTX_WORD0_SRC_REL_SHIFT)
+#define SQ_VTX_WORD0_GET_SRC_SEL_X(sq_vtx_word0) \
+     (((sq_vtx_word0) & SQ_VTX_WORD0_SRC_SEL_X_MASK) >> SQ_VTX_WORD0_SRC_SEL_X_SHIFT)
+#define SQ_VTX_WORD0_GET_MEGA_FETCH_COUNT(sq_vtx_word0) \
+     (((sq_vtx_word0) & SQ_VTX_WORD0_MEGA_FETCH_COUNT_MASK) >> SQ_VTX_WORD0_MEGA_FETCH_COUNT_SHIFT)
+/* SET_<FIELD>(reg, v): replace one field of lvalue reg (evaluated twice). v is cast to unsigned int before the shift (so values reaching bit 31 are well-defined) and is not range-masked, so oversized values spill into higher fields. */
+#define SQ_VTX_WORD0_SET_VTX_INST(sq_vtx_word0_reg, vtx_inst) \
+     ((sq_vtx_word0_reg) = ((sq_vtx_word0_reg) & ~SQ_VTX_WORD0_VTX_INST_MASK) | ((unsigned int)(vtx_inst) << SQ_VTX_WORD0_VTX_INST_SHIFT))
+#define SQ_VTX_WORD0_SET_FETCH_TYPE(sq_vtx_word0_reg, fetch_type) \
+     ((sq_vtx_word0_reg) = ((sq_vtx_word0_reg) & ~SQ_VTX_WORD0_FETCH_TYPE_MASK) | ((unsigned int)(fetch_type) << SQ_VTX_WORD0_FETCH_TYPE_SHIFT))
+#define SQ_VTX_WORD0_SET_FETCH_WHOLE_QUAD(sq_vtx_word0_reg, fetch_whole_quad) \
+     ((sq_vtx_word0_reg) = ((sq_vtx_word0_reg) & ~SQ_VTX_WORD0_FETCH_WHOLE_QUAD_MASK) | ((unsigned int)(fetch_whole_quad) << SQ_VTX_WORD0_FETCH_WHOLE_QUAD_SHIFT))
+#define SQ_VTX_WORD0_SET_BUFFER_ID(sq_vtx_word0_reg, buffer_id) \
+     ((sq_vtx_word0_reg) = ((sq_vtx_word0_reg) & ~SQ_VTX_WORD0_BUFFER_ID_MASK) | ((unsigned int)(buffer_id) << SQ_VTX_WORD0_BUFFER_ID_SHIFT))
+#define SQ_VTX_WORD0_SET_SRC_GPR(sq_vtx_word0_reg, src_gpr) \
+     ((sq_vtx_word0_reg) = ((sq_vtx_word0_reg) & ~SQ_VTX_WORD0_SRC_GPR_MASK) | ((unsigned int)(src_gpr) << SQ_VTX_WORD0_SRC_GPR_SHIFT))
+#define SQ_VTX_WORD0_SET_SRC_REL(sq_vtx_word0_reg, src_rel) \
+     ((sq_vtx_word0_reg) = ((sq_vtx_word0_reg) & ~SQ_VTX_WORD0_SRC_REL_MASK) | ((unsigned int)(src_rel) << SQ_VTX_WORD0_SRC_REL_SHIFT))
+#define SQ_VTX_WORD0_SET_SRC_SEL_X(sq_vtx_word0_reg, src_sel_x) \
+     ((sq_vtx_word0_reg) = ((sq_vtx_word0_reg) & ~SQ_VTX_WORD0_SRC_SEL_X_MASK) | ((unsigned int)(src_sel_x) << SQ_VTX_WORD0_SRC_SEL_X_SHIFT))
+#define SQ_VTX_WORD0_SET_MEGA_FETCH_COUNT(sq_vtx_word0_reg, mega_fetch_count) \
+     ((sq_vtx_word0_reg) = ((sq_vtx_word0_reg) & ~SQ_VTX_WORD0_MEGA_FETCH_COUNT_MASK) | ((unsigned int)(mega_fetch_count) << SQ_VTX_WORD0_MEGA_FETCH_COUNT_SHIFT))
+
+#if		defined(LITTLEENDIAN_CPU)   /* bitfield view of SQ_VTX_WORD0, members in ascending *_SHIFT order */
+
+     typedef struct _sq_vtx_word0_t {   /* NOTE(review): bitfield bit placement is compiler-defined; assumed to match the *_SHIFT values - confirm */
+          unsigned int vtx_inst                       : SQ_VTX_WORD0_VTX_INST_SIZE;
+          unsigned int fetch_type                     : SQ_VTX_WORD0_FETCH_TYPE_SIZE;
+          unsigned int fetch_whole_quad               : SQ_VTX_WORD0_FETCH_WHOLE_QUAD_SIZE;
+          unsigned int buffer_id                      : SQ_VTX_WORD0_BUFFER_ID_SIZE;
+          unsigned int src_gpr                        : SQ_VTX_WORD0_SRC_GPR_SIZE;
+          unsigned int src_rel                        : SQ_VTX_WORD0_SRC_REL_SIZE;
+          unsigned int src_sel_x                      : SQ_VTX_WORD0_SRC_SEL_X_SIZE;
+          unsigned int mega_fetch_count               : SQ_VTX_WORD0_MEGA_FETCH_COUNT_SIZE;
+     } sq_vtx_word0_t;
+
+#elif		defined(BIGENDIAN_CPU)   /* same members, declared in the reverse order */
+
+     typedef struct _sq_vtx_word0_t {
+          unsigned int mega_fetch_count               : SQ_VTX_WORD0_MEGA_FETCH_COUNT_SIZE;
+          unsigned int src_sel_x                      : SQ_VTX_WORD0_SRC_SEL_X_SIZE;
+          unsigned int src_rel                        : SQ_VTX_WORD0_SRC_REL_SIZE;
+          unsigned int src_gpr                        : SQ_VTX_WORD0_SRC_GPR_SIZE;
+          unsigned int buffer_id                      : SQ_VTX_WORD0_BUFFER_ID_SIZE;
+          unsigned int fetch_whole_quad               : SQ_VTX_WORD0_FETCH_WHOLE_QUAD_SIZE;
+          unsigned int fetch_type                     : SQ_VTX_WORD0_FETCH_TYPE_SIZE;
+          unsigned int vtx_inst                       : SQ_VTX_WORD0_VTX_INST_SIZE;
+     } sq_vtx_word0_t;
+
+#endif   /* if neither macro is defined, sq_vtx_word0_t is never declared and the union below will not compile */
+
+typedef union {
+     unsigned int val : 32;    /* the whole word as one 32-bit value */
+     sq_vtx_word0_t f;         /* the same storage seen through the bitfield view */
+} sq_vtx_word0_u;
+
+
+/*
+ * SQ_VTX_WORD1 struct
+ */
+
+#define SQ_VTX_WORD1_DST_SEL_X_SIZE    3   /* *_SIZE: field width in bits */
+#define SQ_VTX_WORD1_DST_SEL_Y_SIZE    3
+#define SQ_VTX_WORD1_DST_SEL_Z_SIZE    3
+#define SQ_VTX_WORD1_DST_SEL_W_SIZE    3
+#define SQ_VTX_WORD1_USE_CONST_FIELDS_SIZE 1
+#define SQ_VTX_WORD1_DATA_FORMAT_SIZE  6
+#define SQ_VTX_WORD1_NUM_FORMAT_ALL_SIZE 2
+#define SQ_VTX_WORD1_FORMAT_COMP_ALL_SIZE 1
+#define SQ_VTX_WORD1_SRF_MODE_ALL_SIZE 1
+
+#define SQ_VTX_WORD1_DST_SEL_X_SHIFT   9   /* *_SHIFT: index of the field's lowest bit; bits 0-8 have no field in this register definition */
+#define SQ_VTX_WORD1_DST_SEL_Y_SHIFT   12
+#define SQ_VTX_WORD1_DST_SEL_Z_SHIFT   15
+#define SQ_VTX_WORD1_DST_SEL_W_SHIFT   18
+#define SQ_VTX_WORD1_USE_CONST_FIELDS_SHIFT 21
+#define SQ_VTX_WORD1_DATA_FORMAT_SHIFT 22
+#define SQ_VTX_WORD1_NUM_FORMAT_ALL_SHIFT 28
+#define SQ_VTX_WORD1_FORMAT_COMP_ALL_SHIFT 30
+#define SQ_VTX_WORD1_SRF_MODE_ALL_SHIFT 31
+
+#define SQ_VTX_WORD1_DST_SEL_X_MASK    0x00000e00   /* *_MASK: *_SIZE one-bits starting at *_SHIFT */
+#define SQ_VTX_WORD1_DST_SEL_Y_MASK    0x00007000
+#define SQ_VTX_WORD1_DST_SEL_Z_MASK    0x00038000
+#define SQ_VTX_WORD1_DST_SEL_W_MASK    0x001c0000
+#define SQ_VTX_WORD1_USE_CONST_FIELDS_MASK 0x00200000
+#define SQ_VTX_WORD1_DATA_FORMAT_MASK  0x0fc00000
+#define SQ_VTX_WORD1_NUM_FORMAT_ALL_MASK 0x30000000
+#define SQ_VTX_WORD1_FORMAT_COMP_ALL_MASK 0x40000000
+#define SQ_VTX_WORD1_SRF_MODE_ALL_MASK 0x80000000
+
+#define SQ_VTX_WORD1_MASK \
+     (SQ_VTX_WORD1_DST_SEL_X_MASK | \
+      SQ_VTX_WORD1_DST_SEL_Y_MASK | \
+      SQ_VTX_WORD1_DST_SEL_Z_MASK | \
+      SQ_VTX_WORD1_DST_SEL_W_MASK | \
+      SQ_VTX_WORD1_USE_CONST_FIELDS_MASK | \
+      SQ_VTX_WORD1_DATA_FORMAT_MASK | \
+      SQ_VTX_WORD1_NUM_FORMAT_ALL_MASK | \
+      SQ_VTX_WORD1_FORMAT_COMP_ALL_MASK | \
+      SQ_VTX_WORD1_SRF_MODE_ALL_MASK)   /* OR of all field masks = 0xfffffe00 */
+
+#define SQ_VTX_WORD1_DEFAULT           0xcdcdcc00   /* equals 0xcdcdcdcd & SQ_VTX_WORD1_MASK; NOTE(review): presumably a fill pattern - confirm */
+/* GET_<FIELD>(word): mask one field out of a raw SQ_VTX_WORD1 value and shift it down to bit 0; the argument is parenthesized so compound expressions are safe. */
+#define SQ_VTX_WORD1_GET_DST_SEL_X(sq_vtx_word1) \
+     (((sq_vtx_word1) & SQ_VTX_WORD1_DST_SEL_X_MASK) >> SQ_VTX_WORD1_DST_SEL_X_SHIFT)
+#define SQ_VTX_WORD1_GET_DST_SEL_Y(sq_vtx_word1) \
+     (((sq_vtx_word1) & SQ_VTX_WORD1_DST_SEL_Y_MASK) >> SQ_VTX_WORD1_DST_SEL_Y_SHIFT)
+#define SQ_VTX_WORD1_GET_DST_SEL_Z(sq_vtx_word1) \
+     (((sq_vtx_word1) & SQ_VTX_WORD1_DST_SEL_Z_MASK) >> SQ_VTX_WORD1_DST_SEL_Z_SHIFT)
+#define SQ_VTX_WORD1_GET_DST_SEL_W(sq_vtx_word1) \
+     (((sq_vtx_word1) & SQ_VTX_WORD1_DST_SEL_W_MASK) >> SQ_VTX_WORD1_DST_SEL_W_SHIFT)
+#define SQ_VTX_WORD1_GET_USE_CONST_FIELDS(sq_vtx_word1) \
+     (((sq_vtx_word1) & SQ_VTX_WORD1_USE_CONST_FIELDS_MASK) >> SQ_VTX_WORD1_USE_CONST_FIELDS_SHIFT)
+#define SQ_VTX_WORD1_GET_DATA_FORMAT(sq_vtx_word1) \
+     (((sq_vtx_word1) & SQ_VTX_WORD1_DATA_FORMAT_MASK) >> SQ_VTX_WORD1_DATA_FORMAT_SHIFT)
+#define SQ_VTX_WORD1_GET_NUM_FORMAT_ALL(sq_vtx_word1) \
+     (((sq_vtx_word1) & SQ_VTX_WORD1_NUM_FORMAT_ALL_MASK) >> SQ_VTX_WORD1_NUM_FORMAT_ALL_SHIFT)
+#define SQ_VTX_WORD1_GET_FORMAT_COMP_ALL(sq_vtx_word1) \
+     (((sq_vtx_word1) & SQ_VTX_WORD1_FORMAT_COMP_ALL_MASK) >> SQ_VTX_WORD1_FORMAT_COMP_ALL_SHIFT)
+#define SQ_VTX_WORD1_GET_SRF_MODE_ALL(sq_vtx_word1) \
+     (((sq_vtx_word1) & SQ_VTX_WORD1_SRF_MODE_ALL_MASK) >> SQ_VTX_WORD1_SRF_MODE_ALL_SHIFT)
+/* SET_<FIELD>(reg, v): replace one field of lvalue reg (evaluated twice). v is cast to unsigned int before the shift (so a 1 at bit 31 is well-defined) and is not range-masked, so oversized values spill into higher fields. */
+#define SQ_VTX_WORD1_SET_DST_SEL_X(sq_vtx_word1_reg, dst_sel_x) \
+     ((sq_vtx_word1_reg) = ((sq_vtx_word1_reg) & ~SQ_VTX_WORD1_DST_SEL_X_MASK) | ((unsigned int)(dst_sel_x) << SQ_VTX_WORD1_DST_SEL_X_SHIFT))
+#define SQ_VTX_WORD1_SET_DST_SEL_Y(sq_vtx_word1_reg, dst_sel_y) \
+     ((sq_vtx_word1_reg) = ((sq_vtx_word1_reg) & ~SQ_VTX_WORD1_DST_SEL_Y_MASK) | ((unsigned int)(dst_sel_y) << SQ_VTX_WORD1_DST_SEL_Y_SHIFT))
+#define SQ_VTX_WORD1_SET_DST_SEL_Z(sq_vtx_word1_reg, dst_sel_z) \
+     ((sq_vtx_word1_reg) = ((sq_vtx_word1_reg) & ~SQ_VTX_WORD1_DST_SEL_Z_MASK) | ((unsigned int)(dst_sel_z) << SQ_VTX_WORD1_DST_SEL_Z_SHIFT))
+#define SQ_VTX_WORD1_SET_DST_SEL_W(sq_vtx_word1_reg, dst_sel_w) \
+     ((sq_vtx_word1_reg) = ((sq_vtx_word1_reg) & ~SQ_VTX_WORD1_DST_SEL_W_MASK) | ((unsigned int)(dst_sel_w) << SQ_VTX_WORD1_DST_SEL_W_SHIFT))
+#define SQ_VTX_WORD1_SET_USE_CONST_FIELDS(sq_vtx_word1_reg, use_const_fields) \
+     ((sq_vtx_word1_reg) = ((sq_vtx_word1_reg) & ~SQ_VTX_WORD1_USE_CONST_FIELDS_MASK) | ((unsigned int)(use_const_fields) << SQ_VTX_WORD1_USE_CONST_FIELDS_SHIFT))
+#define SQ_VTX_WORD1_SET_DATA_FORMAT(sq_vtx_word1_reg, data_format) \
+     ((sq_vtx_word1_reg) = ((sq_vtx_word1_reg) & ~SQ_VTX_WORD1_DATA_FORMAT_MASK) | ((unsigned int)(data_format) << SQ_VTX_WORD1_DATA_FORMAT_SHIFT))
+#define SQ_VTX_WORD1_SET_NUM_FORMAT_ALL(sq_vtx_word1_reg, num_format_all) \
+     ((sq_vtx_word1_reg) = ((sq_vtx_word1_reg) & ~SQ_VTX_WORD1_NUM_FORMAT_ALL_MASK) | ((unsigned int)(num_format_all) << SQ_VTX_WORD1_NUM_FORMAT_ALL_SHIFT))
+#define SQ_VTX_WORD1_SET_FORMAT_COMP_ALL(sq_vtx_word1_reg, format_comp_all) \
+     ((sq_vtx_word1_reg) = ((sq_vtx_word1_reg) & ~SQ_VTX_WORD1_FORMAT_COMP_ALL_MASK) | ((unsigned int)(format_comp_all) << SQ_VTX_WORD1_FORMAT_COMP_ALL_SHIFT))
+#define SQ_VTX_WORD1_SET_SRF_MODE_ALL(sq_vtx_word1_reg, srf_mode_all) \
+     ((sq_vtx_word1_reg) = ((sq_vtx_word1_reg) & ~SQ_VTX_WORD1_SRF_MODE_ALL_MASK) | ((unsigned int)(srf_mode_all) << SQ_VTX_WORD1_SRF_MODE_ALL_SHIFT))
+
+/*
+ * Bit-field view of SQ_VTX_WORD1.  Two declarations are provided, selected by
+ * LITTLEENDIAN_CPU / BIGENDIAN_CPU, listing the same members in opposite
+ * order.  If neither macro is defined, no struct is declared here.
+ * NOTE(review): presumably the orderings are chosen so each member lands on
+ * the bits named by the matching _SHIFT and _MASK constants; bit-field
+ * allocation order is implementation-defined, so confirm for the target ABI.
+ */
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_vtx_word1_t {
+          /* 9 unnamed padding bits precede the first named field */
+          unsigned int                                : 9;
+          unsigned int dst_sel_x                      : SQ_VTX_WORD1_DST_SEL_X_SIZE;
+          unsigned int dst_sel_y                      : SQ_VTX_WORD1_DST_SEL_Y_SIZE;
+          unsigned int dst_sel_z                      : SQ_VTX_WORD1_DST_SEL_Z_SIZE;
+          unsigned int dst_sel_w                      : SQ_VTX_WORD1_DST_SEL_W_SIZE;
+          unsigned int use_const_fields               : SQ_VTX_WORD1_USE_CONST_FIELDS_SIZE;
+          unsigned int data_format                    : SQ_VTX_WORD1_DATA_FORMAT_SIZE;
+          unsigned int num_format_all                 : SQ_VTX_WORD1_NUM_FORMAT_ALL_SIZE;
+          unsigned int format_comp_all                : SQ_VTX_WORD1_FORMAT_COMP_ALL_SIZE;
+          unsigned int srf_mode_all                   : SQ_VTX_WORD1_SRF_MODE_ALL_SIZE;
+     } sq_vtx_word1_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     /* Same members as the little-endian variant, declared in reverse order */
+     typedef struct _sq_vtx_word1_t {
+          unsigned int srf_mode_all                   : SQ_VTX_WORD1_SRF_MODE_ALL_SIZE;
+          unsigned int format_comp_all                : SQ_VTX_WORD1_FORMAT_COMP_ALL_SIZE;
+          unsigned int num_format_all                 : SQ_VTX_WORD1_NUM_FORMAT_ALL_SIZE;
+          unsigned int data_format                    : SQ_VTX_WORD1_DATA_FORMAT_SIZE;
+          unsigned int use_const_fields               : SQ_VTX_WORD1_USE_CONST_FIELDS_SIZE;
+          unsigned int dst_sel_w                      : SQ_VTX_WORD1_DST_SEL_W_SIZE;
+          unsigned int dst_sel_z                      : SQ_VTX_WORD1_DST_SEL_Z_SIZE;
+          unsigned int dst_sel_y                      : SQ_VTX_WORD1_DST_SEL_Y_SIZE;
+          unsigned int dst_sel_x                      : SQ_VTX_WORD1_DST_SEL_X_SIZE;
+          unsigned int                                : 9;
+     } sq_vtx_word1_t;
+
+#endif
+
+/* Overlay of the raw 32-bit value (val) with its bit-field view (f). */
+typedef union {
+     unsigned int val : 32;
+     sq_vtx_word1_t f;
+} sq_vtx_word1_u;
+
+
+/*
+ * SQ_VTX_WORD1_GPR struct
+ */
+
+/* Field widths, in bits */
+#define SQ_VTX_WORD1_GPR_DST_GPR_SIZE  7
+#define SQ_VTX_WORD1_GPR_DST_REL_SIZE  1
+
+/* Position of each field's least significant bit */
+#define SQ_VTX_WORD1_GPR_DST_GPR_SHIFT 0
+#define SQ_VTX_WORD1_GPR_DST_REL_SHIFT 7
+
+/* In-place masks (already shifted to the field position) */
+#define SQ_VTX_WORD1_GPR_DST_GPR_MASK  0x0000007f
+#define SQ_VTX_WORD1_GPR_DST_REL_MASK  0x00000080
+
+/* Union of all field masks defined for this word */
+#define SQ_VTX_WORD1_GPR_MASK \
+     (SQ_VTX_WORD1_GPR_DST_GPR_MASK | \
+      SQ_VTX_WORD1_GPR_DST_REL_MASK)
+
+/* NOTE(review): origin of this default pattern is not evident from this header */
+#define SQ_VTX_WORD1_GPR_DEFAULT       0x000000cd
+
+/*
+ * Extractors: mask the field and shift it down to bit 0.  The argument is
+ * parenthesized so compound expressions are masked as a whole.
+ */
+#define SQ_VTX_WORD1_GPR_GET_DST_GPR(sq_vtx_word1_gpr) \
+     (((sq_vtx_word1_gpr) & SQ_VTX_WORD1_GPR_DST_GPR_MASK) >> SQ_VTX_WORD1_GPR_DST_GPR_SHIFT)
+#define SQ_VTX_WORD1_GPR_GET_DST_REL(sq_vtx_word1_gpr) \
+     (((sq_vtx_word1_gpr) & SQ_VTX_WORD1_GPR_DST_REL_MASK) >> SQ_VTX_WORD1_GPR_DST_REL_SHIFT)
+
+/*
+ * Setters: clear the field in the lvalue passed first, then OR in the new
+ * value at the field position.  The value is not masked to the field width.
+ * The first argument is evaluated twice.
+ */
+#define SQ_VTX_WORD1_GPR_SET_DST_GPR(sq_vtx_word1_gpr_reg, dst_gpr) \
+     ((sq_vtx_word1_gpr_reg) = ((sq_vtx_word1_gpr_reg) & ~SQ_VTX_WORD1_GPR_DST_GPR_MASK) | ((dst_gpr) << SQ_VTX_WORD1_GPR_DST_GPR_SHIFT))
+#define SQ_VTX_WORD1_GPR_SET_DST_REL(sq_vtx_word1_gpr_reg, dst_rel) \
+     ((sq_vtx_word1_gpr_reg) = ((sq_vtx_word1_gpr_reg) & ~SQ_VTX_WORD1_GPR_DST_REL_MASK) | ((dst_rel) << SQ_VTX_WORD1_GPR_DST_REL_SHIFT))
+
+/*
+ * Bit-field view, declared once per endianness macro with the members in
+ * opposite order.  No struct is declared if neither macro is defined.
+ */
+#if defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_vtx_word1_gpr_t {
+          unsigned int dst_gpr                        : SQ_VTX_WORD1_GPR_DST_GPR_SIZE;
+          unsigned int dst_rel                        : SQ_VTX_WORD1_GPR_DST_REL_SIZE;
+          unsigned int                                : 24;
+     } sq_vtx_word1_gpr_t;
+
+#elif defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_vtx_word1_gpr_t {
+          unsigned int                                : 24;
+          unsigned int dst_rel                        : SQ_VTX_WORD1_GPR_DST_REL_SIZE;
+          unsigned int dst_gpr                        : SQ_VTX_WORD1_GPR_DST_GPR_SIZE;
+     } sq_vtx_word1_gpr_t;
+
+#endif
+
+/* Overlay of the raw 32-bit value (val) with its bit-field view (f). */
+typedef union {
+     unsigned int val : 32;
+     sq_vtx_word1_gpr_t f;
+} sq_vtx_word1_gpr_u;
+
+
+/*
+ * SQ_VTX_WORD1_SEM struct
+ */
+
+/* Field width, in bits */
+#define SQ_VTX_WORD1_SEM_SEMANTIC_ID_SIZE 8
+
+/* Position of the field's least significant bit */
+#define SQ_VTX_WORD1_SEM_SEMANTIC_ID_SHIFT 0
+
+/* In-place mask of the field */
+#define SQ_VTX_WORD1_SEM_SEMANTIC_ID_MASK 0x000000ff
+
+/* Union of all field masks defined for this word */
+#define SQ_VTX_WORD1_SEM_MASK \
+     (SQ_VTX_WORD1_SEM_SEMANTIC_ID_MASK)
+
+/* NOTE(review): origin of this default pattern is not evident from this header */
+#define SQ_VTX_WORD1_SEM_DEFAULT       0x000000cd
+
+/* Extractor: the argument is parenthesized so compound expressions are masked as a whole. */
+#define SQ_VTX_WORD1_SEM_GET_SEMANTIC_ID(sq_vtx_word1_sem) \
+     (((sq_vtx_word1_sem) & SQ_VTX_WORD1_SEM_SEMANTIC_ID_MASK) >> SQ_VTX_WORD1_SEM_SEMANTIC_ID_SHIFT)
+
+/*
+ * Setter: clears the field in the lvalue passed first and ORs in the new
+ * value.  The value is not masked to the field width; the first argument is
+ * evaluated twice.
+ */
+#define SQ_VTX_WORD1_SEM_SET_SEMANTIC_ID(sq_vtx_word1_sem_reg, semantic_id) \
+     ((sq_vtx_word1_sem_reg) = ((sq_vtx_word1_sem_reg) & ~SQ_VTX_WORD1_SEM_SEMANTIC_ID_MASK) | ((semantic_id) << SQ_VTX_WORD1_SEM_SEMANTIC_ID_SHIFT))
+
+/*
+ * Bit-field view, declared once per endianness macro with the members in
+ * opposite order.  No struct is declared if neither macro is defined.
+ */
+#if defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_vtx_word1_sem_t {
+          unsigned int semantic_id                    : SQ_VTX_WORD1_SEM_SEMANTIC_ID_SIZE;
+          unsigned int                                : 24;
+     } sq_vtx_word1_sem_t;
+
+#elif defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_vtx_word1_sem_t {
+          unsigned int                                : 24;
+          unsigned int semantic_id                    : SQ_VTX_WORD1_SEM_SEMANTIC_ID_SIZE;
+     } sq_vtx_word1_sem_t;
+
+#endif
+
+/* Overlay of the raw 32-bit value (val) with its bit-field view (f). */
+typedef union {
+     unsigned int val : 32;
+     sq_vtx_word1_sem_t f;
+} sq_vtx_word1_sem_u;
+
+
+/*
+ * SQ_VTX_WORD2 struct
+ */
+
+/* Field widths, in bits */
+#define SQ_VTX_WORD2_OFFSET_SIZE       16
+#define SQ_VTX_WORD2_ENDIAN_SWAP_SIZE  2
+#define SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_SIZE 1
+#define SQ_VTX_WORD2_MEGA_FETCH_SIZE   1
+#define SQ_VTX_WORD2_ALT_CONST_SIZE    1
+
+/* Position of each field's least significant bit */
+#define SQ_VTX_WORD2_OFFSET_SHIFT      0
+#define SQ_VTX_WORD2_ENDIAN_SWAP_SHIFT 16
+#define SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_SHIFT 18
+#define SQ_VTX_WORD2_MEGA_FETCH_SHIFT  19
+#define SQ_VTX_WORD2_ALT_CONST_SHIFT   20
+
+/* In-place masks (already shifted to the field position) */
+#define SQ_VTX_WORD2_OFFSET_MASK       0x0000ffff
+#define SQ_VTX_WORD2_ENDIAN_SWAP_MASK  0x00030000
+#define SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_MASK 0x00040000
+#define SQ_VTX_WORD2_MEGA_FETCH_MASK   0x00080000
+#define SQ_VTX_WORD2_ALT_CONST_MASK    0x00100000
+
+/* Union of all field masks defined for this word */
+#define SQ_VTX_WORD2_MASK \
+     (SQ_VTX_WORD2_OFFSET_MASK | \
+      SQ_VTX_WORD2_ENDIAN_SWAP_MASK | \
+      SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_MASK | \
+      SQ_VTX_WORD2_MEGA_FETCH_MASK | \
+      SQ_VTX_WORD2_ALT_CONST_MASK)
+
+/* NOTE(review): origin of this default pattern is not evident from this header */
+#define SQ_VTX_WORD2_DEFAULT           0x000dcdcd
+
+/*
+ * Extractors: mask the field and shift it down to bit 0.  The argument is
+ * parenthesized so compound expressions are masked as a whole.
+ */
+#define SQ_VTX_WORD2_GET_OFFSET(sq_vtx_word2) \
+     (((sq_vtx_word2) & SQ_VTX_WORD2_OFFSET_MASK) >> SQ_VTX_WORD2_OFFSET_SHIFT)
+#define SQ_VTX_WORD2_GET_ENDIAN_SWAP(sq_vtx_word2) \
+     (((sq_vtx_word2) & SQ_VTX_WORD2_ENDIAN_SWAP_MASK) >> SQ_VTX_WORD2_ENDIAN_SWAP_SHIFT)
+#define SQ_VTX_WORD2_GET_CONST_BUF_NO_STRIDE(sq_vtx_word2) \
+     (((sq_vtx_word2) & SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_MASK) >> SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_SHIFT)
+#define SQ_VTX_WORD2_GET_MEGA_FETCH(sq_vtx_word2) \
+     (((sq_vtx_word2) & SQ_VTX_WORD2_MEGA_FETCH_MASK) >> SQ_VTX_WORD2_MEGA_FETCH_SHIFT)
+#define SQ_VTX_WORD2_GET_ALT_CONST(sq_vtx_word2) \
+     (((sq_vtx_word2) & SQ_VTX_WORD2_ALT_CONST_MASK) >> SQ_VTX_WORD2_ALT_CONST_SHIFT)
+
+/*
+ * Setters: clear the field in the lvalue passed first, then OR in the new
+ * value at the field position.  The value is not masked to the field width.
+ * The first argument is evaluated twice.
+ */
+#define SQ_VTX_WORD2_SET_OFFSET(sq_vtx_word2_reg, offset) \
+     ((sq_vtx_word2_reg) = ((sq_vtx_word2_reg) & ~SQ_VTX_WORD2_OFFSET_MASK) | ((offset) << SQ_VTX_WORD2_OFFSET_SHIFT))
+#define SQ_VTX_WORD2_SET_ENDIAN_SWAP(sq_vtx_word2_reg, endian_swap) \
+     ((sq_vtx_word2_reg) = ((sq_vtx_word2_reg) & ~SQ_VTX_WORD2_ENDIAN_SWAP_MASK) | ((endian_swap) << SQ_VTX_WORD2_ENDIAN_SWAP_SHIFT))
+#define SQ_VTX_WORD2_SET_CONST_BUF_NO_STRIDE(sq_vtx_word2_reg, const_buf_no_stride) \
+     ((sq_vtx_word2_reg) = ((sq_vtx_word2_reg) & ~SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_MASK) | ((const_buf_no_stride) << SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_SHIFT))
+#define SQ_VTX_WORD2_SET_MEGA_FETCH(sq_vtx_word2_reg, mega_fetch) \
+     ((sq_vtx_word2_reg) = ((sq_vtx_word2_reg) & ~SQ_VTX_WORD2_MEGA_FETCH_MASK) | ((mega_fetch) << SQ_VTX_WORD2_MEGA_FETCH_SHIFT))
+#define SQ_VTX_WORD2_SET_ALT_CONST(sq_vtx_word2_reg, alt_const) \
+     ((sq_vtx_word2_reg) = ((sq_vtx_word2_reg) & ~SQ_VTX_WORD2_ALT_CONST_MASK) | ((alt_const) << SQ_VTX_WORD2_ALT_CONST_SHIFT))
+
+/*
+ * Bit-field view, declared once per endianness macro with the members in
+ * opposite order.  No struct is declared if neither macro is defined.
+ */
+#if defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_vtx_word2_t {
+          unsigned int offset                         : SQ_VTX_WORD2_OFFSET_SIZE;
+          unsigned int endian_swap                    : SQ_VTX_WORD2_ENDIAN_SWAP_SIZE;
+          unsigned int const_buf_no_stride            : SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_SIZE;
+          unsigned int mega_fetch                     : SQ_VTX_WORD2_MEGA_FETCH_SIZE;
+          unsigned int alt_const                      : SQ_VTX_WORD2_ALT_CONST_SIZE;
+          unsigned int                                : 11;
+     } sq_vtx_word2_t;
+
+#elif defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_vtx_word2_t {
+          unsigned int                                : 11;
+          unsigned int alt_const                      : SQ_VTX_WORD2_ALT_CONST_SIZE;
+          unsigned int mega_fetch                     : SQ_VTX_WORD2_MEGA_FETCH_SIZE;
+          unsigned int const_buf_no_stride            : SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_SIZE;
+          unsigned int endian_swap                    : SQ_VTX_WORD2_ENDIAN_SWAP_SIZE;
+          unsigned int offset                         : SQ_VTX_WORD2_OFFSET_SIZE;
+     } sq_vtx_word2_t;
+
+#endif
+
+/* Overlay of the raw 32-bit value (val) with its bit-field view (f). */
+typedef union {
+     unsigned int val : 32;
+     sq_vtx_word2_t f;
+} sq_vtx_word2_u;
+
+#endif /* _SQ_MICRO_REG_H */
+
+
diff --git a/src/mesa/drivers/dri/radeon/Makefile b/src/mesa/drivers/dri/radeon/Makefile
index f223b2d922..1f286776b5 100644
--- a/src/mesa/drivers/dri/radeon/Makefile
+++ b/src/mesa/drivers/dri/radeon/Makefile
@@ -4,31 +4,53 @@
 TOP = ../../../../..
 include $(TOP)/configs/current
 
+CFLAGS += $(RADEON_CFLAGS)
+
 LIBNAME = radeon_dri.so
 
 MINIGLX_SOURCES = server/radeon_dri.c 
 
+ifeq ($(RADEON_LDFLAGS),)
+CS_SOURCES = radeon_cs_space_drm.c
+endif
+
+RADEON_COMMON_SOURCES = \
+	radeon_bo_legacy.c \
+	radeon_common_context.c \
+	radeon_common.c \
+	radeon_cs_legacy.c \
+	radeon_dma.c \
+	radeon_debug.c \
+	radeon_fbo.c \
+	radeon_lock.c \
+	radeon_mipmap_tree.c \
+	radeon_queryobj.c \
+	radeon_span.c \
+	radeon_texture.c
+
 DRIVER_SOURCES = \
 	radeon_context.c \
 	radeon_ioctl.c \
-	radeon_lock.c \
 	radeon_screen.c \
 	radeon_state.c \
 	radeon_state_init.c \
 	radeon_tex.c \
-	radeon_texmem.c \
 	radeon_texstate.c \
 	radeon_tcl.c \
 	radeon_swtcl.c \
-	radeon_span.c \
 	radeon_maos.c \
-	radeon_sanity.c 
+	radeon_sanity.c \
+	$(RADEON_COMMON_SOURCES)
 
 C_SOURCES = \
 	$(COMMON_SOURCES) \
-	$(DRIVER_SOURCES) 
+	$(DRIVER_SOURCES) \
+	$(CS_SOURCES)
+
+DRIVER_DEFINES = -DRADEON_COMMON=0 \
+				 -Wall
 
-DRIVER_DEFINES = -DRADEON_COMMON=0
+DRI_LIB_DEPS += $(RADEON_LDFLAGS)
 
 X86_SOURCES = 
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_drm.h b/src/mesa/drivers/dri/radeon/radeon_bo_drm.h
new file mode 100644
index 0000000000..7141371633
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_bo_drm.h
@@ -0,0 +1,219 @@
+/* 
+ * Copyright © 2008 Jérôme Glisse
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Jérôme Glisse <glisse@freedesktop.org>
+ */
+#ifndef RADEON_BO_H
+#define RADEON_BO_H
+
+#include <stdio.h>
+#include <stdint.h>
+//#include "radeon_track.h"
+
+/* bo object */
+#define RADEON_BO_FLAGS_MACRO_TILE  1
+#define RADEON_BO_FLAGS_MICRO_TILE  2
+
+struct radeon_bo_manager;
+
+/*
+ * Generic buffer object.  All operations on it are dispatched through the
+ * function table of the manager referenced by bom.
+ */
+struct radeon_bo {
+    uint32_t                    alignment;
+    uint32_t                    handle;
+    uint32_t                    size;
+    uint32_t                    domains;
+    uint32_t                    flags;
+    /* counter adjusted by _radeon_bo_ref() / _radeon_bo_unref() */
+    unsigned                    cref;
+#ifdef RADEON_BO_TRACK
+    /* per-buffer event log, only present in tracking builds */
+    struct radeon_track         *track;
+#endif
+    /* NOTE(review): presumably the CPU mapping of the buffer; confirm against bo_map implementations */
+    void                        *ptr;
+    /* owning manager; provides the funcs table used by the inline wrappers */
+    struct radeon_bo_manager    *bom;
+    uint32_t                    space_accounted;
+};
+
+/* bo functions */
+/*
+ * Backend operations table.  The inline wrappers in this header call every
+ * hook unconditionally except bo_is_static, which radeon_bo_is_static()
+ * checks for NULL first.
+ */
+struct radeon_bo_funcs {
+    struct radeon_bo *(*bo_open)(struct radeon_bo_manager *bom,
+                                 uint32_t handle,
+                                 uint32_t size,
+                                 uint32_t alignment,
+                                 uint32_t domains,
+                                 uint32_t flags);
+    void (*bo_ref)(struct radeon_bo *bo);
+    /* the return value is handed back to the caller of radeon_bo_unref() */
+    struct radeon_bo *(*bo_unref)(struct radeon_bo *bo);
+    int (*bo_map)(struct radeon_bo *bo, int write);
+    int (*bo_unmap)(struct radeon_bo *bo);
+    int (*bo_wait)(struct radeon_bo *bo);
+    /* optional: may be NULL */
+    int (*bo_is_static)(struct radeon_bo *bo);
+    int (*bo_set_tiling)(struct radeon_bo *bo, uint32_t tiling_flags,
+			  uint32_t pitch);
+    int (*bo_get_tiling)(struct radeon_bo *bo, uint32_t *tiling_flags,
+			  uint32_t *pitch);
+    int (*bo_is_busy)(struct radeon_bo *bo, uint32_t *domain);
+};
+
+/*
+ * Buffer manager base: the backend function table plus a file descriptor.
+ * NOTE(review): fd is presumably the DRM device descriptor; confirm.
+ */
+struct radeon_bo_manager {
+    struct radeon_bo_funcs  *funcs;
+    int                     fd;
+
+#ifdef RADEON_BO_TRACK
+    /* NOTE(review): needs struct radeon_tracker, but the radeon_track.h
+     * include in this header is commented out; verify tracking builds. */
+    struct radeon_tracker   tracker;
+#endif
+};
+    
+/*
+ * Print a one-line description of a buffer to stderr: the operation label,
+ * the buffer address, its handle, size and reference count, followed by the
+ * call site (file, function, line).
+ *
+ * bo must not be NULL; it is dereferenced unconditionally.
+ */
+static inline void _radeon_bo_debug(struct radeon_bo *bo,
+                                    const char *op,
+                                    const char *file,
+                                    const char *func,
+                                    int line)
+{
+    /* %p requires a void pointer argument, hence the explicit cast */
+    fprintf(stderr, "%s %p 0x%08X 0x%08X 0x%08X [%s %s %d]\n",
+            op, (void *)bo, bo->handle, bo->size, bo->cref, file, func, line);
+}
+
+/*
+ * Open a buffer through the manager's bo_open hook and return whatever the
+ * hook returns (possibly NULL).  In RADEON_BO_TRACK builds a successful open
+ * also creates a track for the buffer and logs an "open" event with the
+ * caller's file/func/line; otherwise those three arguments are unused.
+ */
+static inline struct radeon_bo *_radeon_bo_open(struct radeon_bo_manager *bom,
+                                                uint32_t handle,
+                                                uint32_t size,
+                                                uint32_t alignment,
+                                                uint32_t domains,
+                                                uint32_t flags,
+                                                const char *file,
+                                                const char *func,
+                                                int line)
+{
+    struct radeon_bo *opened =
+        bom->funcs->bo_open(bom, handle, size, alignment, domains, flags);
+
+#ifdef RADEON_BO_TRACK
+    if (opened != NULL) {
+        opened->track = radeon_tracker_add_track(&bom->tracker, opened->handle);
+        radeon_track_add_event(opened->track, file, func, "open", line);
+    }
+#endif
+    return opened;
+}
+
+/*
+ * Take a reference: bump the buffer's cref, log a "ref" event in
+ * RADEON_BO_TRACK builds, then notify the backend through bo_ref.
+ */
+static inline void _radeon_bo_ref(struct radeon_bo *bo,
+                                  const char *file,
+                                  const char *func,
+                                  int line)
+{
+    bo->cref += 1;
+#ifdef RADEON_BO_TRACK
+    radeon_track_add_event(bo->track, file, func, "ref", line);
+#endif
+    (bo->bom->funcs->bo_ref)(bo);
+}
+
+/*
+ * Drop a reference: decrement cref, log an "unref" event in RADEON_BO_TRACK
+ * builds (removing the track once the count reaches zero), then hand the
+ * buffer to the backend's bo_unref hook and return that hook's result.
+ */
+static inline struct radeon_bo *_radeon_bo_unref(struct radeon_bo *bo,
+                                                 const char *file,
+                                                 const char *func,
+                                                 int line)
+{
+    bo->cref -= 1;
+#ifdef RADEON_BO_TRACK
+    radeon_track_add_event(bo->track, file, func, "unref", line);
+    /* cref is unsigned, so "no references left" is exactly zero */
+    if (bo->cref == 0) {
+        radeon_tracker_remove_track(&bo->bom->tracker, bo->track);
+        bo->track = NULL;
+    }
+#endif
+    return (bo->bom->funcs->bo_unref)(bo);
+}
+
+/*
+ * Forward a map request to the backend's bo_map hook and return its result.
+ * file/func/line describe the call site and are not used here.
+ */
+static inline int _radeon_bo_map(struct radeon_bo *bo,
+                                 int write,
+                                 const char *file,
+                                 const char *func,
+                                 int line)
+{
+    struct radeon_bo_funcs *ops = bo->bom->funcs;
+
+    return ops->bo_map(bo, write);
+}
+
+/*
+ * Forward an unmap request to the backend's bo_unmap hook and return its
+ * result.  file/func/line describe the call site and are not used here.
+ */
+static inline int _radeon_bo_unmap(struct radeon_bo *bo,
+                                   const char *file,
+                                   const char *func,
+                                   int line)
+{
+    struct radeon_bo_funcs *ops = bo->bom->funcs;
+
+    return ops->bo_unmap(bo);
+}
+
+/*
+ * Forward a wait request to the backend's bo_wait hook and return its
+ * result.  file/func/line describe the call site and are not used here.
+ */
+static inline int _radeon_bo_wait(struct radeon_bo *bo,
+                                  const char *file,
+                                  const char *func,
+                                  int line)
+{
+    struct radeon_bo_funcs *ops = bo->bom->funcs;
+
+    return ops->bo_wait(bo);
+}
+
+/*
+ * Ask the backend's bo_is_busy hook about the buffer and return its result;
+ * domain is passed through to the hook untouched.  file/func/line describe
+ * the call site and are not used here.
+ */
+static inline int _radeon_bo_is_busy(struct radeon_bo *bo,
+                                     uint32_t *domain,
+                                     const char *file,
+                                     const char *func,
+                                     int line)
+{
+    struct radeon_bo_funcs *ops = bo->bom->funcs;
+
+    return ops->bo_is_busy(bo, domain);
+}
+
+/* Pass tiling flags and pitch to the backend's bo_set_tiling hook; returns the hook's result. */
+static inline int radeon_bo_set_tiling(struct radeon_bo *bo,
+                                       uint32_t tiling_flags, uint32_t pitch)
+{
+    struct radeon_bo_funcs *ops = bo->bom->funcs;
+
+    return ops->bo_set_tiling(bo, tiling_flags, pitch);
+}
+
+/* Query tiling flags and pitch through the backend's bo_get_tiling hook; returns the hook's result. */
+static inline int radeon_bo_get_tiling(struct radeon_bo *bo,
+                                       uint32_t *tiling_flags, uint32_t *pitch)
+{
+    struct radeon_bo_funcs *ops = bo->bom->funcs;
+
+    return ops->bo_get_tiling(bo, tiling_flags, pitch);
+}
+
+/*
+ * Return the backend's bo_is_static answer for the buffer, or 0 when the
+ * backend does not provide that hook.
+ */
+static inline int radeon_bo_is_static(struct radeon_bo *bo)
+{
+    int (*is_static)(struct radeon_bo *) = bo->bom->funcs->bo_is_static;
+
+    return is_static ? is_static(bo) : 0;
+}
+
+/*
+ * Public entry points.  Each macro forwards to the matching underscore-
+ * prefixed inline function and appends the call site (file, function, line).
+ * The standard __func__ identifier is used throughout instead of mixing it
+ * with the compiler-specific __FUNCTION__ spelling.
+ */
+#define radeon_bo_open(bom, h, s, a, d, f)\
+    _radeon_bo_open(bom, h, s, a, d, f, __FILE__, __func__, __LINE__)
+#define radeon_bo_ref(bo)\
+    _radeon_bo_ref(bo, __FILE__, __func__, __LINE__)
+#define radeon_bo_unref(bo)\
+    _radeon_bo_unref(bo, __FILE__, __func__, __LINE__)
+#define radeon_bo_map(bo, w)\
+    _radeon_bo_map(bo, w, __FILE__, __func__, __LINE__)
+#define radeon_bo_unmap(bo)\
+    _radeon_bo_unmap(bo, __FILE__, __func__, __LINE__)
+#define radeon_bo_debug(bo, opcode)\
+    _radeon_bo_debug(bo, opcode, __FILE__, __func__, __LINE__)
+#define radeon_bo_wait(bo) \
+    _radeon_bo_wait(bo, __FILE__, __func__, __LINE__)
+#define radeon_bo_is_busy(bo, domain) \
+    _radeon_bo_is_busy(bo, domain, __FILE__, __func__, __LINE__)
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
new file mode 100644
index 0000000000..3e7547d2f9
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
@@ -0,0 +1,926 @@
+/* 
+ * Copyright © 2008 Nicolai Haehnle
+ * Copyright © 2008 Dave Airlie
+ * Copyright © 2008 Jérôme Glisse
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Aapo Tahkola <aet@rasterburn.org>
+ *      Nicolai Haehnle <prefect_@gmx.net>
+ *      Dave Airlie
+ *      Jérôme Glisse <glisse@freedesktop.org>
+ */
+#include <stdio.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include "xf86drm.h"
+#include "texmem.h"
+#include "main/simple_list.h"
+
+#include "drm.h"
+#include "radeon_drm.h"
+#include "radeon_common.h"
+#include "radeon_bocs_wrapper.h"
+#include "radeon_macros.h"
+
+/* no seriously texmem.c is this screwed up */
+/*
+ * Wrapper that embeds a driTextureObject as its first member and records the
+ * legacy buffer it belongs to.  bo_legacy_tobj_destroy() casts the
+ * driTextureObject pointer back to this type to reach parent.
+ */
+struct bo_legacy_texture_object {
+    driTextureObject    base;
+    struct bo_legacy *parent;
+};
+
+/*
+ * Legacy buffer object.  base must stay the first member: the code casts
+ * between struct radeon_bo * and struct bo_legacy *.
+ */
+struct bo_legacy {
+    struct radeon_bo    base;
+    int                 map_count;
+    /* age value compared against the manager's current_age in legacy_is_pending() */
+    uint32_t            pending;
+    /* number of references legacy_is_pending() drops once the buffer retires */
+    int                 is_pending;
+    /* non-zero: bo_free() releases neither the handle nor the storage */
+    int                 static_bo;
+    uint32_t            offset;
+    struct bo_legacy_texture_object *tobj;
+    int                 validated;
+    int                 dirty;
+    /* storage pointer private to the legacy code; distinct from base.ptr */
+    void                *ptr;
+    /* links in the manager's list of all buffers (bos) */
+    struct bo_legacy    *next, *prev;
+    /* links in the manager's list of pending buffers (pending_bos) */
+    struct bo_legacy    *pnext, *pprev;
+};
+
+/*
+ * Legacy buffer manager.  base must stay the first member: the code casts
+ * between struct radeon_bo_manager * and struct bo_manager_legacy *.
+ */
+struct bo_manager_legacy {
+    struct radeon_bo_manager    base;
+    /* next never-used handle value handed out by legacy_new_handle() */
+    unsigned                    nhandle;
+    /* allocated capacity of free_handles, in entries */
+    unsigned                    nfree_handles;
+    /* number of entries currently stored in free_handles */
+    unsigned                    cfree_handles;
+    /* last age value fetched by legacy_get_current_age() */
+    uint32_t                    current_age;
+    /* list head for all buffers (linked through next/prev) */
+    struct bo_legacy            bos;
+    /* list head for pending buffers (linked through pnext/pprev) */
+    struct bo_legacy            pending_bos;
+    uint32_t                    fb_location;
+    uint32_t                    texture_offset;
+    /* running total and count maintained by bo_dma_alloc() / bo_dma_free() */
+    unsigned                    dma_alloc_size;
+    uint32_t                    dma_buf_count;
+    /* decremented by legacy_is_pending() when a buffer leaves the pending list */
+    unsigned                    cpendings;
+    driTextureObject            texture_swapped;
+    driTexHeap                  *texture_heap;
+    struct radeon_screen        *screen;
+    /* stack of recycled handles; 0 entries are holes skipped by clean_handles() */
+    unsigned                    *free_handles;
+};
+
+/*
+ * Texture-object destroy hook: detach the texture object from the buffer
+ * that owns it, if any, and mark that buffer as no longer validated.
+ * The data argument is unused.
+ */
+static void bo_legacy_tobj_destroy(void *data, driTextureObject *t)
+{
+    struct bo_legacy *owner = ((struct bo_legacy_texture_object *)t)->parent;
+
+    if (owner == NULL)
+        return;
+
+    owner->tobj = NULL;
+    owner->validated = 0;
+}
+
+/*
+ * Trim the recycled-handle stack: pop entries from the top for as long as
+ * the topmost entry is 0 (a hole), stopping at the first non-zero entry or
+ * when the stack is empty.
+ */
+static void inline clean_handles(struct bo_manager_legacy *bom)
+{
+    for (;;) {
+        if (bom->cfree_handles == 0)
+            break;
+        if (bom->free_handles[bom->cfree_handles - 1] != 0)
+            break;
+        bom->cfree_handles--;
+    }
+}
+/*
+ * Produce a handle for a new buffer.
+ *
+ * *handle is first reset to 0.  Returns -EINVAL when the handle counter has
+ * reached 0xFFFFFFFF.  Otherwise a recycled handle is popped from the free
+ * stack if one exists (and the stack is trimmed), else the counter value is
+ * used and advanced.  The chosen handle is asserted to be non-zero, stored in
+ * *handle, and 0 is returned.
+ */
+static int legacy_new_handle(struct bo_manager_legacy *bom, uint32_t *handle)
+{
+    uint32_t picked;
+
+    *handle = 0;
+    if (bom->nhandle == 0xFFFFFFFF)
+        return -EINVAL;
+
+    if (bom->cfree_handles == 0) {
+        /* nothing to recycle: take the next unused value */
+        picked = bom->nhandle++;
+    } else {
+        bom->cfree_handles -= 1;
+        picked = bom->free_handles[bom->cfree_handles];
+        clean_handles(bom);
+    }
+
+    assert(picked);
+    *handle = picked;
+    return 0;
+}
+
+/*
+ * Give a handle back to the manager.
+ *
+ * Handle 0 is ignored.  If the handle is the most recently issued one, the
+ * counter is rolled back; the free stack is then scanned once from top to
+ * bottom, and every entry that equals the new last-issued value rolls the
+ * counter back further and is turned into a hole (0).  Otherwise the handle
+ * is pushed on the free stack, growing the stack by 0x100 entries when full.
+ *
+ * Returns 0 on success or -ENOMEM if the stack could not be grown, in which
+ * case the manager is left unchanged.
+ */
+static int legacy_free_handle(struct bo_manager_legacy *bom, uint32_t handle)
+{
+    uint32_t *grown;
+    unsigned wanted;
+
+    if (handle == 0)
+        return 0;
+
+    if (handle == (bom->nhandle - 1)) {
+        int idx = bom->cfree_handles - 1;
+
+        bom->nhandle--;
+        while (idx >= 0) {
+            if (bom->free_handles[idx] == (bom->nhandle - 1)) {
+                bom->nhandle--;
+                bom->free_handles[idx] = 0;
+            }
+            idx--;
+        }
+        clean_handles(bom);
+        return 0;
+    }
+
+    if (bom->cfree_handles >= bom->nfree_handles) {
+        /* stack is full: enlarge it before pushing */
+        wanted = bom->nfree_handles + 0x100;
+        grown = (uint32_t*)realloc(bom->free_handles, wanted * 4);
+        if (grown == NULL)
+            return -ENOMEM;
+        bom->nfree_handles = wanted;
+        bom->free_handles = grown;
+    }
+
+    bom->free_handles[bom->cfree_handles++] = handle;
+    return 0;
+}
+
+/*
+ * Refresh boml->current_age.
+ *
+ * For R300- and R600-class screens the value is fetched with the
+ * DRM_RADEON_GETPARAM command (RADEON_PARAM_LAST_CLEAR); a failure is
+ * reported on stderr and terminates the process with exit(1).
+ * For all other screens the value is read with INREG from
+ * RADEON_GUI_SCRATCH_REG3.
+ */
+static void legacy_get_current_age(struct bo_manager_legacy *boml)
+{
+    drm_radeon_getparam_t gp;
+    /* NOTE(review): INREG presumably picks this local up by name; do not rename without checking the macro */
+    unsigned char *RADEONMMIO = NULL;
+    int r;
+
+    if (   IS_R300_CLASS(boml->screen) 
+        || IS_R600_CLASS(boml->screen) ) 
+    {
+    	gp.param = RADEON_PARAM_LAST_CLEAR;
+    	gp.value = (int *)&boml->current_age;
+    	r = drmCommandWriteRead(boml->base.fd, DRM_RADEON_GETPARAM,
+       	                     &gp, sizeof(gp));
+    	if (r) {
+       	 fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__, r);
+         exit(1);
+       }
+    } 
+    else {
+        RADEONMMIO = boml->screen->mmio.map;
+        /* this value is overwritten by the register read on the next line */
+        boml->current_age = boml->screen->scratch[3];
+        boml->current_age = INREG(RADEON_GUI_SCRATCH_REG3);
+    }
+}
+
+/*
+ * Check whether a buffer is still pending, using the manager's cached
+ * current_age (the age is not refreshed here).
+ *
+ * Returns 0 if the buffer has no pending count, or if current_age has reached
+ * the buffer's pending age; in the latter case the buffer is unlinked from the
+ * pending list, one reference is dropped per pending count and cpendings is
+ * decremented.  Returns 1 while the buffer is still pending.
+ */
+static int legacy_is_pending(struct radeon_bo *bo)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+
+    /* non-positive counts are normalised to "not pending" */
+    if (bo_legacy->is_pending <= 0) {
+        bo_legacy->is_pending = 0;
+        return 0;
+    }
+    if (boml->current_age >= bo_legacy->pending) {
+        /* unlink from the pending list; the head's pprev is updated when it points at this buffer */
+        if (boml->pending_bos.pprev == bo_legacy) {
+            boml->pending_bos.pprev = bo_legacy->pprev;
+        }
+        bo_legacy->pprev->pnext = bo_legacy->pnext;
+        if (bo_legacy->pnext) {
+            bo_legacy->pnext->pprev = bo_legacy->pprev;
+        }
+	assert(bo_legacy->is_pending <= bo->cref);
+        /* drop one reference per pending count; stop as soon as unref returns NULL */
+        while (bo_legacy->is_pending--) {
+	    bo = radeon_bo_unref(bo);
+	    if (!bo)
+	      break;
+        }
+        /* NOTE(review): a NULL result presumably means the buffer was destroyed, so bo_legacy is only touched when bo is still set */
+	if (bo)
+	  bo_legacy->is_pending = 0;
+        boml->cpendings--;
+        return 0;
+    }
+    return 1;
+}
+
+/*
+ * Block until the buffer is no longer pending.  Returns 0 immediately when
+ * the buffer has no pending count; otherwise the current age is refreshed
+ * and re-checked, sleeping 10 microseconds between attempts.  Always
+ * returns 0.
+ */
+static int legacy_wait_pending(struct radeon_bo *bo)
+{
+    struct bo_manager_legacy *mgr = (struct bo_manager_legacy *)bo->bom;
+
+    if (((struct bo_legacy *)bo)->is_pending == 0)
+        return 0;
+
+    /* FIXME: a GPU lockup turns this into an endless userspace busy loop */
+    for (;;) {
+        legacy_get_current_age(mgr);
+        if (!legacy_is_pending(bo))
+            break;
+        usleep(10);
+    }
+    return 0;
+}
+
+/*
+ * Refresh the current age and run legacy_is_pending() over every buffer on
+ * the pending list so that retired buffers get unlinked.  When debug is
+ * non-zero, each visited buffer is first reported on stderr.
+ */
+void legacy_track_pending(struct radeon_bo_manager *bom, int debug)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy*) bom;
+    struct bo_legacy *cur;
+    struct bo_legacy *following;
+
+    legacy_get_current_age(boml);
+    for (cur = boml->pending_bos.pnext; cur != NULL; cur = following) {
+        if (debug) {
+            fprintf(stderr,"pending %p %d %d %d\n", cur, cur->base.size,
+                    boml->current_age, cur->pending);
+        }
+        /* fetch the successor first: the check below may unlink cur */
+        following = cur->pnext;
+        (void)legacy_is_pending(&cur->base);
+    }
+}
+
+/*
+ * Wait for the first buffer on the pending list.  Returns -1 when the list
+ * is empty, 0 after the wait has finished.
+ */
+static int legacy_wait_any_pending(struct bo_manager_legacy *boml)
+{
+    struct bo_legacy *head;
+
+    legacy_get_current_age(boml);
+    head = boml->pending_bos.pnext;
+    if (head == NULL)
+        return -1;
+
+    legacy_wait_pending(&head->base);
+    return 0;
+}
+
+/*
+ * Destroy the texture object of every buffer that currently has one and is
+ * marked validated, clearing the buffer's tobj and validated fields.
+ *
+ * The walk stops at a NULL link as well as at the list head: bo_allocate(),
+ * bo_free() and bo_open() all treat the buffer list as NULL-terminated, so
+ * comparing against the head alone would run off the end of the list.
+ */
+static void legacy_kick_all_buffers(struct bo_manager_legacy *boml)
+{
+    struct bo_legacy *legacy;
+
+    for (legacy = boml->bos.next;
+         legacy != NULL && legacy != &boml->bos;
+         legacy = legacy->next) {
+        if (legacy->tobj && legacy->validated) {
+            driDestroyTextureObject(&legacy->tobj->base);
+            legacy->tobj = NULL;
+            legacy->validated = 0;
+        }
+    }
+}
+
+/*
+ * Allocate a zeroed legacy buffer descriptor, fill in its generic fields and
+ * insert it at the front of the manager's buffer list.
+ *
+ * size is rounded up to a multiple of the page size.  No backing storage and
+ * no handle are assigned here.  Returns NULL if the descriptor cannot be
+ * allocated.
+ */
+static struct bo_legacy *bo_allocate(struct bo_manager_legacy *boml,
+                                     uint32_t size,
+                                     uint32_t alignment,
+                                     uint32_t domains,
+                                     uint32_t flags)
+{
+    /* page size minus one, looked up on first use */
+    static int pgsize;
+    struct bo_legacy *node;
+    struct radeon_bo *base;
+
+    if (pgsize == 0)
+        pgsize = getpagesize() - 1;
+
+    size = (size + pgsize) & ~pgsize;
+
+    node = (struct bo_legacy*)calloc(1, sizeof(struct bo_legacy));
+    if (!node)
+        return NULL;
+
+    base = &node->base;
+    base->bom = (struct radeon_bo_manager*)boml;
+    base->handle = 0;
+    base->size = size;
+    base->alignment = alignment;
+    base->domains = domains;
+    base->flags = flags;
+    base->ptr = NULL;
+
+    node->map_count = 0;
+    node->pnext = NULL;
+    node->pprev = NULL;
+
+    /* link in right after the list head */
+    node->prev = &boml->bos;
+    node->next = boml->bos.next;
+    if (node->next)
+        node->next->prev = node;
+    boml->bos.next = node;
+
+    return node;
+}
+
+/*
+ * Request GART memory for a buffer with the DRM_RADEON_ALLOC command.
+ *
+ * The request size is the buffer size rounded up to 4 KiB.  On success the
+ * buffer's private ptr and offset are derived from the returned region
+ * offset, bo->size becomes the rounded size, the manager's DMA accounting is
+ * updated and 0 is returned.  On failure the private ptr is set to NULL and
+ * the command's error code is returned.
+ */
+static int bo_dma_alloc(struct radeon_bo *bo)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct bo_legacy *legacy = (struct bo_legacy*)bo;
+    drm_radeon_mem_alloc_t request;
+    unsigned rounded;
+    int gart_offset;
+    int ret;
+
+    /* align size on 4Kb */
+    rounded = (((4 * 1024) - 1) + bo->size) & ~((4 * 1024) - 1);
+
+    request.region = RADEON_MEM_REGION_GART;
+    request.alignment = legacy->base.alignment;
+    request.size = rounded;
+    request.region_offset = &gart_offset;
+
+    ret = drmCommandWriteRead(bo->bom->fd, DRM_RADEON_ALLOC,
+                              &request, sizeof(request));
+    if (ret != 0) {
+        /* a NULL ptr marks the failed allocation for bo_dma_free() */
+        legacy->ptr = NULL;
+        return ret;
+    }
+
+    legacy->ptr = boml->screen->gartTextures.map + gart_offset;
+    legacy->offset = boml->screen->gart_texture_offset + gart_offset;
+    bo->size = rounded;
+    boml->dma_alloc_size += rounded;
+    boml->dma_buf_count++;
+    return 0;
+}
+
+/*
+ * Return a buffer's GART memory with the DRM_RADEON_FREE command.
+ *
+ * Does nothing (returns 0) when the private ptr is NULL, i.e. the earlier
+ * allocation failed.  On command failure the error is printed to stderr and
+ * returned, and the accounting is left untouched.  On success the manager's
+ * DMA size and buffer count are reduced and 0 is returned.
+ */
+static int bo_dma_free(struct radeon_bo *bo)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct bo_legacy *legacy = (struct bo_legacy*)bo;
+    drm_radeon_mem_free_t release;
+    int ret;
+
+    /* ptr is set to NULL if dma allocation failed */
+    if (legacy->ptr == NULL)
+        return 0;
+
+    legacy_get_current_age(boml);
+
+    release.region = RADEON_MEM_REGION_GART;
+    /* convert the stored offset back to a region-relative offset */
+    release.region_offset  = legacy->offset;
+    release.region_offset -= boml->screen->gart_texture_offset;
+
+    ret = drmCommandWrite(boml->base.fd, DRM_RADEON_FREE,
+                          &release, sizeof(release));
+    if (ret != 0) {
+        fprintf(stderr, "Failed to free bo[%p] at %08x\n",
+                &legacy->base, release.region_offset);
+        fprintf(stderr, "ret = %s\n", strerror(-ret));
+        return ret;
+    }
+
+    boml->dma_alloc_size -= legacy->base.size;
+    boml->dma_buf_count--;
+    return 0;
+}
+
+/*
+ * Unlink a legacy buffer from the manager's buffer list and release it.
+ *
+ * NULL is accepted and ignored.  For non-static buffers the handle is given
+ * back and the storage is released: GTT-domain buffers go through
+ * bo_dma_free(); other buffers have their texture object (if any) destroyed
+ * and their private backing store freed.  The descriptor is cleared before
+ * being freed.
+ */
+static void bo_free(struct bo_legacy *bo_legacy)
+{
+    struct bo_manager_legacy *boml;
+
+    if (bo_legacy == NULL) {
+        return;
+    }
+    boml = (struct bo_manager_legacy *)bo_legacy->base.bom;
+
+    /* unlink from the list of all buffers */
+    bo_legacy->prev->next = bo_legacy->next;
+    if (bo_legacy->next) {
+        bo_legacy->next->prev = bo_legacy->prev;
+    }
+
+    if (!bo_legacy->static_bo) {
+        legacy_free_handle(boml, bo_legacy->base.handle);
+        if (bo_legacy->base.domains & RADEON_GEM_DOMAIN_GTT) {
+            /* dma buffers */
+            bo_dma_free(&bo_legacy->base);
+        } else {
+            /* a buffer may never have received a texture object (e.g. when
+             * bo_open() fails right after allocation); do not form a member
+             * address through a NULL pointer in that case */
+            if (bo_legacy->tobj != NULL) {
+                driDestroyTextureObject(&bo_legacy->tobj->base);
+                bo_legacy->tobj = NULL;
+            }
+            /* free backing store */
+            free(bo_legacy->ptr);
+        }
+    }
+    memset(bo_legacy, 0 , sizeof(struct bo_legacy));
+    free(bo_legacy);
+}
+
+/*
+ * Look up or create a legacy buffer object.
+ *
+ * handle != 0: search the manager's bo list for that handle and return the
+ *              bo with an additional reference, or NULL if it is unknown.
+ * handle == 0: create a new bo of the given size/alignment/domains/flags.
+ *              GTT bos get dma memory (retrying while pending buffers can
+ *              still be waited for), all other bos get a malloc'ed backing
+ *              store.
+ *
+ * Returns the referenced bo, or NULL on any failure.
+ */
+static struct radeon_bo *bo_open(struct radeon_bo_manager *bom,
+                                 uint32_t handle,
+                                 uint32_t size,
+                                 uint32_t alignment,
+                                 uint32_t domains,
+                                 uint32_t flags)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bom;
+    struct bo_legacy *bo_legacy;
+    int r;
+
+    if (handle) {
+        for (bo_legacy = boml->bos.next; bo_legacy; bo_legacy = bo_legacy->next) {
+            if (bo_legacy->base.handle == handle) {
+                radeon_bo_ref(&(bo_legacy->base));
+                return (struct radeon_bo*)bo_legacy;
+            }
+        }
+        return NULL;
+    }
+
+    bo_legacy = bo_allocate(boml, size, alignment, domains, flags);
+    if (bo_legacy == NULL) {
+        /* bo_allocate can fail (other callers check for NULL as well) */
+        return NULL;
+    }
+    bo_legacy->static_bo = 0;
+    r = legacy_new_handle(boml, &bo_legacy->base.handle);
+    if (r) {
+        bo_free(bo_legacy);
+        return NULL;
+    }
+
+    if (bo_legacy->base.domains & RADEON_GEM_DOMAIN_GTT) {
+        /* dma buffers: keep retrying for as long as waiting on a pending
+         * buffer may release some GART memory */
+        for (;;) {
+            legacy_track_pending(&boml->base, 0);
+            r = bo_dma_alloc(&(bo_legacy->base));
+            if (r == 0) {
+                break;
+            }
+            if (legacy_wait_any_pending(boml) == -1) {
+                bo_free(bo_legacy);
+                return NULL;
+            }
+        }
+    } else {
+        bo_legacy->ptr = malloc(bo_legacy->base.size);
+        if (bo_legacy->ptr == NULL) {
+            bo_free(bo_legacy);
+            return NULL;
+        }
+    }
+    radeon_bo_ref(&(bo_legacy->base));
+
+    return (struct radeon_bo*)bo_legacy;
+}
+
+/* Reference hook installed in bo_legacy_funcs: a no-op, the legacy
+ * backend does no extra work when a reference is taken. */
+static void bo_ref(struct radeon_bo *bo)
+{
+}
+
+/*
+ * Unreference hook: once the reference count has dropped to zero (or
+ * below) the bo is unlinked from the bo list and, unless it is still on
+ * the pending list, destroyed.
+ *
+ * Returns bo while references remain, NULL once it has been released.
+ */
+static struct radeon_bo *bo_unref(struct radeon_bo *bo)
+{
+    struct bo_legacy *legacy = (struct bo_legacy*)bo;
+
+    if (bo->cref > 0) {
+        return bo;
+    }
+
+    /* last reference gone: remove from the bo list */
+    legacy->prev->next = legacy->next;
+    if (legacy->next != NULL) {
+        legacy->next->prev = legacy->prev;
+    }
+
+    /* a pending bo is not freed here */
+    if (!legacy->is_pending) {
+        bo_free(legacy);
+    }
+    return NULL;
+}
+
+/*
+ * Map a buffer object for CPU access.
+ *
+ * Waits until the bo is no longer pending, marks it unvalidated and dirty,
+ * bumps the map count and publishes the CPU pointer in bo->ptr.
+ * Always returns 0.
+ */
+static int bo_map(struct radeon_bo *bo, int write)
+{
+    struct bo_legacy *legacy = (struct bo_legacy*)bo;
+    struct bo_manager_legacy *mgr = (struct bo_manager_legacy *)bo->bom;
+
+    legacy_wait_pending(bo);
+
+    legacy->validated = 0;
+    legacy->dirty = 1;
+    legacy->map_count++;
+    bo->ptr = legacy->ptr;
+
+    if (bo->domains & RADEON_GEM_DOMAIN_GTT) {
+        return 0;
+    }
+
+    /* Read the first pixel in the frame buffer.  This should
+     * be a noop, right?  In fact without this conform fails as reading
+     * from the framebuffer sometimes produces old results -- the
+     * on-card read cache gets mixed up and doesn't notice that the
+     * framebuffer has been updated.
+     *
+     * Note that we should probably be reading some otherwise unused
+     * region of VRAM, otherwise we might get incorrect results when
+     * reading pixels from the top left of the screen.
+     *
+     * I found this problem on an R420 with glean's texCube test.
+     * Note that the R200 span code also *writes* the first pixel in the
+     * framebuffer, but I've found this to be unnecessary.
+     *  -- Nicolai Hähnle, June 2008
+     */
+    {
+        volatile int *first_pixel = (int*)mgr->screen->driScreen->pFB;
+        int scratch;
+
+        /* the volatile load is the point; the value itself is unused */
+        scratch = *first_pixel;
+    }
+
+    return 0;
+}
+
+/*
+ * Drop one mapping of a buffer object.  bo->ptr is cleared once the map
+ * count is no longer positive.  Always returns 0.
+ */
+static int bo_unmap(struct radeon_bo *bo)
+{
+    struct bo_legacy *legacy = (struct bo_legacy*)bo;
+
+    legacy->map_count--;
+    if (legacy->map_count <= 0) {
+        bo->ptr = NULL;
+    }
+
+    return 0;
+}
+
+/*
+ * Report whether a bo is still pending.
+ *
+ * *domain is set to RADEON_GEM_DOMAIN_GTT for GTT bos and to
+ * RADEON_GEM_DOMAIN_CPU for everything else.
+ * Returns -EBUSY while the bo is pending, 0 otherwise.
+ */
+static int bo_is_busy(struct radeon_bo *bo, uint32_t *domain)
+{
+    *domain = (bo->domains & RADEON_GEM_DOMAIN_GTT) ? RADEON_GEM_DOMAIN_GTT
+                                                    : RADEON_GEM_DOMAIN_CPU;
+
+    return legacy_is_pending(bo) ? -EBUSY : 0;
+}
+
+/* Return the static_bo flag of a legacy buffer object. */
+static int bo_is_static(struct radeon_bo *bo)
+{
+    return ((struct bo_legacy*)bo)->static_bo;
+}
+
+/*
+ * Function table installed as bom->base.funcs by
+ * radeon_bo_manager_legacy_ctor().  The initializer is positional, so the
+ * entry order must match the member order of struct radeon_bo_funcs.
+ * NOTE(review): the NULL entries are presumably optional hooks this
+ * backend does not implement -- confirm against struct radeon_bo_funcs.
+ */
+static struct radeon_bo_funcs bo_legacy_funcs = {
+    bo_open,
+    bo_ref,
+    bo_unref,
+    bo_map,
+    bo_unmap,
+    NULL,
+    bo_is_static,
+    NULL,
+    NULL,
+    bo_is_busy
+};
+
+/*
+ * Make sure a VRAM bo has room in the texture heap and that the card's
+ * copy matches the CPU-side backing store.
+ *
+ * On first use a texture-heap object is created for the bo and space is
+ * requested from the heap.  If that fails while buffers are pending, the
+ * pending list is processed and the allocation retried (at most three
+ * attempts in total).  If no space can be obtained the heap object is
+ * released again and -1 is returned instead of going on with a NULL
+ * memory block.
+ *
+ * A dirty bo is then uploaded with DRM_RADEON_TEXTURE, repeating the
+ * command for as long as it reports -EAGAIN.
+ *
+ * soffset and eoffset are not used by this function.
+ *
+ * Returns 0 on success, -1 when no texture heap space is available.
+ */
+static int bo_vram_validate(struct radeon_bo *bo,
+                            uint32_t *soffset,
+                            uint32_t *eoffset)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+    int r;
+    int retry_count = 0;
+
+    if (!bo_legacy->tobj) {
+        bo_legacy->tobj = CALLOC(sizeof(struct bo_legacy_texture_object));
+        if (!bo_legacy->tobj) {
+            return -1;
+        }
+        bo_legacy->tobj->parent = bo_legacy;
+        make_empty_list(&bo_legacy->tobj->base);
+        bo_legacy->tobj->base.totalSize = bo->size;
+
+        for (;;) {
+            r = driAllocateTexture(&boml->texture_heap, 1,
+                                   &bo_legacy->tobj->base);
+            if (r == 0) {
+                break;
+            }
+            /* Retrying only makes sense while pending buffers may still
+             * be retired and give their heap space back. */
+            if (boml->cpendings) {
+                legacy_track_pending(&boml->base, 0);
+                retry_count++;
+                if (retry_count <= 2) {
+                    continue;
+                }
+            }
+            free(bo_legacy->tobj);
+            bo_legacy->tobj = NULL;
+            fprintf(stderr, "Ouch! vram_validate failed %d\n", r);
+            return -1;
+        }
+        bo_legacy->offset = boml->texture_offset +
+                            bo_legacy->tobj->base.memBlock->ofs;
+        bo_legacy->dirty = 1;
+    }
+
+    assert(bo_legacy->tobj->base.memBlock);
+
+    driUpdateTextureLRU(&bo_legacy->tobj->base);
+
+    if (bo_legacy->dirty || bo_legacy->tobj->base.dirty_images[0]) {
+        drm_radeon_texture_t tex;
+        drm_radeon_tex_image_t tmp;
+        int ret;
+
+        tex.offset = bo_legacy->offset;
+        tex.image = &tmp;
+        assert(!(tex.offset & 1023));
+
+        tmp.x = 0;
+        tmp.y = 0;
+        tmp.data = bo_legacy->ptr;
+        tex.format = RADEON_TXFORMAT_ARGB8888;
+
+        if (IS_R600_CLASS(boml->screen)) {
+            /* R600 class: one row, width and pitch are the bo size */
+            tmp.width = bo->size;
+            tmp.height = 1;
+            tex.pitch = bo->size;
+        } else {
+            /* Copy to VRAM using a blit.
+             * All memory is 4K aligned. We're using 1024 pixels wide blits.
+             */
+            if (bo->size < 4096) {
+                tmp.width = (bo->size + 3) / 4;
+                tmp.height = 1;
+            } else {
+                tmp.width = 1024;
+                tmp.height = (bo->size + 4095) / 4096;
+            }
+            tex.pitch = MAX2(tmp.width / 16, 1);
+        }
+        tex.width = tmp.width;
+        tex.height = tmp.height;
+
+        do {
+            ret = drmCommandWriteRead(bo->bom->fd,
+                                      DRM_RADEON_TEXTURE,
+                                      &tex,
+                                      sizeof(drm_radeon_texture_t));
+            if (ret) {
+                if (RADEON_DEBUG & RADEON_IOCTL)
+                    fprintf(stderr, "DRM_RADEON_TEXTURE:  again!\n");
+                usleep(1);
+            }
+        } while (ret == -EAGAIN);
+
+        bo_legacy->dirty = 0;
+        bo_legacy->tobj->base.dirty_images[0] = 0;
+    }
+    return 0;
+}
+
+/*
+ *  radeon_bo_legacy_validate -
+ *  Validates a bo for use by the card and reports the range it occupies
+ *  in *soffset / *eoffset (offset and offset + size).
+ *  returns:
+ *  0 - all good
+ *  -EINVAL - mapped buffer can't be validated
+ *  -EAGAIN - restart validation we've kicked all the buffers out
+ *
+ *  The retry counter that used to live here was a local that was reset on
+ *  every call, so its "second failure" abort could never trigger; it has
+ *  been dropped.  Callers are responsible for bounding their -EAGAIN
+ *  restarts.
+ */
+int radeon_bo_legacy_validate(struct radeon_bo *bo,
+                              uint32_t *soffset,
+                              uint32_t *eoffset)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+    int r;
+
+    if (bo_legacy->map_count) {
+        fprintf(stderr, "bo(%p, %d) is mapped (%d) can't valide it.\n",
+                bo, bo->size, bo_legacy->map_count);
+        return -EINVAL;
+    }
+
+    /* static and already validated bos only need their range reported */
+    if (!bo_legacy->static_bo && !bo_legacy->validated) {
+        if (!(bo->domains & RADEON_GEM_DOMAIN_GTT)) {
+            r = bo_vram_validate(bo, soffset, eoffset);
+            if (r) {
+                /* no room: retire pending buffers, evict everything and
+                 * ask the caller to start over */
+                legacy_track_pending(&boml->base, 0);
+                legacy_kick_all_buffers(boml);
+                return -EAGAIN;
+            }
+        }
+        bo_legacy->validated = 1;
+    }
+
+    *soffset = bo_legacy->offset;
+    *eoffset = bo_legacy->offset + bo->size;
+
+    return 0;
+}
+
+/*
+ * Mark a bo as pending.
+ *
+ * Stores the pending value on the bo, bumps its is_pending counter and
+ * takes a reference.  The first time a bo becomes pending it is also
+ * appended to the tail of the manager's pending list.
+ */
+void radeon_bo_legacy_pending(struct radeon_bo *bo, uint32_t pending)
+{
+    struct bo_legacy *legacy = (struct bo_legacy*)bo;
+    struct bo_manager_legacy *mgr = (struct bo_manager_legacy *)bo->bom;
+    struct bo_legacy *tail;
+
+    legacy->pending = pending;
+    legacy->is_pending++;
+    radeon_bo_ref(bo);
+
+    if (legacy->is_pending <= 1) {
+        /* add to pending list */
+        tail = mgr->pending_bos.pprev;
+        legacy->pprev = tail;
+        legacy->pnext = NULL;
+        tail->pnext = legacy;
+        mgr->pending_bos.pprev = legacy;
+        mgr->cpendings++;
+    }
+}
+
+/*
+ * Destroy a legacy bo manager: frees every bo on its list, the texture
+ * heap, the free-handle table and the manager itself.  NULL is accepted.
+ */
+void radeon_bo_manager_legacy_dtor(struct radeon_bo_manager *bom)
+{
+    struct bo_manager_legacy *mgr = (struct bo_manager_legacy *)bom;
+    struct bo_legacy *cur;
+    struct bo_legacy *following;
+
+    if (!bom) {
+        return;
+    }
+
+    /* bo_free() releases the node, so fetch the successor first */
+    for (cur = mgr->bos.next; cur != NULL; cur = following) {
+        following = cur->next;
+        bo_free(cur);
+    }
+
+    driDestroyTextureHeap(mgr->texture_heap);
+    free(mgr->free_handles);
+    free(mgr);
+}
+
+/*
+ * Create a static VRAM bo describing a fixed region of the framebuffer.
+ *
+ * offset is relative to the framebuffer: the card offset becomes
+ * offset + fb_location and the CPU pointer is pFB + offset.  The card
+ * offset doubles as the bo handle, and nhandle is pushed past it when the
+ * handle is larger than nhandle.
+ *
+ * Returns the referenced bo, or NULL if it could not be allocated.
+ */
+static struct bo_legacy *radeon_legacy_bo_alloc_static(struct bo_manager_legacy *bom,
+                                                       int size,
+                                                       uint32_t offset)
+{
+    struct bo_legacy *static_bo =
+        bo_allocate(bom, size, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+
+    if (!static_bo) {
+        return NULL;
+    }
+
+    static_bo->static_bo = 1;
+    static_bo->offset = offset + bom->fb_location;
+    static_bo->base.handle = static_bo->offset;
+    static_bo->ptr = bom->screen->driScreen->pFB + offset;
+
+    if (static_bo->base.handle > bom->nhandle) {
+        bom->nhandle = static_bo->base.handle + 1;
+    }
+
+    radeon_bo_ref(&(static_bo->base));
+    return static_bo;
+}
+
+/*
+ * Create the legacy bo manager for a screen.
+ *
+ * Sets up the texture heap, the bo and pending lists and the free-handle
+ * table, then registers static bos for the front, back and depth
+ * buffers.  When the sarea reports tiling, front and back are flagged
+ * macro tiled and depth is flagged macro and micro tiled.
+ *
+ * Returns the new manager, or NULL if any allocation failed (everything
+ * created so far is torn down through radeon_bo_manager_legacy_dtor()).
+ */
+struct radeon_bo_manager *radeon_bo_manager_legacy_ctor(struct radeon_screen *scrn)
+{
+    struct bo_manager_legacy *mgr;
+    struct bo_legacy *fb_bo;
+    unsigned fb_size;
+
+    mgr = (struct bo_manager_legacy*)
+          calloc(1, sizeof(struct bo_manager_legacy));
+    if (!mgr) {
+        return NULL;
+    }
+
+    make_empty_list(&mgr->texture_swapped);
+
+    mgr->texture_heap = driCreateTextureHeap(0,
+                                             mgr,
+                                             scrn->texSize[0],
+                                             12,
+                                             RADEON_NR_TEX_REGIONS,
+                                             (drmTextureRegionPtr)scrn->sarea->tex_list[0],
+                                             &scrn->sarea->tex_age[0],
+                                             &mgr->texture_swapped,
+                                             sizeof(struct bo_legacy_texture_object),
+                                             &bo_legacy_tobj_destroy);
+    mgr->texture_offset = scrn->texOffset[0];
+
+    mgr->base.funcs = &bo_legacy_funcs;
+    mgr->base.fd = scrn->driScreen->fd;
+
+    /* empty bo list, empty pending list (tail points at the list head) */
+    mgr->bos.next = NULL;
+    mgr->bos.prev = NULL;
+    mgr->pending_bos.pprev = &mgr->pending_bos;
+    mgr->pending_bos.pnext = NULL;
+
+    mgr->screen = scrn;
+    mgr->fb_location = scrn->fbLocation;
+
+    /* handle bookkeeping */
+    mgr->nhandle = 1;
+    mgr->cfree_handles = 0;
+    mgr->nfree_handles = 0x400;
+    mgr->free_handles = (uint32_t*)malloc(mgr->nfree_handles * 4);
+    if (!mgr->free_handles) {
+        radeon_bo_manager_legacy_dtor((struct radeon_bo_manager*)mgr);
+        return NULL;
+    }
+
+    /* biggest framebuffer size */
+    fb_size = 4096*4096*4;
+
+    /* allocate front */
+    fb_bo = radeon_legacy_bo_alloc_static(mgr, fb_size, mgr->screen->frontOffset);
+    if (fb_bo == NULL) {
+        radeon_bo_manager_legacy_dtor((struct radeon_bo_manager*)mgr);
+        return NULL;
+    }
+    if (scrn->sarea->tiling_enabled) {
+        fb_bo->base.flags = RADEON_BO_FLAGS_MACRO_TILE;
+    }
+
+    /* allocate back */
+    fb_bo = radeon_legacy_bo_alloc_static(mgr, fb_size, mgr->screen->backOffset);
+    if (fb_bo == NULL) {
+        radeon_bo_manager_legacy_dtor((struct radeon_bo_manager*)mgr);
+        return NULL;
+    }
+    if (scrn->sarea->tiling_enabled) {
+        fb_bo->base.flags = RADEON_BO_FLAGS_MACRO_TILE;
+    }
+
+    /* allocate depth */
+    fb_bo = radeon_legacy_bo_alloc_static(mgr, fb_size, mgr->screen->depthOffset);
+    if (fb_bo == NULL) {
+        radeon_bo_manager_legacy_dtor((struct radeon_bo_manager*)mgr);
+        return NULL;
+    }
+    fb_bo->base.flags = 0;
+    if (scrn->sarea->tiling_enabled) {
+        fb_bo->base.flags |= RADEON_BO_FLAGS_MACRO_TILE;
+        fb_bo->base.flags |= RADEON_BO_FLAGS_MICRO_TILE;
+    }
+
+    return (struct radeon_bo_manager*)mgr;
+}
+
+/* Run DRI_AGE_TEXTURES on the legacy manager's texture heap. */
+void radeon_bo_legacy_texture_age(struct radeon_bo_manager *bom)
+{
+    struct bo_manager_legacy *mgr = (struct bo_manager_legacy *)bom;
+
+    DRI_AGE_TEXTURES(mgr->texture_heap);
+}
+
+/*
+ * Size a bo contributes to relocation accounting: 0 for static bos and
+ * for GTT bos, bo->size for everything else.
+ */
+unsigned radeon_bo_legacy_relocs_size(struct radeon_bo *bo)
+{
+    const struct bo_legacy *legacy = (struct bo_legacy*)bo;
+
+    if (!legacy->static_bo && !(bo->domains & RADEON_GEM_DOMAIN_GTT)) {
+        return bo->size;
+    }
+    return 0;
+}
+
+/*
+ * Fake up a bo for things like texture image_override.
+ *
+ * Unlike the static framebuffer bos, the offset passed in already
+ * includes fb_location; it is stored unchanged as the card offset and as
+ * the handle, and fb_location is subtracted again to derive the CPU
+ * pointer into pFB.
+ *
+ * Returns the referenced bo, or NULL if it could not be allocated.
+ */
+struct radeon_bo *radeon_legacy_bo_alloc_fake(struct radeon_bo_manager *bom,
+                                              int size,
+                                              uint32_t offset)
+{
+    struct bo_manager_legacy *mgr = (struct bo_manager_legacy *)bom;
+    struct bo_legacy *fake = bo_allocate(mgr, size, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+
+    if (!fake) {
+        return NULL;
+    }
+
+    fake->static_bo = 1;
+    fake->offset = offset;
+    fake->base.handle = fake->offset;
+    fake->ptr = mgr->screen->driScreen->pFB + (offset - mgr->fb_location);
+
+    if (fake->base.handle > mgr->nhandle) {
+        mgr->nhandle = fake->base.handle + 1;
+    }
+
+    radeon_bo_ref(&(fake->base));
+    return &(fake->base);
+}
+
diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_legacy.h b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.h
new file mode 100644
index 0000000000..2cf15dfaff
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.h
@@ -0,0 +1,50 @@
+/* 
+ * Copyright © 2008 Nicolai Haehnle
+ * Copyright © 2008 Jérôme Glisse
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Aapo Tahkola <aet@rasterburn.org>
+ *      Nicolai Haehnle <prefect_@gmx.net>
+ *      Jérôme Glisse <glisse@freedesktop.org>
+ */
+#ifndef RADEON_BO_LEGACY_H
+#define RADEON_BO_LEGACY_H
+
+#include "radeon_screen.h"
+
+/* Mark bo as pending with the given pending value; takes a reference and
+ * queues the bo on the manager's pending list the first time. */
+void radeon_bo_legacy_pending(struct radeon_bo *bo, uint32_t pending);
+/* Validate bo and report its card range in *soffset / *eoffset.
+ * Returns 0, -EINVAL (bo is mapped) or -EAGAIN (restart validation). */
+int radeon_bo_legacy_validate(struct radeon_bo *bo,
+                              uint32_t *soffset,
+                              uint32_t *eoffset);
+/* Create the legacy bo manager for a screen; NULL on failure. */
+struct radeon_bo_manager *radeon_bo_manager_legacy_ctor(struct radeon_screen *scrn);
+/* Destroy a manager created by the ctor, including all its bos; NULL ok. */
+void radeon_bo_manager_legacy_dtor(struct radeon_bo_manager *bom);
+/* Age the textures of the manager's texture heap. */
+void radeon_bo_legacy_texture_age(struct radeon_bo_manager *bom);
+/* 0 for static and GTT bos, the bo size otherwise. */
+unsigned radeon_bo_legacy_relocs_size(struct radeon_bo *bo);
+/* Fake up a static bo at a card offset that already includes fb_location. */
+struct radeon_bo *radeon_legacy_bo_alloc_fake(struct radeon_bo_manager *bom,
+					      int size,
+	                                      uint32_t offset);
+/* NOTE(review): implemented in radeon_bo_legacy.c; appears to process the
+ * manager's pending bo list -- confirm the meaning of debug there. */
+void legacy_track_pending(struct radeon_bo_manager *bom, int debug);
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h b/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h
new file mode 100644
index 0000000000..4520a7d7d4
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h
@@ -0,0 +1,99 @@
+#ifndef RADEON_CS_WRAPPER_H
+#define RADEON_CS_WRAPPER_H
+
+#ifdef HAVE_LIBDRM_RADEON
+
+#include "radeon_bo.h"
+#include "radeon_bo_gem.h"
+#include "radeon_cs.h"
+#include "radeon_cs_gem.h"
+
+#else
+#include <stdint.h>
+
+/*
+ * Fallback definitions for builds without libdrm_radeon
+ * (HAVE_LIBDRM_RADEON undefined).  Each group is only defined when the
+ * DRM headers in use have not already provided it.
+ */
+#define RADEON_GEM_DOMAIN_CPU 0x1   /* Cached CPU domain */
+#define RADEON_GEM_DOMAIN_GTT 0x2   /* GTT or cache flushed */
+#define RADEON_GEM_DOMAIN_VRAM 0x4  /* VRAM domain */
+
+#define RADEON_TILING_MACRO 0x1
+#define RADEON_TILING_MICRO 0x2
+#define RADEON_TILING_SWAP 0x4
+#define RADEON_TILING_SURFACE 0x8 /* this object requires a surface
+				   * when mapped - i.e. front buffer */
+
+/* to be used to build locally in mesa with no libdrm bits */
+#include "../radeon/radeon_bo_drm.h"
+#include "../radeon/radeon_cs_drm.h"
+
+#ifndef DRM_RADEON_GEM_INFO
+#define DRM_RADEON_GEM_INFO 0x1c
+
+/* Local copy of the GEM info ioctl payload. */
+struct drm_radeon_gem_info {
+        uint64_t gart_size;
+        uint64_t vram_size;
+        uint64_t vram_visible;
+};
+
+/* Local copy of the info ioctl payload.
+ * NOTE(review): value is 32 bits wide here; verify the layout against the
+ * kernel's radeon_drm.h before using it with a real ioctl. */
+struct drm_radeon_info {
+	uint32_t request;
+	uint32_t pad;
+	uint32_t value;
+};
+#endif
+
+#ifndef RADEON_PARAM_DEVICE_ID
+#define RADEON_PARAM_DEVICE_ID 16
+#endif
+
+#ifndef RADEON_PARAM_NUM_Z_PIPES
+#define RADEON_PARAM_NUM_Z_PIPES 17
+#endif
+
+/* NOTE(review): the three RADEON_INFO_* fallbacks below all expand to 0,
+ * so the requests are indistinguishable when these fallbacks are active;
+ * check the intended request ids against the kernel header. */
+#ifndef RADEON_INFO_DEVICE_ID
+#define RADEON_INFO_DEVICE_ID 0
+#endif
+#ifndef RADEON_INFO_NUM_GB_PIPES
+#define RADEON_INFO_NUM_GB_PIPES 0
+#endif
+
+#ifndef RADEON_INFO_NUM_Z_PIPES
+#define RADEON_INFO_NUM_Z_PIPES 0
+#endif
+
+/* NOTE(review): confirm this ioctl number against the kernel header. */
+#ifndef DRM_RADEON_INFO
+#define DRM_RADEON_INFO 0x1
+#endif
+
+
+/*
+ * Stand-ins for the GEM entry points, compiled only when
+ * HAVE_LIBDRM_RADEON is not defined.  They do nothing and return 0 / NULL.
+ */
+
+/* Stub: always returns 0. */
+static inline uint32_t radeon_gem_name_bo(struct radeon_bo *dummy)
+{
+  return 0;
+}
+
+/* Stub: no GEM bo manager is available, always returns NULL. */
+static inline void *radeon_bo_manager_gem_ctor(int fd)
+{
+  return NULL;
+}
+
+/* Stub: nothing to destroy. */
+static inline void radeon_bo_manager_gem_dtor(void *dummy)
+{
+}
+
+/* Stub: no GEM cs manager is available, always returns NULL. */
+static inline void *radeon_cs_manager_gem_ctor(int fd)
+{
+  return NULL;
+}
+
+/* Stub: nothing to destroy. */
+static inline void radeon_cs_manager_gem_dtor(void *dummy)
+{
+}
+
+/* Stub: prints nothing. */
+static inline void radeon_tracker_print(void *ptr, int io)
+{
+}
+#endif
+
+#include "radeon_bo_legacy.h"
+#include "radeon_cs_legacy.h"
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c b/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c
new file mode 100644
index 0000000000..a24b6dac26
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_buffer_objects.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright 2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_buffer_objects.h"
+
+#include "main/imports.h"
+#include "main/mtypes.h"
+#include "main/bufferobj.h"
+
+#include "radeon_common.h"
+
+/**
+ * Downcast a Mesa buffer object to the radeon buffer object that embeds
+ * it.  This is a plain pointer cast; no type check is performed.
+ */
+struct radeon_buffer_object *
+get_radeon_buffer_object(struct gl_buffer_object *obj)
+{
+    return (struct radeon_buffer_object *) obj;
+}
+
+/**
+ * Allocate and initialize a radeon buffer object with no backing bo.
+ * Installed as ctx->Driver.NewBufferObject.
+ * \return the embedded gl_buffer_object, or NULL if out of memory
+ */
+static struct gl_buffer_object *
+radeonNewBufferObject(GLcontext * ctx,
+                      GLuint name,
+                      GLenum target)
+{
+    struct radeon_buffer_object *obj = CALLOC_STRUCT(radeon_buffer_object);
+
+    /* do not initialize through a NULL pointer when allocation fails */
+    if (obj == NULL)
+        return NULL;
+
+    _mesa_initialize_buffer_object(&obj->Base, name, target);
+
+    obj->bo = NULL;
+
+    return &obj->Base;
+}
+
+/**
+ * Called via glDeleteBuffersARB().
+ * Unmaps the bo if the buffer is still mapped, drops the reference on the
+ * bo (if any) and frees the radeon buffer object.
+ */
+static void
+radeonDeleteBufferObject(GLcontext * ctx,
+                         struct gl_buffer_object *obj)
+{
+    struct radeon_buffer_object *rbo = (struct radeon_buffer_object *) obj;
+
+    if (obj->Pointer != NULL)
+        radeon_bo_unmap(rbo->bo);
+
+    if (rbo->bo != NULL)
+        radeon_bo_unref(rbo->bo);
+
+    _mesa_free(rbo);
+}
+
+
+/**
+ * Allocate space for and store data in a buffer object.  Any data that was
+ * previously stored in the buffer object is lost: the old bo is released
+ * and, for a non-zero size, a new GTT bo is opened.  If data is NULL,
+ * memory will be allocated, but no copy will occur.
+ * Called via ctx->Driver.BufferData().
+ * \return GL_TRUE for success, GL_FALSE if out of memory
+ */
+static GLboolean
+radeonBufferData(GLcontext * ctx,
+                 GLenum target,
+                 GLsizeiptrARB size,
+                 const GLvoid * data,
+                 GLenum usage,
+                 struct gl_buffer_object *obj)
+{
+    radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+    struct radeon_buffer_object *rbo = get_radeon_buffer_object(obj);
+
+    rbo->Base.Size = size;
+    rbo->Base.Usage = usage;
+
+    /* drop the previous storage, if any */
+    if (rbo->bo) {
+        radeon_bo_unref(rbo->bo);
+        rbo->bo = NULL;
+    }
+
+    /* a zero-sized buffer keeps no bo */
+    if (size == 0)
+        return GL_TRUE;
+
+    rbo->bo = radeon_bo_open(radeon->radeonScreen->bom,
+                             0,
+                             size,
+                             32,
+                             RADEON_GEM_DOMAIN_GTT,
+                             0);
+    if (rbo->bo == NULL)
+        return GL_FALSE;
+
+    if (data) {
+        radeon_bo_map(rbo->bo, GL_TRUE);
+        _mesa_memcpy(rbo->bo->ptr, data, size);
+        radeon_bo_unmap(rbo->bo);
+    }
+
+    return GL_TRUE;
+}
+
+/**
+ * Replace data in a subrange of buffer object.  If the data range
+ * specified by size + offset extends beyond the end of the buffer or
+ * if data is NULL, no copy is performed.  Nothing is done either when the
+ * object has no backing bo (zero-sized buffer or failed allocation) or
+ * when size is not positive.
+ * Called via glBufferSubDataARB().
+ */
+static void
+radeonBufferSubData(GLcontext * ctx,
+                    GLenum target,
+                    GLintptrARB offset,
+                    GLsizeiptrARB size,
+                    const GLvoid * data,
+                    struct gl_buffer_object *obj)
+{
+    struct radeon_buffer_object *radeon_obj = get_radeon_buffer_object(obj);
+
+    if (data == NULL || radeon_obj->bo == NULL || size <= 0)
+        return;
+
+    /* reject ranges outside [0, Size); written so offset + size cannot
+     * overflow */
+    if (offset < 0 || offset > obj->Size || size > obj->Size - offset)
+        return;
+
+    radeon_bo_map(radeon_obj->bo, GL_TRUE);
+
+    _mesa_memcpy(radeon_obj->bo->ptr + offset, data, size);
+
+    radeon_bo_unmap(radeon_obj->bo);
+}
+
+/**
+ * Copy a subrange of a buffer object's storage into data.
+ * No copy is performed if data is NULL, if the object has no backing bo,
+ * if size is not positive, or if the range specified by offset + size
+ * extends beyond the end of the buffer.
+ * Called via glGetBufferSubDataARB()
+ */
+static void
+radeonGetBufferSubData(GLcontext * ctx,
+                       GLenum target,
+                       GLintptrARB offset,
+                       GLsizeiptrARB size,
+                       GLvoid * data,
+                       struct gl_buffer_object *obj)
+{
+    struct radeon_buffer_object *radeon_obj = get_radeon_buffer_object(obj);
+
+    if (data == NULL || radeon_obj->bo == NULL || size <= 0)
+        return;
+
+    /* reject ranges outside [0, Size); written so offset + size cannot
+     * overflow */
+    if (offset < 0 || offset > obj->Size || size > obj->Size - offset)
+        return;
+
+    radeon_bo_map(radeon_obj->bo, GL_FALSE);
+
+    _mesa_memcpy(data, radeon_obj->bo->ptr + offset, size);
+
+    radeon_bo_unmap(radeon_obj->bo);
+}
+
+/**
+ * Called via glMapBufferARB()
+ * Flushes the context first for write-only access, then maps the bo and
+ * records the mapping in obj->Pointer.
+ * \return the mapped pointer, or NULL if the object has no bo
+ */
+static void *
+radeonMapBuffer(GLcontext * ctx,
+                GLenum target,
+                GLenum access,
+                struct gl_buffer_object *obj)
+{
+    struct radeon_buffer_object *rbo = get_radeon_buffer_object(obj);
+    const int write_only = (access == GL_WRITE_ONLY_ARB);
+
+    if (write_only)
+        ctx->Driver.Flush(ctx);
+
+    if (!rbo->bo) {
+        obj->Pointer = NULL;
+        return NULL;
+    }
+
+    radeon_bo_map(rbo->bo, write_only);
+
+    obj->Pointer = rbo->bo->ptr;
+    return obj->Pointer;
+}
+
+
+/**
+ * Called via glUnmapBufferARB()
+ * Unmaps the bo and clears obj->Pointer; an object without a bo is left
+ * untouched.
+ * \return always GL_TRUE
+ */
+static GLboolean
+radeonUnmapBuffer(GLcontext * ctx,
+                  GLenum target,
+                  struct gl_buffer_object *obj)
+{
+    struct radeon_buffer_object *rbo = get_radeon_buffer_object(obj);
+
+    if (!rbo->bo)
+        return GL_TRUE;
+
+    radeon_bo_unmap(rbo->bo);
+    obj->Pointer = NULL;
+
+    return GL_TRUE;
+}
+
+/**
+ * Plug the radeon buffer object implementation into a driver function
+ * table.
+ */
+void
+radeonInitBufferObjectFuncs(struct dd_function_table *functions)
+{
+    /* object lifetime */
+    functions->NewBufferObject = radeonNewBufferObject;
+    functions->DeleteBuffer = radeonDeleteBufferObject;
+
+    /* data transfer */
+    functions->BufferData = radeonBufferData;
+    functions->BufferSubData = radeonBufferSubData;
+    functions->GetBufferSubData = radeonGetBufferSubData;
+
+    /* mapping */
+    functions->MapBuffer = radeonMapBuffer;
+    functions->UnmapBuffer = radeonUnmapBuffer;
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_buffer_objects.h b/src/mesa/drivers/dri/radeon/radeon_buffer_objects.h
new file mode 100644
index 0000000000..d681960825
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_buffer_objects.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef RADEON_BUFFER_OBJECTS_H
+#define RADEON_BUFFER_OBJECTS_H
+
+#include "main/mtypes.h"
+
+struct radeon_bo;
+
+/**
+ * Radeon vertex/pixel buffer object, derived from Mesa's gl_buffer_object.
+ *
+ * Base must stay the first member: get_radeon_buffer_object() converts a
+ * gl_buffer_object pointer to this type with a plain cast.
+ */
+struct radeon_buffer_object
+{
+   struct gl_buffer_object Base;  /* Mesa core state, must come first */
+   struct radeon_bo *bo;          /* backing storage, NULL while there is none */
+};
+
+struct radeon_buffer_object *
+get_radeon_buffer_object(struct gl_buffer_object *obj);
+
+/**
+ * Hook the bufferobject implementation into mesa:
+ */
+void radeonInitBufferObjectFuncs(struct dd_function_table *functions);
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_chipset.h b/src/mesa/drivers/dri/radeon/radeon_chipset.h
index 55a73eab20..46a9cd5ff8 100644
--- a/src/mesa/drivers/dri/radeon/radeon_chipset.h
+++ b/src/mesa/drivers/dri/radeon/radeon_chipset.h
@@ -247,11 +247,153 @@
 #define PCI_CHIP_RS350_7835		0x7835
 #define PCI_CHIP_RS690_791E             0x791E
 #define PCI_CHIP_RS690_791F             0x791F
+#define PCI_CHIP_RS600_793F             0x793F
+#define PCI_CHIP_RS600_7941             0x7941
+#define PCI_CHIP_RS600_7942             0x7942
 #define PCI_CHIP_RS740_796C             0x796C
 #define PCI_CHIP_RS740_796D             0x796D
 #define PCI_CHIP_RS740_796E             0x796E
 #define PCI_CHIP_RS740_796F             0x796F
 
+#define PCI_CHIP_R600_9400              0x9400
+#define PCI_CHIP_R600_9401              0x9401
+#define PCI_CHIP_R600_9402              0x9402
+#define PCI_CHIP_R600_9403              0x9403
+#define PCI_CHIP_R600_9405              0x9405
+#define PCI_CHIP_R600_940A              0x940A
+#define PCI_CHIP_R600_940B              0x940B
+#define PCI_CHIP_R600_940F              0x940F
+
+#define PCI_CHIP_RV610_94C0             0x94C0
+#define PCI_CHIP_RV610_94C1             0x94C1
+#define PCI_CHIP_RV610_94C3             0x94C3
+#define PCI_CHIP_RV610_94C4             0x94C4
+#define PCI_CHIP_RV610_94C5             0x94C5
+#define PCI_CHIP_RV610_94C6             0x94C6
+#define PCI_CHIP_RV610_94C7             0x94C7
+#define PCI_CHIP_RV610_94C8             0x94C8
+#define PCI_CHIP_RV610_94C9             0x94C9
+#define PCI_CHIP_RV610_94CB             0x94CB
+#define PCI_CHIP_RV610_94CC             0x94CC
+#define PCI_CHIP_RV610_94CD             0x94CD
+
+#define PCI_CHIP_RV630_9580             0x9580
+#define PCI_CHIP_RV630_9581             0x9581
+#define PCI_CHIP_RV630_9583             0x9583
+#define PCI_CHIP_RV630_9586             0x9586
+#define PCI_CHIP_RV630_9587             0x9587
+#define PCI_CHIP_RV630_9588             0x9588
+#define PCI_CHIP_RV630_9589             0x9589
+#define PCI_CHIP_RV630_958A             0x958A
+#define PCI_CHIP_RV630_958B             0x958B
+#define PCI_CHIP_RV630_958C             0x958C
+#define PCI_CHIP_RV630_958D             0x958D
+#define PCI_CHIP_RV630_958E             0x958E
+#define PCI_CHIP_RV630_958F             0x958F
+
+#define PCI_CHIP_RV670_9500             0x9500
+#define PCI_CHIP_RV670_9501             0x9501
+#define PCI_CHIP_RV670_9504             0x9504
+#define PCI_CHIP_RV670_9505             0x9505
+#define PCI_CHIP_RV670_9506             0x9506
+#define PCI_CHIP_RV670_9507             0x9507
+#define PCI_CHIP_RV670_9508             0x9508
+#define PCI_CHIP_RV670_9509             0x9509
+#define PCI_CHIP_RV670_950F             0x950F
+#define PCI_CHIP_RV670_9511             0x9511
+#define PCI_CHIP_RV670_9515             0x9515
+#define PCI_CHIP_RV670_9517             0x9517
+#define PCI_CHIP_RV670_9519             0x9519
+
+#define PCI_CHIP_RV620_95C0             0x95C0
+#define PCI_CHIP_RV620_95C2             0x95C2
+#define PCI_CHIP_RV620_95C4             0x95C4
+#define PCI_CHIP_RV620_95C5             0x95C5
+#define PCI_CHIP_RV620_95C6             0x95C6
+#define PCI_CHIP_RV620_95C7             0x95C7
+#define PCI_CHIP_RV620_95C9             0x95C9
+#define PCI_CHIP_RV620_95CC             0x95CC
+#define PCI_CHIP_RV620_95CD             0x95CD
+#define PCI_CHIP_RV620_95CE             0x95CE
+#define PCI_CHIP_RV620_95CF             0x95CF
+
+#define PCI_CHIP_RV635_9590             0x9590
+#define PCI_CHIP_RV635_9591             0x9591
+#define PCI_CHIP_RV635_9593             0x9593
+#define PCI_CHIP_RV635_9595             0x9595
+#define PCI_CHIP_RV635_9596             0x9596
+#define PCI_CHIP_RV635_9597             0x9597
+#define PCI_CHIP_RV635_9598             0x9598
+#define PCI_CHIP_RV635_9599             0x9599
+#define PCI_CHIP_RV635_959B             0x959B
+
+#define PCI_CHIP_RS780_9610             0x9610
+#define PCI_CHIP_RS780_9611             0x9611
+#define PCI_CHIP_RS780_9612             0x9612
+#define PCI_CHIP_RS780_9613             0x9613
+#define PCI_CHIP_RS780_9614             0x9614
+#define PCI_CHIP_RS780_9615             0x9615
+#define PCI_CHIP_RS780_9616             0x9616
+
+#define PCI_CHIP_RS880_9710             0x9710
+#define PCI_CHIP_RS880_9711             0x9711
+#define PCI_CHIP_RS880_9712             0x9712
+#define PCI_CHIP_RS880_9713             0x9713
+#define PCI_CHIP_RS880_9714             0x9714
+
+#define PCI_CHIP_RV770_9440             0x9440
+#define PCI_CHIP_RV770_9441             0x9441
+#define PCI_CHIP_RV770_9442             0x9442
+#define PCI_CHIP_RV770_9443             0x9443
+#define PCI_CHIP_RV770_9444             0x9444
+#define PCI_CHIP_RV770_9446             0x9446
+#define PCI_CHIP_RV770_944A             0x944A
+#define PCI_CHIP_RV770_944B             0x944B
+#define PCI_CHIP_RV770_944C             0x944C
+#define PCI_CHIP_RV770_944E             0x944E
+#define PCI_CHIP_RV770_9450             0x9450
+#define PCI_CHIP_RV770_9452             0x9452
+#define PCI_CHIP_RV770_9456             0x9456
+#define PCI_CHIP_RV770_945A             0x945A
+#define PCI_CHIP_RV770_945B             0x945B
+#define PCI_CHIP_RV790_9460             0x9460
+#define PCI_CHIP_RV790_9462             0x9462
+#define PCI_CHIP_RV770_946A             0x946A
+#define PCI_CHIP_RV770_946B             0x946B
+#define PCI_CHIP_RV770_947A             0x947A
+#define PCI_CHIP_RV770_947B             0x947B
+
+#define PCI_CHIP_RV730_9480             0x9480
+#define PCI_CHIP_RV730_9487             0x9487
+#define PCI_CHIP_RV730_9488             0x9488
+#define PCI_CHIP_RV730_9489             0x9489
+#define PCI_CHIP_RV730_948F             0x948F
+#define PCI_CHIP_RV730_9490             0x9490
+#define PCI_CHIP_RV730_9491             0x9491
+#define PCI_CHIP_RV730_9495             0x9495
+#define PCI_CHIP_RV730_9498             0x9498
+#define PCI_CHIP_RV730_949C             0x949C
+#define PCI_CHIP_RV730_949E             0x949E
+#define PCI_CHIP_RV730_949F             0x949F
+
+#define PCI_CHIP_RV710_9540             0x9540
+#define PCI_CHIP_RV710_9541             0x9541
+#define PCI_CHIP_RV710_9542             0x9542
+#define PCI_CHIP_RV710_954E             0x954E
+#define PCI_CHIP_RV710_954F             0x954F
+#define PCI_CHIP_RV710_9552             0x9552
+#define PCI_CHIP_RV710_9553             0x9553
+#define PCI_CHIP_RV710_9555             0x9555
+#define PCI_CHIP_RV710_9557             0x9557
+
+#define PCI_CHIP_RV740_94A0             0x94A0
+#define PCI_CHIP_RV740_94A1             0x94A1
+#define PCI_CHIP_RV740_94A3             0x94A3
+#define PCI_CHIP_RV740_94B1             0x94B1
+#define PCI_CHIP_RV740_94B3             0x94B3
+#define PCI_CHIP_RV740_94B4             0x94B4
+#define PCI_CHIP_RV740_94B5             0x94B5
+#define PCI_CHIP_RV740_94B9             0x94B9
 
 enum {
    CHIP_FAMILY_R100,
@@ -270,6 +412,7 @@ enum {
    CHIP_FAMILY_R420,
    CHIP_FAMILY_RV410,
    CHIP_FAMILY_RS400,
+   CHIP_FAMILY_RS600,
    CHIP_FAMILY_RS690,
    CHIP_FAMILY_RS740,
    CHIP_FAMILY_RV515,
@@ -278,6 +421,18 @@ enum {
    CHIP_FAMILY_R580,
    CHIP_FAMILY_RV560,
    CHIP_FAMILY_RV570,
+   CHIP_FAMILY_R600,
+   CHIP_FAMILY_RV610,
+   CHIP_FAMILY_RV630,
+   CHIP_FAMILY_RV670,
+   CHIP_FAMILY_RV620,
+   CHIP_FAMILY_RV635,
+   CHIP_FAMILY_RS780,
+   CHIP_FAMILY_RS880,
+   CHIP_FAMILY_RV770,
+   CHIP_FAMILY_RV730,
+   CHIP_FAMILY_RV710,
+   CHIP_FAMILY_RV740,
    CHIP_FAMILY_LAST
 };
 
@@ -285,6 +440,7 @@ enum {
 #define RADEON_CLASS_R100		(0 << 0)
 #define RADEON_CLASS_R200		(1 << 0)
 #define RADEON_CLASS_R300		(2 << 0)
+#define RADEON_CLASS_R600		(3 << 0)
 #define RADEON_CLASS_MASK		(3 << 0)
 
 #define RADEON_CHIPSET_TCL		(1 << 2)	/* tcl support - any radeon */
diff --git a/src/mesa/drivers/dri/radeon/radeon_cmdbuf.h b/src/mesa/drivers/dri/radeon/radeon_cmdbuf.h
new file mode 100644
index 0000000000..6fcd1ce7ca
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_cmdbuf.h
@@ -0,0 +1,121 @@
+#ifndef COMMON_CMDBUF_H
+#define COMMON_CMDBUF_H
+
+#include "radeon_bocs_wrapper.h"
+
+GLboolean rcommonEnsureCmdBufSpace(radeonContextPtr rmesa, int dwords, const char *caller);
+int rcommonFlushCmdBuf(radeonContextPtr rmesa, const char *caller);
+int rcommonFlushCmdBufLocked(radeonContextPtr rmesa, const char *caller);
+void rcommonInitCmdBuf(radeonContextPtr rmesa);
+void rcommonDestroyCmdBuf(radeonContextPtr rmesa);
+
+void rcommonBeginBatch(radeonContextPtr rmesa,
+		       int n,
+		       int dostate,
+		       const char *file,
+		       const char *function,
+		       int line);
+
+/* +r6/r7 : code here moved */
+
+#define CP_PACKET2  (2 << 30)
+#define CP_PACKET0(reg, n)	(RADEON_CP_PACKET0 | ((n)<<16) | ((reg)>>2))
+#define CP_PACKET0_ONE(reg, n)	(RADEON_CP_PACKET0 | RADEON_CP_PACKET0_ONE_REG_WR | ((n)<<16) | ((reg)>>2))
+#define CP_PACKET3(pkt, n)	(RADEON_CP_PACKET3 | (pkt) | ((n) << 16))
+
+/**
+ * Every function writing to the command buffer needs to declare this
+ * to get the necessary local variables.
+ */
+#define BATCH_LOCALS(rmesa) \
+	const radeonContextPtr b_l_rmesa = rmesa
+
+/**
+ * Prepare writing n dwords to the command buffer,
+ * including producing any necessary state emits on buffer wraparound.
+ */
+#define BEGIN_BATCH(n) rcommonBeginBatch(b_l_rmesa, n, 1, __FILE__, __FUNCTION__, __LINE__)
+
+/**
+ * Same as BEGIN_BATCH, but do not cause automatic state emits.
+ */
+#define BEGIN_BATCH_NO_AUTOSTATE(n) rcommonBeginBatch(b_l_rmesa, n, 0, __FILE__, __FUNCTION__, __LINE__)
+
+/**
+ * Write one dword to the command buffer.
+ */
+#define OUT_BATCH(data) \
+	do { \
+        radeon_cs_write_dword(b_l_rmesa->cmdbuf.cs, data);\
+	} while(0)
+
+/**
+ * Write a relocated dword to the command buffer.
+ */
+#define OUT_BATCH_RELOC(data, bo, offset, rd, wd, flags) 	\
+	do { 							\
+	int  __offset = (offset);				\
+        if (0 && __offset) {					\
+            fprintf(stderr, "(%s:%s:%d) offset : %d\n",		\
+            __FILE__, __FUNCTION__, __LINE__, __offset);	\
+        }							\
+        radeon_cs_write_dword(b_l_rmesa->cmdbuf.cs, __offset);	\
+        radeon_cs_write_reloc(b_l_rmesa->cmdbuf.cs, 		\
+                              bo, rd, wd, flags);		\
+	if (!b_l_rmesa->radeonScreen->kernel_mm) 		\
+		b_l_rmesa->cmdbuf.cs->section_cdw += 2;		\
+	} while(0)
+
+
+/**
+ * Write n dwords from ptr to the command buffer.
+ */
+#define OUT_BATCH_TABLE(ptr,n) \
+	do { \
+		radeon_cs_write_table(b_l_rmesa->cmdbuf.cs, (ptr), (n));\
+	} while(0)
+
+/**
+ * Finish writing dwords to the command buffer.
+ * The number of (direct or indirect) OUT_BATCH calls between the previous
+ * BEGIN_BATCH and END_BATCH must match the number specified at BEGIN_BATCH time.
+ */
+#define END_BATCH() \
+	do { \
+        radeon_cs_end(b_l_rmesa->cmdbuf.cs, __FILE__, __FUNCTION__, __LINE__);\
+	} while(0)
+
+/**
+ * After the last END_BATCH() of rendering, this indicates that flushing
+ * the command buffer now is okay.
+ */
+#define COMMIT_BATCH() \
+	do { \
+	} while(0)
+
+
+/** Single register write to command buffer; requires 2 dwords. */
+#define OUT_BATCH_REGVAL(reg, val) \
+	OUT_BATCH(cmdpacket0(b_l_rmesa->radeonScreen, (reg), 1)); \
+	OUT_BATCH((val))
+
+/** Continuous register range write to command buffer; requires 1 dword,
+ * expects count dwords afterwards for register contents. */
+#define OUT_BATCH_REGSEQ(reg, count) \
+	OUT_BATCH(cmdpacket0(b_l_rmesa->radeonScreen, (reg), (count)))
+
+/** Write a 32 bit float to the ring; requires 1 dword. */
+#define OUT_BATCH_FLOAT32(f) \
+	OUT_BATCH(radeonPackFloat32((f)))
+
+/* +r6/r7 : code here moved */
+
+/**
+ * Call the context's Driver.Flush hook whenever the command stream already
+ * holds dwords (cs->cdw is non-zero) or radeon->dma.flush is set;
+ * otherwise do nothing.
+ */
+static INLINE void radeon_firevertices(radeonContextPtr radeon)
+{
+   if (!radeon->cmdbuf.cs->cdw && !radeon->dma.flush)
+      return;
+
+   radeon->glCtx->Driver.Flush(radeon->glCtx); /* +r6/r7 */
+}
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
new file mode 100644
index 0000000000..a4c7b40798
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -0,0 +1,1349 @@
+/**************************************************************************
+
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+/*
+   - Scissor implementation
+   - buffer swap/copy ioctls
+   - finish/flush
+   - state emission
+   - cmdbuffer management
+*/
+
+#include <errno.h>
+#include "main/glheader.h"
+#include "main/imports.h"
+#include "main/context.h"
+#include "main/enums.h"
+#include "main/framebuffer.h"
+#include "main/renderbuffer.h"
+#include "drivers/common/meta.h"
+
+#include "vblank.h"
+
+#include "radeon_common.h"
+#include "radeon_bocs_wrapper.h"
+#include "radeon_lock.h"
+#include "radeon_drm.h"
+#include "radeon_queryobj.h"
+
+/**
+ * Enable verbose debug output for emit code.
+ * 0 no output
+ * 1 most output
+ * 2 also print state values
+ */
+#define RADEON_CMDBUF         0
+
+/* =============================================================
+ * Scissoring
+ */
+
+/**
+ * Store the intersection of rectangles a and b in *out.
+ *
+ * \return GL_TRUE when the result has x1 < x2 and y1 < y2, GL_FALSE
+ *         otherwise.  *out is written in both cases.
+ */
+static GLboolean intersect_rect(drm_clip_rect_t * out,
+				drm_clip_rect_t * a, drm_clip_rect_t * b)
+{
+	/* Start from a, then pull each edge inwards wherever b is tighter. */
+	*out = *a;
+
+	out->x1 = (b->x1 > out->x1) ? b->x1 : out->x1;
+	out->y1 = (b->y1 > out->y1) ? b->y1 : out->y1;
+	out->x2 = (b->x2 < out->x2) ? b->x2 : out->x2;
+	out->y2 = (b->y2 < out->y2) ? b->y2 : out->y2;
+
+	return (out->x1 < out->x2 && out->y1 < out->y2) ? GL_TRUE : GL_FALSE;
+}
+
+/**
+ * Rebuild radeon->state.scissor.pClipRects as the intersection of every
+ * cliprect in radeon->pClipRects with radeon->state.scissor.rect, then call
+ * the vtbl.update_scissor hook if one is installed.
+ *
+ * The scissor cliprect store is reallocated when it is too small for
+ * radeon->numClipRects entries.  If that allocation fails, the scissor
+ * cliprect list is left empty and the hook is not called.
+ */
+void radeonRecalcScissorRects(radeonContextPtr radeon)
+{
+	drm_clip_rect_t *out;
+	int i;
+
+	/* Grow cliprect store?
+	 */
+	if (radeon->state.scissor.numAllocedClipRects < radeon->numClipRects) {
+		while (radeon->state.scissor.numAllocedClipRects <
+		       radeon->numClipRects) {
+			radeon->state.scissor.numAllocedClipRects += 1;	/* zero case */
+			radeon->state.scissor.numAllocedClipRects *= 2;
+		}
+
+		if (radeon->state.scissor.pClipRects)
+			FREE(radeon->state.scissor.pClipRects);
+
+		radeon->state.scissor.pClipRects =
+			MALLOC(radeon->state.scissor.numAllocedClipRects *
+			       sizeof(drm_clip_rect_t));
+
+		if (radeon->state.scissor.pClipRects == NULL) {
+			/* The previous store has already been freed, so the old
+			 * count must go too: otherwise numClipRects would still
+			 * describe entries of a NULL array.
+			 */
+			radeon->state.scissor.numAllocedClipRects = 0;
+			radeon->state.scissor.numClipRects = 0;
+			return;
+		}
+	}
+
+	out = radeon->state.scissor.pClipRects;
+	radeon->state.scissor.numClipRects = 0;
+
+	/* Keep only the cliprects that overlap the scissor rectangle; "out"
+	 * advances only when intersect_rect() reports a non-empty result.
+	 */
+	for (i = 0; i < radeon->numClipRects; i++) {
+		if (intersect_rect(out,
+				   &radeon->pClipRects[i],
+				   &radeon->state.scissor.rect)) {
+			radeon->state.scissor.numClipRects++;
+			out++;
+		}
+	}
+
+	if (radeon->vtbl.update_scissor)
+	   radeon->vtbl.update_scissor(radeon->glCtx);
+}
+
+/**
+ * Select the cliprect list and drawable offset to render with.
+ *
+ * - constant_cliprect: a single rectangle covering the current draw
+ *   buffer (stored in radeon->fboRect), offset (0, 0).
+ * - front cliprects requested, page flipping active, or no back cliprects:
+ *   the drawable's front cliprects and position.
+ * - otherwise: the drawable's back cliprects and back position.
+ */
+void radeon_get_cliprects(radeonContextPtr radeon,
+			  struct drm_clip_rect **cliprects,
+			  unsigned int *num_cliprects,
+			  int *x_off, int *y_off)
+{
+	__DRIdrawablePrivate *draw = radeon_get_drawable(radeon);
+	struct radeon_framebuffer *draw_fb = draw->driverPrivate;
+
+	if (radeon->constant_cliprect) {
+		radeon->fboRect.x1 = 0;
+		radeon->fboRect.y1 = 0;
+		radeon->fboRect.x2 = radeon->glCtx->DrawBuffer->Width;
+		radeon->fboRect.y2 = radeon->glCtx->DrawBuffer->Height;
+
+		*cliprects = &radeon->fboRect;
+		*num_cliprects = 1;
+		*x_off = 0;
+		*y_off = 0;
+		return;
+	}
+
+	if (radeon->front_cliprects ||
+	    draw_fb->pf_active || draw->numBackClipRects == 0) {
+		*cliprects = draw->pClipRects;
+		*num_cliprects = draw->numClipRects;
+		*x_off = draw->x;
+		*y_off = draw->y;
+		return;
+	}
+
+	*num_cliprects = draw->numBackClipRects;
+	*cliprects = draw->pBackClipRects;
+	*x_off = draw->backX;
+	*y_off = draw->backY;
+}
+
+/**
+ * Update cliprects and scissors.
+ *
+ * Refreshes radeon->pClipRects/numClipRects via radeon_get_cliprects(),
+ * resizes the draw (and, if different, read) framebuffer when its size no
+ * longer matches the drawable, and recomputes the scissored cliprects when
+ * scissoring is enabled.
+ */
+void radeonSetCliprects(radeonContextPtr radeon)
+{
+	__DRIdrawablePrivate *const drawable = radeon_get_drawable(radeon);
+	__DRIdrawablePrivate *const readable = radeon_get_readable(radeon);
+	struct radeon_framebuffer *const draw_rfb = drawable->driverPrivate;
+	struct radeon_framebuffer *const read_rfb = readable->driverPrivate;
+	/* Filled in by radeon_get_cliprects() but not used afterwards here. */
+	int x_off, y_off;
+
+	radeon_get_cliprects(radeon, &radeon->pClipRects,
+			     &radeon->numClipRects, &x_off, &y_off);
+
+	/* Bring the Mesa framebuffer size in line with the drawable. */
+	if ((draw_rfb->base.Width != drawable->w) ||
+	    (draw_rfb->base.Height != drawable->h)) {
+		_mesa_resize_framebuffer(radeon->glCtx, &draw_rfb->base,
+					 drawable->w, drawable->h);
+		draw_rfb->base.Initialized = GL_TRUE;
+	}
+
+	/* Same for the read drawable, unless it is the one handled above. */
+	if (drawable != readable) {
+		if ((read_rfb->base.Width != readable->w) ||
+		    (read_rfb->base.Height != readable->h)) {
+			_mesa_resize_framebuffer(radeon->glCtx, &read_rfb->base,
+						 readable->w, readable->h);
+			read_rfb->base.Initialized = GL_TRUE;
+		}
+	}
+
+	if (radeon->state.scissor.enabled)
+		radeonRecalcScissorRects(radeon);
+
+}
+
+
+
+/**
+ * Recompute radeon state.scissor.rect from ctx->Scissor and the current
+ * draw buffer, then rebuild the scissored cliprect list.
+ * Returns without doing anything when no draw buffer is bound.
+ */
+void radeonUpdateScissor( GLcontext *ctx )
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	GLint x = ctx->Scissor.X, y = ctx->Scissor.Y;
+	GLsizei w = ctx->Scissor.Width, h = ctx->Scissor.Height;
+	int x1, y1, x2, y2;
+	int min_x, min_y, max_x, max_y;
+
+	if (!ctx->DrawBuffer)
+	    return;
+	/* Clamp bounds: the last valid pixel of the draw buffer. */
+	min_x = min_y = 0;
+	max_x = ctx->DrawBuffer->Width - 1;
+	max_y = ctx->DrawBuffer->Height - 1;
+
+	if ( !ctx->DrawBuffer->Name ) {
+		/* Framebuffer with Name == 0: Y is flipped against the
+		 * buffer height. */
+		x1 = x;
+		y1 = ctx->DrawBuffer->Height - (y + h);
+		x2 = x + w - 1;
+		y2 = y1 + h - 1;
+	} else {
+		/* Named framebuffer: coordinates are used unflipped. */
+		x1 = x;
+		y1 = y;
+		x2 = x + w - 1;
+		y2 = y + h - 1;
+
+	}
+	if (!rmesa->radeonScreen->kernel_mm) {
+	   /* Fix scissors for dri 1 */
+
+	   /* Translate by the drawable position; the upper edges and upper
+	    * clamp bounds are additionally moved one pixel further out. */
+	   __DRIdrawablePrivate *dPriv = radeon_get_drawable(rmesa);
+	   x1 += dPriv->x;
+	   x2 += dPriv->x + 1;
+	   min_x += dPriv->x;
+	   max_x += dPriv->x + 1;
+	   y1 += dPriv->y;
+	   y2 += dPriv->y + 1;
+	   min_y += dPriv->y;
+	   max_y += dPriv->y + 1;
+	}
+
+	rmesa->state.scissor.rect.x1 = CLAMP(x1,  min_x, max_x);
+	rmesa->state.scissor.rect.y1 = CLAMP(y1,  min_y, max_y);
+	rmesa->state.scissor.rect.x2 = CLAMP(x2,  min_x, max_x);
+	rmesa->state.scissor.rect.y2 = CLAMP(y2,  min_y, max_y);
+
+	radeonRecalcScissorRects( rmesa );
+}
+
+/* =============================================================
+ * Scissoring
+ */
+
+/**
+ * Scissor-change entry point.  The x/y/w/h arguments are not used here:
+ * radeonUpdateScissor() takes the rectangle from ctx->Scissor instead.
+ * Nothing happens while scissoring is disabled.
+ */
+void radeonScissor(GLcontext* ctx, GLint x, GLint y, GLsizei w, GLsizei h)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+
+	if (!ctx->Scissor.Enabled)
+		return;
+
+	/* We don't pipeline cliprect changes */
+	radeon_firevertices(radeon);
+	radeonUpdateScissor(ctx);
+}
+
+/**
+ * Send a polygon stipple pattern to the kernel through the
+ * DRM_RADEON_STIPPLE command.  mask is read as 32 GLuint rows, which are
+ * stored in reverse order before being sent.
+ */
+void radeonPolygonStipplePreKMS( GLcontext *ctx, const GLubyte *mask )
+{
+   radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+   const GLuint *rows = (GLuint *) mask;
+   drm_radeon_stipple_t stipple;
+   GLuint row;
+
+   /* Must flip pattern upside down: row N goes to slot 31 - N.
+   */
+   for ( row = 0 ; row < 32 ; row++ )
+      stipple.mask[31 - row] = rows[row];
+
+   /* TODO: push this into cmd mechanism
+   */
+   radeon_firevertices(radeon);
+
+   LOCK_HARDWARE( radeon );
+   drmCommandWrite( radeon->dri.fd, DRM_RADEON_STIPPLE,
+	 &stipple, sizeof(stipple) );
+   UNLOCK_HARDWARE( radeon );
+}
+
+
+/* ================================================================
+ * SwapBuffers with client-side throttling
+ */
+
+/**
+ * Query RADEON_PARAM_LAST_FRAME through DRM_RADEON_GETPARAM.
+ * A failing query is reported on stderr and terminates the process.
+ */
+static uint32_t radeonGetLastFrame(radeonContextPtr radeon)
+{
+	uint32_t frame = 0;
+	drm_radeon_getparam_t query;
+	int err;
+
+	query.param = RADEON_PARAM_LAST_FRAME;
+	query.value = (int *)&frame;
+
+	err = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
+				  &query, sizeof(query));
+	if (err) {
+		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__,
+			err);
+		exit(1);
+	}
+
+	return frame;
+}
+
+/**
+ * Query RADEON_PARAM_LAST_CLEAR through DRM_RADEON_GETPARAM and return
+ * the value the kernel stored.
+ * A failing query is reported on stderr and terminates the process.
+ */
+uint32_t radeonGetAge(radeonContextPtr radeon)
+{
+	uint32_t age;
+	drm_radeon_getparam_t query;
+	int err;
+
+	query.param = RADEON_PARAM_LAST_CLEAR;
+	query.value = (int *)&age;
+
+	err = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
+				  &query, sizeof(query));
+	if (err) {
+		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__,
+			err);
+		exit(1);
+	}
+
+	return age;
+}
+
+/**
+ * Issue DRM_RADEON_IRQ_EMIT, handing the kernel the address of
+ * radeon->iw.irq_seq.
+ * A failing call is reported on stderr and terminates the process.
+ */
+static void radeonEmitIrqLocked(radeonContextPtr radeon)
+{
+	drm_radeon_irq_emit_t emit;
+	int err;
+
+	emit.irq_seq = &radeon->iw.irq_seq;
+
+	err = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_IRQ_EMIT,
+				  &emit, sizeof(emit));
+	if (!err)
+		return;
+
+	fprintf(stderr, "%s: drmRadeonIrqEmit: %d\n", __FUNCTION__,
+		err);
+	exit(1);
+}
+
+/**
+ * Issue DRM_RADEON_IRQ_WAIT with radeon->iw, retrying for as long as the
+ * call fails with errno set to EINTR or EBUSY.  Any other failure is
+ * reported on stderr and terminates the process.
+ */
+static void radeonWaitIrq(radeonContextPtr radeon)
+{
+	int err;
+
+	for (;;) {
+		err = drmCommandWrite(radeon->dri.fd, DRM_RADEON_IRQ_WAIT,
+				      &radeon->iw, sizeof(radeon->iw));
+		if (!err)
+			return;
+
+		/* Only interrupted/busy waits are worth repeating. */
+		if (errno != EINTR && errno != EBUSY)
+			break;
+	}
+
+	fprintf(stderr, "%s: drmRadeonIrqWait: %d\n", __FUNCTION__,
+		err);
+	exit(1);
+}
+
+/**
+ * Wait until radeonGetLastFrame() is no longer behind sarea->last_frame.
+ *
+ * The sleeping waits below drop and re-take the hardware lock
+ * (UNLOCK_HARDWARE/LOCK_HARDWARE pairs); radeonScheduleSwap() calls this
+ * with the lock held.
+ */
+static void radeonWaitForFrameCompletion(radeonContextPtr radeon)
+{
+	drm_radeon_sarea_t *sarea = radeon->sarea;
+
+	if (radeon->do_irqs) {
+		if (radeonGetLastFrame(radeon) < sarea->last_frame) {
+			if (!radeon->irqsEmitted) {
+				/* No IRQ outstanding to sleep on: poll
+				 * without releasing the lock. */
+				while (radeonGetLastFrame(radeon) <
+				       sarea->last_frame) ;
+			} else {
+				UNLOCK_HARDWARE(radeon);
+				radeonWaitIrq(radeon);
+				LOCK_HARDWARE(radeon);
+			}
+			/* Having had to wait, (re)arm the counter so the
+			 * next 10 calls each emit an IRQ below. */
+			radeon->irqsEmitted = 10;
+		}
+
+		if (radeon->irqsEmitted) {
+			radeonEmitIrqLocked(radeon);
+			radeon->irqsEmitted--;
+		}
+	} else {
+		/* No IRQ support: poll, releasing the lock (and optionally
+		 * sleeping) between checks. */
+		while (radeonGetLastFrame(radeon) < sarea->last_frame) {
+			UNLOCK_HARDWARE(radeon);
+			if (radeon->do_usleeps)
+				DO_USLEEP(1);
+			LOCK_HARDWARE(radeon);
+		}
+	}
+}
+
+/* wait for idle
+ *
+ * Issues DRM_RADEON_CP_IDLE up to 100 times, sleeping via DO_USLEEP(1)
+ * after each non-zero result.  If the last result is negative, the
+ * hardware lock is released, an error is printed and the process exits.
+ * NOTE(review): the message names "R300" although this file is shared
+ * radeon code; a positive final result is not treated as an error.
+ */
+void radeonWaitForIdleLocked(radeonContextPtr radeon)
+{
+	int ret;
+	int i = 0;
+
+	do {
+		ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_CP_IDLE);
+		if (ret)
+			DO_USLEEP(1);
+	} while (ret && ++i < 100);
+
+	if (ret < 0) {
+		UNLOCK_HARDWARE(radeon);
+		fprintf(stderr, "Error: R300 timed out... exiting\n");
+		exit(-1);
+	}
+}
+
+/**
+ * Take the hardware lock and wait for the engine to go idle.
+ * Does nothing when the screen has dri2 enabled.
+ */
+static void radeonWaitForIdle(radeonContextPtr radeon)
+{
+	if (radeon->radeonScreen->driScreen->dri2.enabled)
+		return;
+
+	LOCK_HARDWARE(radeon);
+	radeonWaitForIdleLocked(radeon);
+	UNLOCK_HARDWARE(radeon);
+}
+
+/**
+ * Re-point the framebuffer's FRONT_LEFT attachment at
+ * color_rb[pf_current_page] and its BACK_LEFT attachment at the following
+ * page, (pf_current_page + 1) % pf_num_pages.
+ * NOTE(review): pf_num_pages is used as a modulus without a zero check.
+ */
+static void radeon_flip_renderbuffers(struct radeon_framebuffer *rfb)
+{
+	int current_page = rfb->pf_current_page;
+	int next_page = (current_page + 1) % rfb->pf_num_pages;
+	struct gl_renderbuffer *tmp_rb;
+
+	/* Exchange renderbuffers if necessary but make sure their
+	 * reference counts are preserved.
+	 *
+	 * How the sequence below does that, assuming the usual
+	 * _mesa_reference_* semantics (release the old *ptr, then take a
+	 * reference on the new value) -- TODO confirm:
+	 *  1. tmp_rb takes an extra reference on the currently attached
+	 *     renderbuffer;
+	 *  2. tmp_rb is overwritten by plain assignment, so that extra
+	 *     reference is intentionally not released;
+	 *  3. attaching the new renderbuffer releases the attachment's
+	 *     reference on the old one and adds one to the new one;
+	 *  4. clearing tmp_rb releases one reference on the new one.
+	 * Net effect: both counts end up where they started.
+	 */
+	if (rfb->color_rb[current_page] &&
+	    rfb->base.Attachment[BUFFER_FRONT_LEFT].Renderbuffer !=
+	    &rfb->color_rb[current_page]->base) {
+		tmp_rb = NULL;
+		_mesa_reference_renderbuffer(&tmp_rb,
+					     rfb->base.Attachment[BUFFER_FRONT_LEFT].Renderbuffer);
+		tmp_rb = &rfb->color_rb[current_page]->base;
+		_mesa_reference_renderbuffer(&rfb->base.Attachment[BUFFER_FRONT_LEFT].Renderbuffer, tmp_rb);
+		_mesa_reference_renderbuffer(&tmp_rb, NULL);
+	}
+
+	/* Same exchange for the back-left attachment and the next page. */
+	if (rfb->color_rb[next_page] &&
+	    rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer !=
+	    &rfb->color_rb[next_page]->base) {
+		tmp_rb = NULL;
+		_mesa_reference_renderbuffer(&tmp_rb,
+					     rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer);
+		tmp_rb = &rfb->color_rb[next_page]->base;
+		_mesa_reference_renderbuffer(&rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer, tmp_rb);
+		_mesa_reference_renderbuffer(&tmp_rb, NULL);
+	}
+}
+
+/* Copy the back color buffer to the front color buffer.
+ *
+ * The drawable's cliprects are handed to the kernel through
+ * rmesa->sarea->boxes in batches of at most RADEON_NR_SAREA_CLIPRECTS,
+ * with one DRM_RADEON_SWAP per non-empty batch.  When rect is non-NULL,
+ * each cliprect is first clipped against it and empty results are
+ * dropped.  The whole operation runs under the hardware lock; a failing
+ * swap command prints an error and terminates the process.
+ */
+void radeonCopyBuffer( __DRIdrawablePrivate *dPriv,
+		       const drm_clip_rect_t	  *rect)
+{
+	radeonContextPtr rmesa;
+	struct radeon_framebuffer *rfb;
+	GLint nbox, i, ret;
+
+	assert(dPriv);
+	assert(dPriv->driContextPriv);
+	assert(dPriv->driContextPriv->driverPrivate);
+
+	rmesa = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+
+	LOCK_HARDWARE(rmesa);
+
+	/* NOTE(review): rfb is assigned here but not read again in this
+	 * function. */
+	rfb = dPriv->driverPrivate;
+
+	if ( RADEON_DEBUG & RADEON_IOCTL ) {
+		fprintf( stderr, "\n%s( %p )\n\n", __FUNCTION__, (void *) rmesa->glCtx );
+	}
+
+	nbox = dPriv->numClipRects; /* must be in locked region */
+
+	/* i is advanced by the inner loop, one batch per outer iteration. */
+	for ( i = 0 ; i < nbox ; ) {
+		GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS , nbox );
+		drm_clip_rect_t *box = dPriv->pClipRects;
+		drm_clip_rect_t *b = rmesa->sarea->boxes;
+		GLint n = 0;
+
+		for ( ; i < nr ; i++ ) {
+
+			*b = box[i];
+
+			if (rect)
+			{
+				/* Clip this box against the sub-rectangle. */
+				if (rect->x1 > b->x1)
+					b->x1 = rect->x1;
+				if (rect->y1 > b->y1)
+					b->y1 = rect->y1;
+				if (rect->x2 < b->x2)
+					b->x2 = rect->x2;
+				if (rect->y2 < b->y2)
+					b->y2 = rect->y2;
+
+				/* Empty after clipping: b is not advanced, so
+				 * the next box overwrites this slot. */
+				if (b->x1 >= b->x2 || b->y1 >= b->y2)
+					continue;
+			}
+
+			b++;
+			n++;
+		}
+		rmesa->sarea->nbox = n;
+
+		/* Nothing left in this batch: skip the swap command. */
+		if (!n)
+			continue;
+
+		ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_SWAP );
+
+		if ( ret ) {
+			fprintf( stderr, "DRM_RADEON_SWAP_BUFFERS: return = %d\n", ret );
+			UNLOCK_HARDWARE( rmesa );
+			exit( 1 );
+		}
+	}
+
+	UNLOCK_HARDWARE( rmesa );
+}
+
+/**
+ * Throttle before a swap: flush buffered vertices, wait for the previous
+ * frame to complete and then wait for vblank.
+ *
+ * A drawable without cliprects only sleeps for 10ms and returns early; on
+ * that path *missed_target is not written.
+ *
+ * \return always 0.
+ */
+static int radeonScheduleSwap(__DRIdrawablePrivate *dPriv, GLboolean *missed_target)
+{
+	radeonContextPtr rmesa;
+
+	rmesa = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+	radeon_firevertices(rmesa);
+
+	LOCK_HARDWARE( rmesa );
+
+	if (!dPriv->numClipRects) {
+		UNLOCK_HARDWARE(rmesa);
+		usleep(10000);	/* throttle invisible client 10ms */
+		return 0;
+	}
+
+	radeonWaitForFrameCompletion(rmesa);
+
+	/* The vblank wait is done with the hardware lock released. */
+	UNLOCK_HARDWARE(rmesa);
+	driWaitForVBlank(dPriv, missed_target);
+
+	return 0;
+}
+
+/**
+ * Request a page flip from the kernel (DRM_RADEON_FLIP) and, when page
+ * flipping is active for the drawable's framebuffer, re-point its
+ * renderbuffer attachments and draw-buffer state to the new page.
+ *
+ * \return GL_TRUE if the flip command succeeded and rfb->pf_active is set,
+ *         GL_FALSE otherwise.
+ */
+static GLboolean radeonPageFlip( __DRIdrawablePrivate *dPriv )
+{
+	radeonContextPtr radeon;
+	GLint ret;
+	struct radeon_framebuffer *rfb;
+	drm_clip_rect_t *box;
+	drm_clip_rect_t *b;
+
+	assert(dPriv);
+	assert(dPriv->driContextPriv);
+	assert(dPriv->driContextPriv->driverPrivate);
+
+	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+	rfb = dPriv->driverPrivate;
+
+	LOCK_HARDWARE(radeon);
+
+	if ( RADEON_DEBUG & RADEON_IOCTL ) {
+		fprintf(stderr, "%s: pfCurrentPage: %d %d\n", __FUNCTION__,
+			radeon->sarea->pfCurrentPage, radeon->sarea->pfState);
+	}
+
+	/* Only the drawable's first cliprect is passed along with the flip.
+	 * NOTE(review): box[0] is read without checking dPriv->numClipRects.
+	 */
+	box = dPriv->pClipRects;
+	b = radeon->sarea->boxes;
+	b[0] = box[0];
+	radeon->sarea->nbox = 1;
+
+	ret = drmCommandNone( radeon->dri.fd, DRM_RADEON_FLIP );
+
+	UNLOCK_HARDWARE(radeon);
+
+	if ( ret ) {
+		fprintf( stderr, "DRM_RADEON_FLIP: return = %d\n", ret );
+		return GL_FALSE;
+	}
+
+	if (!rfb->pf_active)
+		return GL_FALSE;
+
+	/* Follow the page the kernel reports as current. */
+	rfb->pf_current_page = radeon->sarea->pfCurrentPage;
+	radeon_flip_renderbuffers(rfb);
+	radeon_draw_buffer(radeon->glCtx, &rfb->base);
+
+	return GL_TRUE;
+}
+
+
+/**
+ * Swap front and back buffer.
+ *
+ * For a double-buffered visual this flushes pending rendering, throttles
+ * through radeonScheduleSwap(), then either page-flips (rfb->pf_active) or
+ * copies back to front, updates the swap statistics in the framebuffer
+ * and marks all hardware state dirty.  Single-buffered visuals are a
+ * no-op.  A drawable without a bound context is reported through
+ * _mesa_problem().
+ */
+void radeonSwapBuffers(__DRIdrawablePrivate * dPriv)
+{
+	int64_t ust;
+	__DRIscreenPrivate *psp;
+
+	if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
+		radeonContextPtr radeon;
+		GLcontext *ctx;
+
+		radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+		ctx = radeon->glCtx;
+
+		if (ctx->Visual.doubleBufferMode) {
+			/* radeonScheduleSwap() returns early, without writing
+			 * this flag, when the drawable has no cliprects; start
+			 * from "not missed" so the check below never reads an
+			 * indeterminate value.
+			 */
+			GLboolean missed_target = GL_FALSE;
+			struct radeon_framebuffer *rfb = dPriv->driverPrivate;
+			_mesa_notifySwapBuffers(ctx);/* flush pending rendering commands */
+
+			radeonScheduleSwap(dPriv, &missed_target);
+
+			if (rfb->pf_active) {
+				radeonPageFlip(dPriv);
+			} else {
+				radeonCopyBuffer(dPriv, NULL);
+			}
+
+			psp = dPriv->driScreenPriv;
+
+			/* Swap statistics: count, timestamp and missed targets. */
+			rfb->swap_count++;
+			(*psp->systemTime->getUST)( & ust );
+			if ( missed_target ) {
+				rfb->swap_missed_count++;
+				rfb->swap_missed_ust = ust - rfb->swap_ust;
+			}
+
+			rfb->swap_ust = ust;
+			radeon->hw.all_dirty = GL_TRUE;
+		}
+	} else {
+		/* XXX this shouldn't be an error but we can't handle it for now */
+		_mesa_problem(NULL, "%s: drawable has no context!",
+			      __FUNCTION__);
+	}
+}
+
+/**
+ * Copy a sub-rectangle of the back buffer to the front buffer.
+ *
+ * (x, y, w, h) is converted to a rectangle offset by the drawable position,
+ * with Y flipped against the drawable height, and passed to
+ * radeonCopyBuffer().  Single-buffered visuals are a no-op; a drawable
+ * without a bound context is reported through _mesa_problem().
+ */
+void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
+			 int x, int y, int w, int h )
+{
+	radeonContextPtr radeon;
+	GLcontext *ctx;
+	drm_clip_rect_t rect;
+
+	if (!dPriv->driContextPriv || !dPriv->driContextPriv->driverPrivate) {
+		/* XXX this shouldn't be an error but we can't handle it for now */
+		_mesa_problem(NULL, "%s: drawable has no context!",
+			      __FUNCTION__);
+		return;
+	}
+
+	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+	ctx = radeon->glCtx;
+
+	if (!ctx->Visual.doubleBufferMode)
+		return;
+
+	rect.x1 = x + dPriv->x;
+	rect.y1 = (dPriv->h - y - h) + dPriv->y;
+	rect.x2 = rect.x1 + w;
+	rect.y2 = rect.y1 + h;
+
+	/* Flush pending rendering commands before copying. */
+	_mesa_notifySwapBuffers(ctx);
+	radeonCopyBuffer(dPriv, &rect);
+}
+
+/**
+ * Re-derive the driver's draw-buffer state from framebuffer fb.
+ *
+ * Picks the color, depth and stencil renderbuffers to render to, toggles
+ * the matching RADEON_FALLBACK_* software fallbacks, stores the chosen
+ * renderbuffers in radeon->state, and re-triggers the state that depends
+ * on the drawable (front face, depth/stencil enables, viewport, scissor,
+ * depth range).
+ *
+ * Returns early when fb is NULL, when fb does not have exactly one color
+ * draw buffer (after raising the draw-buffer fallback), or when fb is not
+ * framebuffer-complete.
+ */
+void radeon_draw_buffer(GLcontext *ctx, struct gl_framebuffer *fb)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	struct radeon_renderbuffer *rrbDepth = NULL, *rrbStencil = NULL,
+		*rrbColor = NULL;
+	uint32_t offset = 0;
+
+
+	if (!fb) {
+		/* this can happen during the initial context initialization */
+		return;
+	}
+
+	/* radeons only handle 1 color draw so far */
+	if (fb->_NumColorDrawBuffers != 1) {
+		radeon->vtbl.fallback(ctx, RADEON_FALLBACK_DRAW_BUFFER, GL_TRUE);
+		return;
+	}
+
+	/* Do this here, not core Mesa, since this function is called from
+	 * many places within the driver.
+	 */
+	if (ctx->NewState & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
+		/* this updates the DrawBuffer->_NumColorDrawBuffers fields, etc */
+		_mesa_update_framebuffer(ctx);
+		/* this updates the DrawBuffer's Width/Height if it's a FBO */
+		_mesa_update_draw_buffer_bounds(ctx);
+	}
+
+	if (fb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
+		/* this may occur when we're called by glBindFrameBuffer() during
+		 * the process of someone setting up renderbuffers, etc.
+		 */
+		/*_mesa_debug(ctx, "DrawBuffer: incomplete user FBO\n");*/
+		return;
+	}
+
+	/* TODO: do something depthy/stencily for user FBOs (fb->Name != 0) */
+
+	if (fb->Name == 0) {
+		/* Window-system framebuffer: front or back color buffer. */
+		if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
+			rrbColor = radeon_renderbuffer(fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer);
+			radeon->front_cliprects = GL_TRUE;
+			radeon->front_buffer_dirty = GL_TRUE;
+		} else {
+			rrbColor = radeon_renderbuffer(fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer);
+			radeon->front_cliprects = GL_FALSE;
+		}
+	} else {
+		/* user FBO in theory */
+		struct radeon_renderbuffer *rrb;
+		rrb = radeon_renderbuffer(fb->_ColorDrawBuffers[0]);
+		if (rrb) {
+			offset = rrb->draw_offset;
+			rrbColor = rrb;
+		}
+		/* NOTE(review): constant_cliprect is set here but never
+		 * cleared in the window-system branch above. */
+		radeon->constant_cliprect = GL_TRUE;
+	}
+
+	/* No usable color renderbuffer means software fallback. */
+	if (rrbColor == NULL)
+		radeon->vtbl.fallback(ctx, RADEON_FALLBACK_DRAW_BUFFER, GL_TRUE);
+	else
+		radeon->vtbl.fallback(ctx, RADEON_FALLBACK_DRAW_BUFFER, GL_FALSE);
+
+
+	if (fb->_DepthBuffer && fb->_DepthBuffer->Wrapped) {
+		rrbDepth = radeon_renderbuffer(fb->_DepthBuffer->Wrapped);
+		if (rrbDepth && rrbDepth->bo) {
+			radeon->vtbl.fallback(ctx, RADEON_FALLBACK_DEPTH_BUFFER, GL_FALSE);
+		} else {
+			radeon->vtbl.fallback(ctx, RADEON_FALLBACK_DEPTH_BUFFER, GL_TRUE);
+		}
+	} else {
+		radeon->vtbl.fallback(ctx, RADEON_FALLBACK_DEPTH_BUFFER, GL_FALSE);
+		rrbDepth = NULL;
+	}
+
+	if (fb->_StencilBuffer && fb->_StencilBuffer->Wrapped) {
+		rrbStencil = radeon_renderbuffer(fb->_StencilBuffer->Wrapped);
+		if (rrbStencil && rrbStencil->bo) {
+			radeon->vtbl.fallback(ctx, RADEON_FALLBACK_STENCIL_BUFFER, GL_FALSE);
+			/* need to re-compute stencil hw state */
+			if (!rrbDepth)
+				rrbDepth = rrbStencil;
+		} else {
+			radeon->vtbl.fallback(ctx, RADEON_FALLBACK_STENCIL_BUFFER, GL_TRUE);
+		}
+	} else {
+		radeon->vtbl.fallback(ctx, RADEON_FALLBACK_STENCIL_BUFFER, GL_FALSE);
+		if (ctx->Driver.Enable != NULL)
+			ctx->Driver.Enable(ctx, GL_STENCIL_TEST, ctx->Stencil.Enabled);
+		else
+			ctx->NewState |= _NEW_STENCIL;
+	}
+
+	/* Update culling direction which changes depending on the
+	 * orientation of the buffer:
+	 */
+	if (ctx->Driver.FrontFace)
+		ctx->Driver.FrontFace(ctx, ctx->Polygon.FrontFace);
+	else
+		ctx->NewState |= _NEW_POLYGON;
+
+	/*
+	 * Update depth test state
+	 */
+	if (ctx->Driver.Enable) {
+		ctx->Driver.Enable(ctx, GL_DEPTH_TEST,
+				   (ctx->Depth.Test && fb->Visual.depthBits > 0));
+		/* Need to update the derived ctx->Stencil._Enabled first */
+		ctx->Driver.Enable(ctx, GL_STENCIL_TEST,
+				   (ctx->Stencil.Enabled && fb->Visual.stencilBits > 0));
+	} else {
+		ctx->NewState |= (_NEW_DEPTH | _NEW_STENCIL);
+	}
+
+	/* Either pointer may legitimately be NULL at this point; pass NULL
+	 * explicitly instead of forming &ptr->base from a NULL pointer.
+	 */
+	_mesa_reference_renderbuffer(&radeon->state.depth.rb,
+				     rrbDepth ? &rrbDepth->base : NULL);
+	_mesa_reference_renderbuffer(&radeon->state.color.rb,
+				     rrbColor ? &rrbColor->base : NULL);
+	radeon->state.color.draw_offset = offset;
+
+#if 0
+	/* update viewport since it depends on window size */
+	if (ctx->Driver.Viewport) {
+		ctx->Driver.Viewport(ctx, ctx->Viewport.X, ctx->Viewport.Y,
+				     ctx->Viewport.Width, ctx->Viewport.Height);
+	} else {
+
+	}
+#endif
+	ctx->NewState |= _NEW_VIEWPORT;
+
+	/* Set state we know depends on drawable parameters:
+	 */
+	radeonUpdateScissor(ctx);
+	radeon->NewGLState |= _NEW_SCISSOR;
+
+	if (ctx->Driver.DepthRange)
+		ctx->Driver.DepthRange(ctx,
+				       ctx->Viewport.Near,
+				       ctx->Viewport.Far);
+
+	/* Update culling direction which changes depending on the
+	 * orientation of the buffer:
+	 */
+	if (ctx->Driver.FrontFace)
+		ctx->Driver.FrontFace(ctx, ctx->Polygon.FrontFace);
+	else
+		ctx->NewState |= _NEW_POLYGON;
+}
+
+/**
+ * Driver hook for glDrawBuffer: tracks front-buffer rendering on the
+ * window-system framebuffer, then re-derives the draw-buffer state.
+ */
+void radeonDrawBuffer( GLcontext *ctx, GLenum mode )
+{
+	if (RADEON_DEBUG & RADEON_DRI)
+		fprintf(stderr, "%s %s\n", __FUNCTION__,
+			_mesa_lookup_enum_by_nr( mode ));
+
+	/* Name == 0 is the window-system framebuffer, as opposed to a user FBO. */
+	if (ctx->DrawBuffer->Name == 0) {
+		radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+		const GLboolean rendered_to_front = rmesa->is_front_buffer_rendering;
+		const GLboolean renders_to_front =
+			(mode == GL_FRONT_LEFT) || (mode == GL_FRONT);
+
+		rmesa->is_front_buffer_rendering = renders_to_front;
+
+		/* On a switch to front-buffer rendering, refresh the drawable's
+		 * renderbuffers so the front buffer has actually been allocated.
+		 */
+		if (!rendered_to_front && renders_to_front)
+			radeon_update_renderbuffers(rmesa->dri.context,
+				rmesa->dri.context->driDrawablePriv);
+	}
+
+	radeon_draw_buffer(ctx, ctx->DrawBuffer);
+}
+
+void radeonReadBuffer( GLcontext *ctx, GLenum mode )
+{
+	/* Front-buffer read tracking applies only when DrawBuffer has Name == 0. */
+	if (ctx->DrawBuffer != NULL && ctx->DrawBuffer->Name == 0) {
+		struct radeon_context *const radeon = RADEON_CONTEXT(ctx);
+		const GLboolean read_front_before = radeon->is_front_buffer_reading;
+		const GLboolean reads_front =
+			(mode == GL_FRONT_LEFT) || (mode == GL_FRONT);
+
+		radeon->is_front_buffer_reading = reads_front;
+		if (!read_front_before && reads_front)
+			radeon_update_renderbuffers(radeon->dri.context,
+						    radeon->dri.context->driReadablePriv);
+	}
+
+	/* nothing, until we implement h/w glRead/CopyPixels or CopyTexImage */
+	if (ctx->ReadBuffer != ctx->DrawBuffer)
+		return;
+
+	/* Re-deriving the draw-buffer state refreshes FBO completeness: a
+	 * GL_READ_BUFFER naming a missing renderbuffer leaves it incomplete. */
+	radeon_draw_buffer(ctx, ctx->DrawBuffer);
+}
+
+
+/* Copy the page-flip state from the SAREA into the drawable's
+ * radeon_framebuffer, flip its renderbuffers and redo the draw buffer: */
+void radeonUpdatePageFlipping(radeonContextPtr radeon)
+{
+	struct radeon_framebuffer *fb_priv = radeon_get_drawable(radeon)->driverPrivate;
+
+	fb_priv->pf_num_pages = 2;
+	fb_priv->pf_active = radeon->sarea->pfState;
+	fb_priv->pf_current_page = radeon->sarea->pfCurrentPage;
+	radeon_flip_renderbuffers(fb_priv);
+	radeon_draw_buffer(radeon->glCtx, radeon->glCtx->DrawBuffer);
+}
+
+void radeon_window_moved(radeonContextPtr radeon)
+{
+	/* Cliprects come first: they have to be current before anything else. */
+	radeonSetCliprects(radeon);
+	/* Page-flip state is only refreshed when DRI2 is not in use. */
+	if (radeon->radeonScreen->driScreen->dri2.enabled == 0)
+		radeonUpdatePageFlipping(radeon);
+}
+
+void radeon_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei height)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	__DRIcontext *driContext = radeon->dri.context;
+	void (*old_viewport)(GLcontext *ctx, GLint x, GLint y,
+			     GLsizei w, GLsizei h);
+	/* Only does work under DRI2; x, y, width and height are not used here. */
+	if (!driContext->driScreenPriv->dri2.enabled)
+		return;
+	/* Renderbuffers are refreshed only for the window-system framebuffer (Name == 0) and not for internal meta calls. */
+	if (!radeon->meta.internal_viewport_call && ctx->DrawBuffer->Name == 0) {
+		if (radeon->is_front_buffer_rendering) {
+			ctx->Driver.Flush(ctx);	/* flush front-buffer rendering before the buffers are updated */
+		}
+		radeon_update_renderbuffers(driContext, driContext->driDrawablePriv);
+		if (driContext->driDrawablePriv != driContext->driReadablePriv)	/* separate read drawable */
+			radeon_update_renderbuffers(driContext, driContext->driReadablePriv);
+	}
+	/* Driver.Viewport is cleared around the calls below and restored afterwards - presumably to avoid re-entering this hook (TODO confirm). */
+	old_viewport = ctx->Driver.Viewport;
+	ctx->Driver.Viewport = NULL;
+	radeon_window_moved(radeon);
+	radeon_draw_buffer(ctx, radeon->glCtx->DrawBuffer);
+	ctx->Driver.Viewport = old_viewport;
+}
+
+static void radeon_print_state_atom_prekmm(radeonContextPtr radeon, struct radeon_state_atom *state)
+{
+	int i, j, reg;
+	int dwords = (*state->check) (radeon->glCtx, state);
+	drm_r300_cmd_header_t cmd;
+	/* Pre-KMM dump: cmd words are decoded as drm_r300_cmd_header_t packet0 headers. */
+	fprintf(stderr, "  emit %s %d/%d\n", state->name, dwords, state->cmd_size);
+	/* The per-register dump below is only produced at RADEON_TRACE level. */
+	if (radeon_is_debug_enabled(RADEON_STATE, RADEON_TRACE)) {
+		if (dwords > state->cmd_size)
+			dwords = state->cmd_size;	/* clamp the walk to cmd_size */
+		/* Each outer iteration consumes one header, then up to packet0.count value dwords. */
+		for (i = 0; i < dwords;) {
+			cmd = *((drm_r300_cmd_header_t *) &state->cmd[i]);
+			reg = (cmd.packet0.reghi << 8) | cmd.packet0.reglo;	/* register address split over reghi/reglo */
+			fprintf(stderr, "      %s[%d]: cmdpacket0 (first reg=0x%04x, count=%d)\n",
+					state->name, i, reg, cmd.packet0.count);
+			++i;
+			for (j = 0; j < cmd.packet0.count && i < dwords; j++) {
+				fprintf(stderr, "      %s[%d]: 0x%04x = %08x\n",
+						state->name, i, reg, state->cmd[i]);
+				reg += 4;	/* the printed register address advances by 4 per value */
+				++i;
+			}
+		}
+	}
+}
+
+static void radeon_print_state_atom(radeonContextPtr radeon, struct radeon_state_atom *state)
+{
+	int pos, left, reg, limit;
+
+	/* Silent unless RADEON_STATE debugging is at RADEON_VERBOSE level. */
+	if (!radeon_is_debug_enabled(RADEON_STATE, RADEON_VERBOSE))
+		return;
+
+	/* Without kernel_mm, defer to the pre-KMM printer. */
+	if (!radeon->radeonScreen->kernel_mm) {
+		radeon_print_state_atom_prekmm(radeon, state);
+		return;
+	}
+
+	limit = (*state->check) (radeon->glCtx, state);
+	fprintf(stderr, "  emit %s %d/%d\n", state->name, limit, state->cmd_size);
+
+	/* The register-by-register dump needs RADEON_TRACE level. */
+	if (!radeon_is_debug_enabled(RADEON_STATE, RADEON_TRACE))
+		return;
+
+	if (limit > state->cmd_size)
+		limit = state->cmd_size;
+	pos = 0;
+	while (pos < limit) {
+		const uint32_t header = state->cmd[pos];
+		/* Low 13 bits: register offset / 4; bits 16-29: dword count - 1. */
+		reg = (header & 0x1FFF) << 2;
+		left = ((header & 0x3FFF0000) >> 16) + 1;
+		fprintf(stderr, "      %s[%d]: cmdpacket0 (first reg=0x%04x, count=%d)\n",
+				state->name, pos, reg, left);
+		for (++pos; left > 0 && pos < limit; --left, ++pos, reg += 4)
+			fprintf(stderr, "      %s[%d]: 0x%04x = %08x\n",
+					state->name, pos, reg, state->cmd[pos]);
+	}
+}
+
+/**
+ * Count total size for next state emit.
+ *
+ * Adds up what each atom's check() reports: every atom on a full emit,
+ * otherwise only the atoms flagged dirty.
+ *
+ * \param radeon  context whose hw.atomlist is walked
+ * \return        the summed size, in dwords
+ **/
+GLuint radeonCountStateEmitSize(radeonContextPtr radeon)
+{
+	struct radeon_state_atom *atom;
+	GLuint total = 0;
+	/* A full emit is due on an empty command stream or when all_dirty is set. */
+	const GLboolean full_emit =
+		!radeon->cmdbuf.cs->cdw || radeon->hw.all_dirty;
+
+	/* Neither a full emit nor anything dirty: the total stays zero. */
+	if (full_emit || radeon->hw.is_dirty) {
+		foreach(atom, &radeon->hw.atomlist) {
+			/* Partial emits only count atoms flagged dirty. */
+			if (full_emit || atom->dirty) {
+				const GLuint atom_size = atom->check(radeon->glCtx, atom);
+
+				total += atom_size;
+				/* Dump only atoms that will actually emit something. */
+				if (RADEON_CMDBUF && atom_size)
+					radeon_print_state_atom(radeon, atom);
+			}
+		}
+	}
+
+	radeon_print(RADEON_STATE, RADEON_NORMAL, "%s %u\n", __func__, total);
+	return total;
+}
+
+static INLINE void radeon_emit_atom(radeonContextPtr radeon, struct radeon_state_atom *atom)
+{
+	BATCH_LOCALS(radeon);
+	int dwords;
+	/* check() reports the atom's size in dwords; zero means there is nothing to emit. */
+	dwords = (*atom->check) (radeon->glCtx, atom);
+	if (dwords) {
+		/* Debug dump (returns at once unless verbose state debugging is enabled). */
+		radeon_print_state_atom(radeon, atom);
+		/* Prefer the atom's own emit callback; without one the cmd table is written directly. */
+		if (atom->emit) {
+			(*atom->emit)(radeon->glCtx, atom);
+		} else {
+			BEGIN_BATCH_NO_AUTOSTATE(dwords);
+			OUT_BATCH_TABLE(atom->cmd, dwords);
+			END_BATCH();
+		}
+	} else {
+		radeon_print(RADEON_STATE, RADEON_VERBOSE, "  skip state %s\n", atom->name);
+	}
+	atom->dirty = GL_FALSE;	/* cleared whether or not anything was emitted */
+
+}
+
+static INLINE void radeonEmitAtoms(radeonContextPtr radeon, GLboolean emitAll)
+{
+	struct radeon_state_atom *atom;
+	GLboolean emit_everything;
+
+	/* Optional vtbl hook that runs before any atom is written. */
+	if (radeon->vtbl.pre_emit_atoms)
+		radeon->vtbl.pre_emit_atoms(radeon);
+
+	/* Sampled once, after the hook: either the caller or all_dirty can
+	 * force every atom out; otherwise only dirty atoms are emitted. */
+	emit_everything = radeon->hw.all_dirty || emitAll;
+
+	foreach(atom, &radeon->hw.atomlist) {
+		if (emit_everything || atom->dirty)
+			radeon_emit_atom( radeon, atom );
+	}
+
+	COMMIT_BATCH();
+}
+
+static GLboolean radeon_revalidate_bos(GLcontext *ctx)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+	/* GL_FALSE only when the space check reports RADEON_CS_SPACE_FLUSH;
+	 * any other result is treated as success. */
+	if (radeon_cs_space_check(rmesa->cmdbuf.cs) == RADEON_CS_SPACE_FLUSH)
+		return GL_FALSE;
+	return GL_TRUE;
+}
+
+void radeonEmitState(radeonContextPtr radeon)
+{
+	radeon_print(RADEON_STATE, RADEON_NORMAL, "%s\n", __FUNCTION__);
+	/* Optional vtbl hook, run before deciding whether anything is emitted. */
+	if (radeon->vtbl.pre_emit_state)
+		radeon->vtbl.pre_emit_state(radeon);
+
+	/* Nothing to do: the CS already holds commands and no atom is dirty. */
+	if (radeon->cmdbuf.cs->cdw && !radeon->hw.is_dirty && !radeon->hw.all_dirty)
+		return;
+	/* Empty CS: force every atom out.  Otherwise radeonEmitAtoms() picks the dirty ones (or all, if all_dirty is set). */
+	if (!radeon->cmdbuf.cs->cdw) {
+		if (RADEON_DEBUG & RADEON_STATE)
+			fprintf(stderr, "Begin reemit state\n");
+
+		radeonEmitAtoms(radeon, GL_TRUE);
+	} else {
+
+		if (RADEON_DEBUG & RADEON_STATE)
+			fprintf(stderr, "Begin dirty state\n");
+
+		radeonEmitAtoms(radeon, GL_FALSE);
+	}
+	/* The atoms have been handed to radeonEmitAtoms(), so both dirty flags are reset. */
+	radeon->hw.is_dirty = GL_FALSE;
+	radeon->hw.all_dirty = GL_FALSE;
+}
+
+
+void radeonFlush(GLcontext *ctx)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	if (RADEON_DEBUG & RADEON_IOCTL)
+		fprintf(stderr, "%s %d\n", __FUNCTION__, radeon->cmdbuf.cs->cdw);
+
+	/* Early out: if there are no commands in the buffer &&
+	   there is no DMA flush callback &&
+	   there is no reserved DMA buffer,
+	   then there is no point flushing anything at all.
+	*/
+	if (!radeon->dma.flush && !radeon->cmdbuf.cs->cdw && is_empty_list(&radeon->dma.reserved))
+		return;
+	/* Run the pending DMA flush callback first, then emit state and submit whatever the CS holds. */
+	if (radeon->dma.flush)
+		radeon->dma.flush( ctx );
+
+	radeonEmitState(radeon);
+
+	if (radeon->cmdbuf.cs->cdw)
+		rcommonFlushCmdBuf(radeon, __FUNCTION__);
+	/* Dirty front buffer on the window-system framebuffer (Name == 0): hand it to the DRI2 loader. */
+	if ((ctx->DrawBuffer->Name == 0) && radeon->front_buffer_dirty) {
+		__DRIscreen *const screen = radeon->radeonScreen->driScreen;
+		/* flushFrontBuffer is only called when the loader is present, is version >= 2 and provides it. */
+		if (screen->dri2.loader && (screen->dri2.loader->base.version >= 2)
+			&& (screen->dri2.loader->flushFrontBuffer != NULL)) {
+			__DRIdrawablePrivate * drawable = radeon_get_drawable(radeon);
+			(*screen->dri2.loader->flushFrontBuffer)(drawable, drawable->loaderPrivate);
+
+			/* Only clear the dirty bit if front-buffer rendering is no longer
+			 * enabled.  This is done so that the dirty bit can only be set in
+			 * glDrawBuffer.  Otherwise the dirty bit would have to be set at
+			 * each of N places that do rendering.  This has worse performance,
+			 * but it is much easier to get correct.
+			 */
+			if (!radeon->is_front_buffer_rendering) {
+				radeon->front_buffer_dirty = GL_FALSE;
+			}
+		}
+	}
+	/* Empty the query.not_flushed_head list now that the flush has been issued. */
+	make_empty_list(&radeon->query.not_flushed_head);
+
+}
+
+/* Flush what is pending, then wait for the hardware: on the BOs of the
+ * draw buffers under kernel_mm, otherwise via an IRQ or an idle wait.
+ */
+void radeonFinish(GLcontext * ctx)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	struct gl_framebuffer *draw_fb = ctx->DrawBuffer;
+	struct radeon_renderbuffer *rrb;
+	int buf;
+
+	if (ctx->Driver.Flush)
+		ctx->Driver.Flush(ctx); /* +r6/r7 */
+
+	if (!rmesa->radeonScreen->kernel_mm) {
+		if (!rmesa->do_irqs) {
+			radeonWaitForIdle(rmesa);
+			return;
+		}
+		LOCK_HARDWARE(rmesa);
+		radeonEmitIrqLocked(rmesa);
+		UNLOCK_HARDWARE(rmesa);
+		radeonWaitIrq(rmesa);
+		return;
+	}
+	/* kernel_mm: wait on each color draw buffer's BO, then the depth BO. */
+	for (buf = 0; buf < draw_fb->_NumColorDrawBuffers; buf++) {
+		rrb = radeon_renderbuffer(draw_fb->_ColorDrawBuffers[buf]);
+		if (rrb && rrb->bo)
+			radeon_bo_wait(rrb->bo);
+	}
+	rrb = radeon_get_depthbuffer(rmesa);
+	if (rrb && rrb->bo)
+		radeon_bo_wait(rrb->bo);
+}
+
+/* cmdbuffer */
+/**
+ * Submit the command stream if it is non-empty, then erase it.  The "Locked" variant: rcommonFlushCmdBuf() takes the hardware lock around it.
+ */
+int rcommonFlushCmdBufLocked(radeonContextPtr rmesa, const char *caller)
+{
+	int ret = 0;	/* stays 0 when nothing had to be submitted */
+	/* cmdbuf.flushing guards against re-entering the flush; re-entry is fatal. */
+	if (rmesa->cmdbuf.flushing) {
+		fprintf(stderr, "Recursive call into %s!\n", __FUNCTION__);
+		exit(-1);
+	}
+	rmesa->cmdbuf.flushing = 1;
+
+	if (RADEON_DEBUG & RADEON_IOCTL) {
+		fprintf(stderr, "%s from %s - %i cliprects\n",
+			__FUNCTION__, caller, rmesa->numClipRects);
+	}
+
+	radeonEmitQueryEnd(rmesa->glCtx);
+	/* After a submit all_dirty is set, so the next emit writes every atom again. */
+	if (rmesa->cmdbuf.cs->cdw) {
+		ret = radeon_cs_emit(rmesa->cmdbuf.cs);
+		rmesa->hw.all_dirty = GL_TRUE;
+	}
+	radeon_cs_erase(rmesa->cmdbuf.cs);
+	rmesa->cmdbuf.flushing = 0;
+	/* A failed revalidation is only reported; it does not change the return value. */
+	if (radeon_revalidate_bos(rmesa->glCtx) == GL_FALSE) {
+		fprintf(stderr,"failed to revalidate buffers\n");
+	}
+
+	return ret;
+}
+
+int rcommonFlushCmdBuf(radeonContextPtr rmesa, const char *caller)
+{
+	int status;
+
+	radeonReleaseDmaRegions(rmesa);
+
+	LOCK_HARDWARE(rmesa);
+	status = rcommonFlushCmdBufLocked(rmesa, caller);
+	UNLOCK_HARDWARE(rmesa);
+	if (status == 0)
+		return 0;
+
+	/* A failed submit is fatal: report it and exit with the error code. */
+	fprintf(stderr, "drmRadeonCmdBuffer: %d. Kernel failed to "
+			"parse or rejected command stream. See dmesg "
+			"for more info.\n", status);
+	_mesa_exit(status);
+	return status;
+}
+
+/**
+ * Flush the command buffer when it cannot hold \p dwords more dwords plus
+ * 128 dwords of slack, or when radeon_cs_need_flush() asks for a flush.
+ * \param dwords The number of dwords we need to be free on the command buffer
+ * \return GL_TRUE when a flush was performed, GL_FALSE otherwise
+ */
+GLboolean rcommonEnsureCmdBufSpace(radeonContextPtr rmesa, int dwords, const char *caller)
+{
+   const GLboolean out_of_room = (rmesa->cmdbuf.cs->cdw + dwords + 128) > rmesa->cmdbuf.size;
+
+   if (!out_of_room && !radeon_cs_need_flush(rmesa->cmdbuf.cs))
+      return GL_FALSE;
+   /* If we try to flush empty buffer there is too big rendering operation. */
+   assert(rmesa->cmdbuf.cs->cdw);
+   rcommonFlushCmdBuf(rmesa, caller);
+   return GL_TRUE;
+}
+
+/**
+ * Size the command buffer, create the CS manager (GEM under kernel_mm,
+ * legacy otherwise) and the command stream, then set the CS memory limits.
+ */
+void rcommonInitCmdBuf(radeonContextPtr rmesa)
+{
+	GLuint size;
+	size = 256 * driQueryOptioni(&rmesa->optionCache,
+				     "command_buffer_size");
+	if (size < 2 * rmesa->hw.max_state_size)
+		size = 2 * rmesa->hw.max_state_size + 65535;
+	if (size > 64 * 256)
+		size = 64 * 256;
+
+	/* sizeof yields size_t, which needs %zu; size is unsigned, hence %u. */
+	radeon_print(RADEON_CS, RADEON_VERBOSE,
+			"sizeof(drm_r300_cmd_header_t)=%zu\n", sizeof(drm_r300_cmd_header_t));
+	radeon_print(RADEON_CS, RADEON_VERBOSE,
+			"sizeof(drm_radeon_cmd_buffer_t)=%zu\n", sizeof(drm_radeon_cmd_buffer_t));
+	radeon_print(RADEON_CS, RADEON_VERBOSE,
+			"Allocating %u bytes command buffer (max state is %d bytes)\n",
+			size * 4, rmesa->hw.max_state_size * 4);
+
+	if (rmesa->radeonScreen->kernel_mm) {
+		int fd = rmesa->radeonScreen->driScreen->fd;
+		rmesa->cmdbuf.csm = radeon_cs_manager_gem_ctor(fd);
+	} else {
+		rmesa->cmdbuf.csm = radeon_cs_manager_legacy_ctor(rmesa);
+	}
+	if (rmesa->cmdbuf.csm == NULL) {
+		/* FIXME: fatal error */
+		return;
+	}
+	rmesa->cmdbuf.cs = radeon_cs_create(rmesa->cmdbuf.csm, size);
+	assert(rmesa->cmdbuf.cs != NULL);
+	rmesa->cmdbuf.size = size;
+
+	radeon_cs_space_set_flush(rmesa->cmdbuf.cs,
+				  (void (*)(void *))rmesa->glCtx->Driver.Flush, rmesa->glCtx);
+
+	if (!rmesa->radeonScreen->kernel_mm) {
+		radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_VRAM, rmesa->radeonScreen->texSize[0]);
+		radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_GTT, rmesa->radeonScreen->gartTextures.size);
+	} else {
+		struct drm_radeon_gem_info mminfo = { 0 };
+		if (!drmCommandWriteRead(rmesa->dri.fd, DRM_RADEON_GEM_INFO, &mminfo, sizeof(mminfo))) {
+			radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_VRAM, mminfo.vram_visible);
+			radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_GTT, mminfo.gart_size);
+		}
+	}
+}
+/**
+ * Destroy the command stream, then its manager.  The manager destructor is
+ * picked by kernel_mm, the same flag rcommonInitCmdBuf() used to create it.
+ */
+void rcommonDestroyCmdBuf(radeonContextPtr rmesa)
+{
+	radeon_cs_destroy(rmesa->cmdbuf.cs);
+	if (rmesa->radeonScreen->kernel_mm)
+		radeon_cs_manager_gem_dtor(rmesa->cmdbuf.csm);
+	else
+		radeon_cs_manager_legacy_dtor(rmesa->cmdbuf.csm);
+}
+
+void rcommonBeginBatch(radeonContextPtr rmesa, int n,
+		       int dostate,
+		       const char *file,
+		       const char *function,
+		       int line)
+{
+	/* An empty CS gets the state re-emitted first, if the caller allows it. */
+	if (dostate && !rmesa->cmdbuf.cs->cdw) {
+		radeon_print(RADEON_STATE, RADEON_NORMAL,
+				"Reemit state after flush (from %s)\n", function);
+		radeonEmitState(rmesa);
+	}
+
+	radeon_cs_begin(rmesa->cmdbuf.cs, n, file, function, line);
+	radeon_print(RADEON_CS, RADEON_VERBOSE, "BEGIN_BATCH(%d) at %d, from %s:%i\n",
+			n, rmesa->cmdbuf.cs->cdw, function, line);
+}
+
+void radeonUserClear(GLcontext *ctx, GLuint mask)
+{
+   _mesa_meta_clear(ctx, mask);	/* the clear is delegated entirely to _mesa_meta_clear() */
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.h b/src/mesa/drivers/dri/radeon/radeon_common.h
new file mode 100644
index 0000000000..f3201911ac
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_common.h
@@ -0,0 +1,87 @@
+#ifndef COMMON_MISC_H
+#define COMMON_MISC_H
+
+#include "radeon_common_context.h"
+#include "radeon_dma.h"
+#include "radeon_texture.h"
+
+void radeonUserClear(GLcontext *ctx, GLuint mask);
+void radeonRecalcScissorRects(radeonContextPtr radeon);
+void radeonSetCliprects(radeonContextPtr radeon);
+void radeonUpdateScissor( GLcontext *ctx );
+void radeonScissor(GLcontext* ctx, GLint x, GLint y, GLsizei w, GLsizei h);
+void radeonPolygonStipplePreKMS( GLcontext *ctx, const GLubyte *mask );
+
+void radeonWaitForIdleLocked(radeonContextPtr radeon);
+extern uint32_t radeonGetAge(radeonContextPtr radeon);
+void radeonCopyBuffer( __DRIdrawablePrivate *dPriv,
+		       const drm_clip_rect_t	  *rect);
+void radeonSwapBuffers(__DRIdrawablePrivate * dPriv);
+void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
+			 int x, int y, int w, int h );
+
+void radeonUpdatePageFlipping(radeonContextPtr rmesa);
+
+void radeonFlush(GLcontext *ctx);
+void radeonFinish(GLcontext * ctx);
+void radeonEmitState(radeonContextPtr radeon);
+GLuint radeonCountStateEmitSize(radeonContextPtr radeon);
+
+void radeon_clear_tris(GLcontext *ctx, GLbitfield mask);
+
+void radeon_window_moved(radeonContextPtr radeon);
+void radeon_draw_buffer(GLcontext *ctx, struct gl_framebuffer *fb);
+void radeonDrawBuffer( GLcontext *ctx, GLenum mode );
+void radeonReadBuffer( GLcontext *ctx, GLenum mode );
+void radeon_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei height);
+void radeon_get_cliprects(radeonContextPtr radeon,
+			  struct drm_clip_rect **cliprects,
+			  unsigned int *num_cliprects,
+			  int *x_off, int *y_off);
+void radeon_fbo_init(struct radeon_context *radeon);
+void
+radeon_renderbuffer_set_bo(struct radeon_renderbuffer *rb,
+			   struct radeon_bo *bo);
+struct radeon_renderbuffer *
+radeon_create_renderbuffer(GLenum format, __DRIdrawablePrivate *driDrawPriv);
+static inline struct radeon_renderbuffer *radeon_renderbuffer(struct gl_renderbuffer *rb)
+{
+	/* Checked downcast: NULL, or a ClassID other than RADEON_RB_CLASS, gives NULL. */
+	struct radeon_renderbuffer *candidate = (struct radeon_renderbuffer *)rb;
+	if (candidate == NULL || candidate->base.ClassID != RADEON_RB_CLASS)
+		return NULL;
+	return candidate;
+}
+
+static inline struct radeon_renderbuffer *radeon_get_renderbuffer(struct gl_framebuffer *fb, int att_index)
+{
+	/* Negative attachment indices yield NULL instead of an array access. */
+	if (att_index < 0)
+		return NULL;
+	return radeon_renderbuffer(fb->Attachment[att_index].Renderbuffer);
+}
+
+static inline struct radeon_renderbuffer *radeon_get_depthbuffer(radeonContextPtr rmesa)
+{
+	/*
+	 * Depth renderbuffer referenced by the context state.  NULL when none
+	 * is set or it is not a radeon renderbuffer: radeon_renderbuffer()
+	 * already returns NULL for both, so no extra check is needed.
+	 */
+	return radeon_renderbuffer(rmesa->state.depth.rb);
+}
+
+static inline struct radeon_renderbuffer *radeon_get_colorbuffer(radeonContextPtr rmesa)
+{
+	/*
+	 * Color renderbuffer referenced by the context state.  NULL when none
+	 * is set or it is not a radeon renderbuffer: radeon_renderbuffer()
+	 * already returns NULL for both, so no extra check is needed.
+	 */
+	return radeon_renderbuffer(rmesa->state.color.rb);
+}
+
+#include "radeon_cmdbuf.h"
+
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c
new file mode 100644
index 0000000000..71ee06d9a7
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c
@@ -0,0 +1,805 @@
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+#include "radeon_common.h"
+#include "xmlpool.h"		/* for symbolic values of enum-type options */
+#include "utils.h"
+#include "vblank.h"
+#include "drirenderbuffer.h"
+#include "drivers/common/meta.h"
+#include "main/context.h"
+#include "main/framebuffer.h"
+#include "main/renderbuffer.h"
+#include "main/state.h"
+#include "main/simple_list.h"
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "tnl/tnl.h"
+
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600) /* +r6/r7 */
+#include "r600_context.h"
+#endif
+
+#define DRIVER_DATE "20090101"
+
+#ifndef RADEON_DEBUG
+int RADEON_DEBUG = (0);
+#endif
+
+
+static const char* get_chip_family_name(int chip_family)
+{
+	switch(chip_family) {	/* maps each listed CHIP_FAMILY_* value to a constant name string */
+	case CHIP_FAMILY_R100: return "R100";
+	case CHIP_FAMILY_RV100: return "RV100";
+	case CHIP_FAMILY_RS100: return "RS100";
+	case CHIP_FAMILY_RV200: return "RV200";
+	case CHIP_FAMILY_RS200: return "RS200";
+	case CHIP_FAMILY_R200: return "R200";
+	case CHIP_FAMILY_RV250: return "RV250";
+	case CHIP_FAMILY_RS300: return "RS300";
+	case CHIP_FAMILY_RV280: return "RV280";
+	case CHIP_FAMILY_R300: return "R300";
+	case CHIP_FAMILY_R350: return "R350";
+	case CHIP_FAMILY_RV350: return "RV350";
+	case CHIP_FAMILY_RV380: return "RV380";
+	case CHIP_FAMILY_R420: return "R420";
+	case CHIP_FAMILY_RV410: return "RV410";
+	case CHIP_FAMILY_RS400: return "RS400";
+	case CHIP_FAMILY_RS600: return "RS600";
+	case CHIP_FAMILY_RS690: return "RS690";
+	case CHIP_FAMILY_RS740: return "RS740";
+	case CHIP_FAMILY_RV515: return "RV515";
+	case CHIP_FAMILY_R520: return "R520";
+	case CHIP_FAMILY_RV530: return "RV530";
+	case CHIP_FAMILY_R580: return "R580";
+	case CHIP_FAMILY_RV560: return "RV560";
+	case CHIP_FAMILY_RV570: return "RV570";
+	case CHIP_FAMILY_R600: return "R600";
+	case CHIP_FAMILY_RV610: return "RV610";
+	case CHIP_FAMILY_RV630: return "RV630";
+	case CHIP_FAMILY_RV670: return "RV670";
+	case CHIP_FAMILY_RV620: return "RV620";
+	case CHIP_FAMILY_RV635: return "RV635";
+	case CHIP_FAMILY_RS780: return "RS780";
+	case CHIP_FAMILY_RS880: return "RS880";
+	case CHIP_FAMILY_RV770: return "RV770";
+	case CHIP_FAMILY_RV730: return "RV730";
+	case CHIP_FAMILY_RV710: return "RV710";
+	case CHIP_FAMILY_RV740: return "RV740";
+	default: return "unknown";	/* any family value not listed above */
+	}
+}
+
+
+/* Return various strings for glGetString(): GL_VENDOR and GL_RENDERER are
+ * answered, anything else yields NULL.  The renderer string lives in a
+ * static buffer and is assembled with bounded snprintf() writes.
+ */
+static const GLubyte *radeonGetString(GLcontext * ctx, GLenum name)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	static char buffer[128];
+
+	switch (name) {
+	case GL_VENDOR:
+		if (IS_R600_CLASS(radeon->radeonScreen))
+			return (GLubyte *) "Advanced Micro Devices, Inc.";
+		else if (IS_R300_CLASS(radeon->radeonScreen))
+			return (GLubyte *) "DRI R300 Project";
+		else
+			return (GLubyte *) "Tungsten Graphics, Inc.";
+
+	case GL_RENDERER:
+	{
+		unsigned offset;
+		GLuint agp_mode = (radeon->radeonScreen->card_type==RADEON_CARD_PCI) ? 0 :
+			radeon->radeonScreen->AGPMode;
+		const char* chipclass;
+		const char* tcl_prefix;
+		char hardwarename[32];
+
+		if (IS_R600_CLASS(radeon->radeonScreen))
+			chipclass = "R600";
+		else if (IS_R300_CLASS(radeon->radeonScreen))
+			chipclass = "R300";
+		else if (IS_R200_CLASS(radeon->radeonScreen))
+			chipclass = "R200";
+		else
+			chipclass = "R100";
+
+		snprintf(hardwarename, sizeof(hardwarename), "%s (%s %04X)",
+			 chipclass,
+			 get_chip_family_name(radeon->radeonScreen->chip_family),
+			 radeon->radeonScreen->device_id);
+
+		offset = driGetRendererString(buffer, hardwarename, DRIVER_DATE,
+					      agp_mode);
+
+		if (IS_R600_CLASS(radeon->radeonScreen))
+			tcl_prefix = "";
+		else if (IS_R300_CLASS(radeon->radeonScreen))
+			tcl_prefix = (radeon->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) ? "" : "NO-";
+		else
+			tcl_prefix = !(radeon->TclFallback & RADEON_TCL_FALLBACK_TCL_DISABLE) ? "" : "NO-";
+
+		/* Append " [NO-]TCL" and, under DRI2, " DRI2" without writing past the end of buffer. */
+		if (offset < sizeof(buffer))
+			snprintf(&buffer[offset], sizeof(buffer) - offset, " %sTCL%s", tcl_prefix,
+				 radeon->radeonScreen->driScreen->dri2.enabled ? " DRI2" : "");
+
+		return (GLubyte *) buffer;
+	}
+
+	default:
+		return NULL;
+	}
+}
+
+/* Install the common driver hooks; at present only GetString (radeonGetString) is set.
+ */
+static void radeonInitDriverFuncs(struct dd_function_table *functions)
+{
+	functions->GetString = radeonGetString;
+}
+
+/**
+ * Create the Mesa context and initialize the common radeon context fields.
+ * Returns GL_FALSE if _mesa_create_context() fails, GL_TRUE otherwise.
+ */
+GLboolean radeonInitContext(radeonContextPtr radeon,
+			    struct dd_function_table* functions,
+			    const __GLcontextModes * glVisual,
+			    __DRIcontextPrivate * driContextPriv,
+			    void *sharedContextPrivate)
+{
+	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
+	GLcontext* ctx;
+	GLcontext* shareCtx;
+	int fthrottle_mode;
+
+	/* Fill in additional standard functions. */
+	radeonInitDriverFuncs(functions);
+
+	radeon->radeonScreen = screen;
+	/* Allocate and initialize the Mesa context, sharing with sharedContextPrivate's context when one is given */
+	if (sharedContextPrivate)
+		shareCtx = ((radeonContextPtr)sharedContextPrivate)->glCtx;
+	else
+		shareCtx = NULL;
+	radeon->glCtx = _mesa_create_context(glVisual, shareCtx,
+					    functions, (void *)radeon);
+	if (!radeon->glCtx)
+		return GL_FALSE;
+
+	ctx = radeon->glCtx;
+	driContextPriv->driverPrivate = radeon;
+	/* Meta-op state: the context's own metaops (radeon->meta) and _mesa_meta. */
+	meta_init_metaops(ctx, &radeon->meta);
+
+	_mesa_meta_init(ctx);
+
+	/* DRI fields */
+	radeon->dri.context = driContextPriv;
+	radeon->dri.screen = sPriv;
+	radeon->dri.hwContext = driContextPriv->hHWContext;
+	radeon->dri.hwLock = &sPriv->pSAREA->lock;
+	radeon->dri.hwLockCount = 0;
+	radeon->dri.fd = sPriv->fd;
+	radeon->dri.drmMinor = sPriv->drm_version.minor;
+	/* The driver's SAREA section sits sarea_priv_offset bytes into the shared SAREA. */
+	radeon->sarea = (drm_radeon_sarea_t *) ((GLubyte *) sPriv->pSAREA +
+					       screen->sarea_priv_offset);
+
+	/* Setup IRQs.  NOTE(review): optionCache is only read in this function; presumably the caller parsed it beforehand - confirm. */
+	fthrottle_mode = driQueryOptioni(&radeon->optionCache, "fthrottle_mode");
+	radeon->iw.irq_seq = -1;
+	radeon->irqsEmitted = 0;
+	if (IS_R600_CLASS(radeon->radeonScreen))
+		radeon->do_irqs = 0;	/* R600 class never uses IRQs here */
+	else
+		radeon->do_irqs = (fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS &&
+				   radeon->radeonScreen->irq);
+
+	radeon->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
+	/* Note: this message is printed unconditionally whenever IRQs end up disabled. */
+	if (!radeon->do_irqs)
+		fprintf(stderr,
+			"IRQ's not enabled, falling back to %s: %d %d\n",
+			radeon->do_usleeps ? "usleeps" : "busy waits",
+			fthrottle_mode, radeon->radeonScreen->irq);
+	/* DRI_CONF_TEXTURE_DEPTH_FB follows the visual: 32-bit above 16 rgbBits, else 16-bit. */
+        radeon->texture_depth = driQueryOptioni (&radeon->optionCache,
+					        "texture_depth");
+        if (radeon->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
+                radeon->texture_depth = ( glVisual->rgbBits > 16 ) ?
+	        DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
+	/* Texture row alignment values differ per chip class. */
+	if (IS_R600_CLASS(radeon->radeonScreen)) {
+		radeon->texture_row_align = 256;
+		radeon->texture_rect_row_align = 256;
+		radeon->texture_compressed_row_align = 256;
+	} else if (IS_R200_CLASS(radeon->radeonScreen) ||
+		   IS_R100_CLASS(radeon->radeonScreen)) {
+		radeon->texture_row_align = 32;
+		radeon->texture_rect_row_align = 64;
+		radeon->texture_compressed_row_align = 32;
+	} else { /* R300 - not sure this is all correct */
+		int chip_family = radeon->radeonScreen->chip_family;
+		if (chip_family == CHIP_FAMILY_RS600 ||
+		    chip_family == CHIP_FAMILY_RS690 ||
+		    chip_family == CHIP_FAMILY_RS740)
+			radeon->texture_row_align = 64;
+		else
+			radeon->texture_row_align = 32;
+		radeon->texture_rect_row_align = 64;
+		radeon->texture_compressed_row_align = 64;
+	}
+	/* Start with an empty not-flushed query list, then set up the DMA state. */
+	make_empty_list(&radeon->query.not_flushed_head);
+	radeon_init_dma(radeon);
+
+	return GL_TRUE;
+}
+
+
+
+/**
+ * Free the cmd buffer (and the lastcmd buffer, when set) of every atom on hw.atomlist.
+ */
+static void radeon_destroy_atom_list(radeonContextPtr radeon)
+{
+	struct radeon_state_atom *atom;
+
+	foreach(atom, &radeon->hw.atomlist) {
+		FREE(atom->cmd);
+		if (atom->lastcmd)
+			FREE(atom->lastcmd);
+	}
+	/* The atom structures themselves and the command buffer are not freed here. */
+}
+
+/**
+ * Tear down the common context state and free the radeon context itself.
+ * Called by r200DestroyContext/r300DestroyContext
+ */
+void radeonDestroyContext(__DRIcontextPrivate *driContextPriv )
+{
+#ifdef RADEON_BO_TRACK
+	FILE *track;
+#endif
+	GET_CURRENT_CONTEXT(ctx);
+	radeonContextPtr radeon = (radeonContextPtr) driContextPriv->driverPrivate;
+	radeonContextPtr current = ctx ? RADEON_CONTEXT(ctx) : NULL;
+
+	assert(radeon);
+
+	_mesa_meta_free(radeon->glCtx);
+	/* If this context is the current one, call radeon_firevertices() and unbind it first. */
+	if (radeon == current) {
+		radeon_firevertices(radeon);
+		_mesa_make_current(NULL, NULL, NULL);
+	}
+	/* Flush the command buffer while DMA regions are still on the reserved list. */
+	if (!is_empty_list(&radeon->dma.reserved)) {
+		rcommonFlushCmdBuf( radeon, __FUNCTION__ );
+	}
+
+	radeonFreeDmaRegions(radeon);
+	radeonReleaseArrays(radeon->glCtx, ~0);
+	meta_destroy_metaops(&radeon->meta);
+	if (radeon->vtbl.free_context)
+		radeon->vtbl.free_context(radeon->glCtx);
+	_swsetup_DestroyContext( radeon->glCtx );
+	_tnl_DestroyContext( radeon->glCtx );
+	_vbo_DestroyContext( radeon->glCtx );
+	_swrast_DestroyContext( radeon->glCtx );
+
+	/* NOTE: the atom list is freed further down, by radeon_destroy_atom_list() */
+	/* free the Mesa context */
+	_mesa_destroy_context(radeon->glCtx);
+
+	/* _mesa_destroy_context() might result in calls to functions that
+	 * depend on the DriverCtx, so don't set it to NULL before.
+	 *
+	 * radeon->glCtx->DriverCtx = NULL;
+	 */
+	/* free the option cache */
+	driDestroyOptionCache(&radeon->optionCache);
+
+	rcommonDestroyCmdBuf(radeon);
+
+	radeon_destroy_atom_list(radeon);
+
+	if (radeon->state.scissor.pClipRects) {
+		FREE(radeon->state.scissor.pClipRects);
+		radeon->state.scissor.pClipRects = 0;
+	}
+#ifdef RADEON_BO_TRACK
+	track = fopen("/tmp/tracklog", "w");	/* BO tracker dump, only built with RADEON_BO_TRACK */
+	if (track) {
+		radeon_tracker_print(&radeon->radeonScreen->bom->tracker, track);
+		fclose(track);
+	}
+#endif
+	FREE(radeon);
+}
+
+/* Unbind hook: only logs the GL context when RADEON_DRI debugging is on
+ * and always reports success. */
+GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv)
+{
+	radeonContextPtr rmesa = (radeonContextPtr) driContextPriv->driverPrivate;
+
+	if ((RADEON_DEBUG & RADEON_DRI) != 0) {
+		fprintf(stderr, "%s ctx %p\n", __FUNCTION__, rmesa->glCtx);
+	}
+
+	return GL_TRUE;
+}
+
+
+static void
+radeon_make_kernel_renderbuffer_current(radeonContextPtr radeon,
+					struct radeon_framebuffer *draw)
+{
+	/* if radeon->fake */
+	struct radeon_renderbuffer *rb;
+	/* Each attached buffer gets a bo if it has none; cpp/pitch come from the screen. */
+	if ((rb = (void *)draw->base.Attachment[BUFFER_FRONT_LEFT].Renderbuffer)) {
+		if (!rb->bo) {
+			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->frontOffset,
+						0,
+						0,
+						RADEON_GEM_DOMAIN_VRAM,
+						0);
+		}
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->frontPitch * rb->cpp;
+	}
+	if ((rb = (void *)draw->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer)) {
+		if (!rb->bo) {
+			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->backOffset,
+						0,
+						0,
+						RADEON_GEM_DOMAIN_VRAM,
+						0);
+		}
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->backPitch * rb->cpp;
+	}
+	if ((rb = (void *)draw->base.Attachment[BUFFER_DEPTH].Renderbuffer)) {
+		if (!rb->bo) {
+			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->depthOffset,
+						0,
+						0,
+						RADEON_GEM_DOMAIN_VRAM,
+						0);
+		}
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->depthPitch * rb->cpp;
+	}
+	if ((rb = (void *)draw->base.Attachment[BUFFER_STENCIL].Renderbuffer)) {
+		if (!rb->bo) {
+			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->depthOffset, /* same offset as depth */
+						0,
+						0,
+						RADEON_GEM_DOMAIN_VRAM,
+						0);
+		}
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->depthPitch * rb->cpp;
+	}
+}
+
+static void
+radeon_make_renderbuffer_current(radeonContextPtr radeon,
+				 struct radeon_framebuffer *draw)
+{
+	int size = 4096*4096*4; /* passed to every radeon_bo_open() below */
+	/* if radeon->fake */
+	struct radeon_renderbuffer *rb;
+	/* With kernel_mm set, the kernel variant does all the work instead. */
+	if (radeon->radeonScreen->kernel_mm) {
+		radeon_make_kernel_renderbuffer_current(radeon, draw);
+		return;
+	}
+
+	/* Non-kernel path: offsets are rebased by fbLocation before opening the bo. */
+	if ((rb = (void *)draw->base.Attachment[BUFFER_FRONT_LEFT].Renderbuffer)) {
+		if (!rb->bo) {
+			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->frontOffset +
+						radeon->radeonScreen->fbLocation,
+						size,
+						4096,
+						RADEON_GEM_DOMAIN_VRAM,
+						0);
+		}
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->frontPitch * rb->cpp;
+	}
+	if ((rb = (void *)draw->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer)) {
+		if (!rb->bo) {
+			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->backOffset +
+						radeon->radeonScreen->fbLocation,
+						size,
+						4096,
+						RADEON_GEM_DOMAIN_VRAM,
+						0);
+		}
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->backPitch * rb->cpp;
+	}
+	if ((rb = (void *)draw->base.Attachment[BUFFER_DEPTH].Renderbuffer)) {
+		if (!rb->bo) {
+			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->depthOffset +
+						radeon->radeonScreen->fbLocation,
+						size,
+						4096,
+						RADEON_GEM_DOMAIN_VRAM,
+						0);
+		}
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->depthPitch * rb->cpp;
+	}
+	if ((rb = (void *)draw->base.Attachment[BUFFER_STENCIL].Renderbuffer)) {
+		if (!rb->bo) {
+			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->depthOffset + /* same offset as depth */
+						radeon->radeonScreen->fbLocation,
+						size,
+						4096,
+						RADEON_GEM_DOMAIN_VRAM,
+						0);
+		}
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->depthPitch * rb->cpp;
+	}
+}
+
+static unsigned
+radeon_bits_per_pixel(const struct radeon_renderbuffer *rb)
+{
+   /* Bit width reported for the renderbuffer's actual format; 0 if unlisted. */
+   const GLenum fmt = rb->base._ActualFormat;
+
+   if (fmt == GL_RGB5 || fmt == GL_DEPTH_COMPONENT16)
+      return 16;
+
+   if (fmt == GL_RGB8 || fmt == GL_RGBA8 ||
+       fmt == GL_DEPTH_COMPONENT24 ||
+       fmt == GL_DEPTH24_STENCIL8_EXT ||
+       fmt == GL_STENCIL_INDEX8_EXT)
+      return 32;
+
+   return 0;
+}
+
+/** Fetch the drawable's DRI2 buffers from the loader and attach their buffer
+ * objects to the matching renderbuffers; returns early if none are provided.
+ */
+void
+radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
+{
+	unsigned int attachments[10];
+	__DRIbuffer *buffers = NULL;
+	__DRIscreen *screen;
+	struct radeon_renderbuffer *rb;
+	int i, count;
+	struct radeon_framebuffer *draw;
+	radeonContextPtr radeon;
+	char *regname;
+	struct radeon_bo *depth_bo = NULL, *bo;
+
+	if (RADEON_DEBUG & RADEON_DRI)
+	    fprintf(stderr, "enter %s, drawable %p\n", __func__, drawable);
+
+	draw = drawable->driverPrivate;
+	screen = context->driScreenPriv;
+	radeon = (radeonContextPtr) context->driverPrivate;
+
+	if (screen->dri2.loader
+	   && (screen->dri2.loader->base.version > 2)
+	   && (screen->dri2.loader->getBuffersWithFormat != NULL)) {
+		struct radeon_renderbuffer *depth_rb;
+		struct radeon_renderbuffer *stencil_rb;
+		/* Build (attachment, bits-per-pixel) pairs; i / 2 pairs are passed on. */
+		i = 0;
+		if ((radeon->is_front_buffer_rendering ||
+		     radeon->is_front_buffer_reading ||
+		     !draw->color_rb[1])
+		    && draw->color_rb[0]) {
+			attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
+			attachments[i++] = radeon_bits_per_pixel(draw->color_rb[0]);
+		}
+
+		if (draw->color_rb[1]) {
+			attachments[i++] = __DRI_BUFFER_BACK_LEFT;
+			attachments[i++] = radeon_bits_per_pixel(draw->color_rb[1]);
+		}
+
+		depth_rb = radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH);
+		stencil_rb = radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL);
+
+		if ((depth_rb != NULL) && (stencil_rb != NULL)) {
+			attachments[i++] = __DRI_BUFFER_DEPTH_STENCIL;
+			attachments[i++] = radeon_bits_per_pixel(depth_rb);
+		} else if (depth_rb != NULL) {
+			attachments[i++] = __DRI_BUFFER_DEPTH;
+			attachments[i++] = radeon_bits_per_pixel(depth_rb);
+		} else if (stencil_rb != NULL) {
+			attachments[i++] = __DRI_BUFFER_STENCIL;
+			attachments[i++] = radeon_bits_per_pixel(stencil_rb);
+		}
+
+		buffers = (*screen->dri2.loader->getBuffersWithFormat)(drawable,
+								&drawable->w,
+								&drawable->h,
+								attachments, i / 2,
+								&count,
+								drawable->loaderPrivate);
+	} else if (screen->dri2.loader) {
+		i = 0;
+		if (draw->color_rb[0])
+			attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
+		if (draw->color_rb[1])
+			attachments[i++] = __DRI_BUFFER_BACK_LEFT;
+		if (radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH))
+			attachments[i++] = __DRI_BUFFER_DEPTH;
+		if (radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL))
+			attachments[i++] = __DRI_BUFFER_STENCIL;
+
+		buffers = (*screen->dri2.loader->getBuffers)(drawable,
+								 &drawable->w,
+								 &drawable->h,
+								 attachments, i,
+								 &count,
+								 drawable->loaderPrivate);
+	}
+
+	if (buffers == NULL)
+		return;
+
+	/* set one cliprect to cover the whole drawable */
+	drawable->x = 0;
+	drawable->y = 0;
+	drawable->backX = 0;
+	drawable->backY = 0;
+	drawable->numClipRects = 1;
+	drawable->pClipRects[0].x1 = 0;
+	drawable->pClipRects[0].y1 = 0;
+	drawable->pClipRects[0].x2 = drawable->w;
+	drawable->pClipRects[0].y2 = drawable->h;
+	drawable->numBackClipRects = 1;
+	drawable->pBackClipRects[0].x1 = 0;
+	drawable->pBackClipRects[0].y1 = 0;
+	drawable->pBackClipRects[0].x2 = drawable->w;
+	drawable->pBackClipRects[0].y2 = drawable->h;
+	for (i = 0; i < count; i++) {
+		switch (buffers[i].attachment) {
+		case __DRI_BUFFER_FRONT_LEFT:
+			rb = draw->color_rb[0];
+			regname = "dri2 front buffer";
+			break;
+		case __DRI_BUFFER_FAKE_FRONT_LEFT:
+			rb = draw->color_rb[0];
+			regname = "dri2 fake front buffer";
+			break;
+		case __DRI_BUFFER_BACK_LEFT:
+			rb = draw->color_rb[1];
+			regname = "dri2 back buffer";
+			break;
+		case __DRI_BUFFER_DEPTH:
+			rb = radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH);
+			regname = "dri2 depth buffer";
+			break;
+		case __DRI_BUFFER_DEPTH_STENCIL:
+			rb = radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH);
+			regname = "dri2 depth / stencil buffer";
+			break;
+		case __DRI_BUFFER_STENCIL:
+			rb = radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL);
+			regname = "dri2 stencil buffer";
+			break;
+		case __DRI_BUFFER_ACCUM:
+		default:
+			fprintf(stderr,
+				"unhandled buffer attach event, attacment type %d\n",
+				buffers[i].attachment);
+			return;
+		}
+
+		if (rb == NULL)
+			continue;
+		/* Skip buffers whose name already matches the bo bound to rb. */
+		if (rb->bo) {
+			uint32_t name = radeon_gem_name_bo(rb->bo);
+			if (name == buffers[i].name)
+				continue;
+		}
+
+		if (RADEON_DEBUG & RADEON_DRI)
+			fprintf(stderr,
+				"attaching buffer %s, %d, at %d, cpp %d, pitch %d\n",
+				regname, buffers[i].name, buffers[i].attachment,
+				buffers[i].cpp, buffers[i].pitch);
+
+		if (buffers[i].attachment == __DRI_BUFFER_STENCIL && depth_bo) {
+			if (RADEON_DEBUG & RADEON_DRI)
+				fprintf(stderr, "(reusing depth buffer as stencil)\n");
+			bo = depth_bo;
+			radeon_bo_ref(bo);
+		} else {
+			uint32_t tiling_flags = 0, pitch = 0;
+
+			bo = radeon_bo_open(radeon->radeonScreen->bom,
+						buffers[i].name,
+						0,
+						0,
+						RADEON_GEM_DOMAIN_VRAM,
+						buffers[i].flags);
+			if (bo == NULL) {
+				/* Nothing to attach: skip rather than dereference NULL below. */
+				fprintf(stderr, "failed to attach %s %d\n",
+					regname, buffers[i].name);
+				continue;
+			}
+			/* Return value ignored; tiling_flags is pre-zeroed. */
+			radeon_bo_get_tiling(bo, &tiling_flags, &pitch);
+			if (tiling_flags & RADEON_TILING_MACRO)
+				bo->flags |= RADEON_BO_FLAGS_MACRO_TILE;
+			if (tiling_flags & RADEON_TILING_MICRO)
+				bo->flags |= RADEON_BO_FLAGS_MICRO_TILE;
+		}
+		/* Only now that a bo is available, publish the new layout on rb. */
+		rb->cpp = buffers[i].cpp;
+		rb->pitch = buffers[i].pitch;
+		rb->base.Width = drawable->w;
+		rb->base.Height = drawable->h;
+		rb->has_surface = 0;
+
+		if (buffers[i].attachment == __DRI_BUFFER_DEPTH) {
+			if (draw->base.Visual.depthBits == 16)
+				rb->cpp = 2;
+			depth_bo = bo;
+		}
+
+		radeon_renderbuffer_set_bo(rb, bo);
+		radeon_bo_unref(bo);
+
+		if (buffers[i].attachment == __DRI_BUFFER_DEPTH_STENCIL) {
+			rb = radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL);
+			if (rb != NULL) {
+				struct radeon_bo *stencil_bo = NULL;
+
+				if (rb->bo) {
+					uint32_t name = radeon_gem_name_bo(rb->bo);
+					if (name == buffers[i].name)
+						continue;
+				}
+
+				stencil_bo = bo;
+				radeon_bo_ref(stencil_bo);
+				radeon_renderbuffer_set_bo(rb, stencil_bo);
+				radeon_bo_unref(stencil_bo);
+			}
+		}
+	}
+
+	driUpdateFramebufferSize(radeon->glCtx, drawable);
+}
+
+/* Make driContextPriv current with driDrawPriv as the draw buffer and
+ * driReadPriv as the read buffer; a NULL context unbinds the current one.
+ */
+GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
+			    __DRIdrawablePrivate * driDrawPriv,
+			    __DRIdrawablePrivate * driReadPriv)
+{
+	radeonContextPtr radeon;
+	struct radeon_framebuffer *drfb;
+	struct gl_framebuffer *readfb;
+
+	if (!driContextPriv) {
+		if (RADEON_DEBUG & RADEON_DRI)
+			fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
+		_mesa_make_current(NULL, NULL, NULL);
+		return GL_TRUE;
+	}
+
+	radeon = (radeonContextPtr) driContextPriv->driverPrivate;
+	drfb = driDrawPriv->driverPrivate;
+	readfb = driReadPriv->driverPrivate;
+	/* DRI2: refresh buffers from the loader; otherwise use screen offsets. */
+	if (driContextPriv->driScreenPriv->dri2.enabled) {
+		radeon_update_renderbuffers(driContextPriv, driDrawPriv);
+		if (driDrawPriv != driReadPriv)
+			radeon_update_renderbuffers(driContextPriv, driReadPriv);
+		_mesa_reference_renderbuffer(&radeon->state.color.rb,
+			&(radeon_get_renderbuffer(&drfb->base, BUFFER_BACK_LEFT)->base));
+		_mesa_reference_renderbuffer(&radeon->state.depth.rb,
+			&(radeon_get_renderbuffer(&drfb->base, BUFFER_DEPTH)->base));
+	} else {
+		radeon_make_renderbuffer_current(radeon, drfb);
+	}
+
+	if (RADEON_DEBUG & RADEON_DRI)
+	     fprintf(stderr, "%s ctx %p dfb %p rfb %p\n", __FUNCTION__, radeon->glCtx, drfb, readfb);
+
+	driUpdateFramebufferSize(radeon->glCtx, driDrawPriv);
+	if (driReadPriv != driDrawPriv)
+		driUpdateFramebufferSize(radeon->glCtx, driReadPriv);
+
+	_mesa_make_current(radeon->glCtx, &drfb->base, readfb);
+
+	_mesa_update_state(radeon->glCtx);
+	/* Only refresh vblank/window state if drfb really became the draw buffer. */
+	if (radeon->glCtx->DrawBuffer == &drfb->base) {
+		if (driDrawPriv->swap_interval == (unsigned)-1) {
+			int i;
+			driDrawPriv->vblFlags =
+				(radeon->radeonScreen->irq != 0)
+				? driGetDefaultVBlankFlags(&radeon->
+							   optionCache)
+				: VBLANK_FLAG_NO_IRQ;
+
+			driDrawableInitVBlank(driDrawPriv);
+			drfb->vbl_waited = driDrawPriv->vblSeq;
+
+			for (i = 0; i < 2; i++) {
+				if (drfb->color_rb[i])
+					drfb->color_rb[i]->vbl_pending = driDrawPriv->vblSeq;
+			}
+
+		}
+
+		radeon_window_moved(radeon);
+		radeon_draw_buffer(radeon->glCtx, &drfb->base);
+	}
+
+
+	if (RADEON_DEBUG & RADEON_DRI)
+		fprintf(stderr, "End %s\n", __FUNCTION__);
+
+	return GL_TRUE;
+}
+
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.h b/src/mesa/drivers/dri/radeon/radeon_common_context.h
new file mode 100644
index 0000000000..0309345393
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.h
@@ -0,0 +1,594 @@
+
+#ifndef COMMON_CONTEXT_H
+#define COMMON_CONTEXT_H
+
+#include "main/mm.h"
+#include "math/m_vector.h"
+#include "texmem.h"
+#include "tnl/t_context.h"
+#include "main/colormac.h"
+
+#include "radeon_debug.h"
+#include "radeon_screen.h"
+#include "radeon_drm.h"
+#include "dri_util.h"
+#include "tnl/t_vertex.h"
+
+#include "dri_metaops.h"
+struct radeon_context;
+
+#include "radeon_bocs_wrapper.h"
+
+/* This union is used to avoid warnings/miscompilation
+   with float to uint32_t casts due to strict-aliasing */
+typedef union { GLfloat f; uint32_t ui32; } float_ui32_type;
+
+struct radeon_context;
+typedef struct radeon_context radeonContextRec;
+typedef struct radeon_context *radeonContextPtr;
+
+
+#define TEX_0   0x1
+#define TEX_1   0x2
+#define TEX_2   0x4
+#define TEX_3	0x8
+#define TEX_4	0x10
+#define TEX_5	0x20
+
+/* Rasterizing fallbacks */
+/* See corresponding strings in r200_swtcl.c */
+#define RADEON_FALLBACK_TEXTURE		0x0001
+#define RADEON_FALLBACK_DRAW_BUFFER	0x0002
+#define RADEON_FALLBACK_STENCIL		0x0004
+#define RADEON_FALLBACK_RENDER_MODE	0x0008
+#define RADEON_FALLBACK_BLEND_EQ	0x0010
+#define RADEON_FALLBACK_BLEND_FUNC	0x0020
+#define RADEON_FALLBACK_DISABLE 	0x0040
+#define RADEON_FALLBACK_BORDER_MODE	0x0080
+#define RADEON_FALLBACK_DEPTH_BUFFER	0x0100
+#define RADEON_FALLBACK_STENCIL_BUFFER  0x0200
+
+#define R200_FALLBACK_TEXTURE           0x01
+#define R200_FALLBACK_DRAW_BUFFER       0x02
+#define R200_FALLBACK_STENCIL           0x04
+#define R200_FALLBACK_RENDER_MODE       0x08
+#define R200_FALLBACK_DISABLE           0x10
+#define R200_FALLBACK_BORDER_MODE       0x20
+
+#define RADEON_TCL_FALLBACK_RASTER            0x1 /* rasterization */
+#define RADEON_TCL_FALLBACK_UNFILLED          0x2 /* unfilled tris */
+#define RADEON_TCL_FALLBACK_LIGHT_TWOSIDE     0x4 /* twoside tris */
+#define RADEON_TCL_FALLBACK_MATERIAL          0x8 /* material in vb */
+#define RADEON_TCL_FALLBACK_TEXGEN_0          0x10 /* texgen, unit 0 */
+#define RADEON_TCL_FALLBACK_TEXGEN_1          0x20 /* texgen, unit 1 */
+#define RADEON_TCL_FALLBACK_TEXGEN_2          0x40 /* texgen, unit 2 */
+#define RADEON_TCL_FALLBACK_TCL_DISABLE       0x80 /* user disable */
+#define RADEON_TCL_FALLBACK_FOGCOORDSPEC      0x100 /* fogcoord, sep. spec light */
+
+/* The blit width for texture uploads
+ */
+#define BLIT_WIDTH_BYTES 1024
+
+/* Use the templated vertex format:
+ */
+#define COLOR_IS_RGBA
+#define TAG(x) radeon##x
+#include "tnl_dd/t_dd_vertex.h"
+#undef TAG
+
+#define RADEON_RB_CLASS 0xdeadbeef
+
+struct radeon_renderbuffer
+{
+	struct gl_renderbuffer base;
+	struct radeon_bo *bo;
+	unsigned int cpp;
+	/* unsigned int offset; */
+	unsigned int pitch;
+
+	uint32_t draw_offset; /* FBO */
+	/* boo Xorg 6.8.2 compat */
+	int has_surface;
+
+	GLuint pf_pending;  /**< sequence number of pending flip */
+	GLuint vbl_pending;   /**< vblank sequence number of pending flip */
+	__DRIdrawablePrivate *dPriv;
+};
+
+struct radeon_framebuffer
+{
+	struct gl_framebuffer base;
+
+	struct radeon_renderbuffer *color_rb[2];
+
+	GLuint vbl_waited;
+
+	/* buffer swap */
+	int64_t swap_ust;
+	int64_t swap_missed_ust;
+
+	GLuint swap_count;
+	GLuint swap_missed_count;
+
+	/* Drawable page flipping state */
+	GLboolean pf_active;
+	GLint pf_current_page;
+	GLint pf_num_pages;
+
+};
+
+
+struct radeon_colorbuffer_state {
+	GLuint clear;
+	int roundEnable;
+	struct gl_renderbuffer *rb;
+	uint32_t draw_offset; /* offset into color renderbuffer - FBOs */
+};
+
+struct radeon_depthbuffer_state {
+	GLuint clear;
+	struct gl_renderbuffer *rb;
+};
+
+struct radeon_scissor_state {
+	drm_clip_rect_t rect;
+	GLboolean enabled;
+
+	GLuint numClipRects;	/* Cliprects active */
+	GLuint numAllocedClipRects;	/* Cliprects available */
+	drm_clip_rect_t *pClipRects;
+};
+
+struct radeon_stencilbuffer_state {
+	GLuint clear;		/* rb3d_stencilrefmask value */
+};
+
+struct radeon_state_atom {
+	struct radeon_state_atom *next, *prev;
+	const char *name;	/* for debug */
+	int cmd_size;		/* size in bytes */
+        GLuint idx;
+	GLuint is_tcl;
+        GLuint *cmd;		/* one or more cmd's */
+	GLuint *lastcmd;		/* one or more cmd's */
+	GLboolean dirty;	/* dirty-mark in emit_state_list */
+        int (*check) (GLcontext *, struct radeon_state_atom *atom); /* is this state active? */
+        void (*emit) (GLcontext *, struct radeon_state_atom *atom);
+};
+
+struct radeon_hw_state {
+  	/* Head of the linked list of state atoms. */
+	struct radeon_state_atom atomlist;
+	int max_state_size;	/* Number of bytes necessary for a full state emit. */
+	int max_post_flush_size; /* Number of bytes necessary for post flushing emits */
+	GLboolean is_dirty, all_dirty;
+};
+
+
+/* Texture related */
+typedef struct _radeon_texture_image radeon_texture_image;
+
+struct _radeon_texture_image {
+	struct gl_texture_image base;
+
+	/**
+	 * If mt != 0, the image is stored in hardware format in the
+	 * given mipmap tree. In this case, base.Data may point into the
+	 * mapping of the buffer object that contains the mipmap tree.
+	 *
+	 * If mt == 0, the image is stored in normal memory pointed to
+	 * by base.Data.
+	 */
+	struct _radeon_mipmap_tree *mt;
+	struct radeon_bo *bo;
+
+	int mtlevel; /** if mt != 0, this is the image's level in the mipmap tree */
+	int mtface; /** if mt != 0, this is the image's face in the mipmap tree */
+};
+
+
+static INLINE radeon_texture_image *get_radeon_texture_image(struct gl_texture_image *image)
+{
+	return (radeon_texture_image*)image;
+}
+
+
+typedef struct radeon_tex_obj radeonTexObj, *radeonTexObjPtr;
+
+#define RADEON_TXO_MICRO_TILE               (1 << 3)
+
+/* Texture object in locally shared texture space.
+ */
+struct radeon_tex_obj {
+	struct gl_texture_object base;
+	struct _radeon_mipmap_tree *mt;
+
+	/**
+	 * This is true if we've verified that the mipmap tree above is complete
+	 * and so on.
+	 */
+	GLboolean validated;
+
+	GLuint override_offset;
+	GLboolean image_override; /* Image overridden by GLX_EXT_tfp */
+	GLuint tile_bits;	/* hw texture tile bits used on this texture */
+        struct radeon_bo *bo;
+
+	GLuint pp_txfilter;	/* hardware register values */
+	GLuint pp_txformat;
+	GLuint pp_txformat_x;
+	GLuint pp_txsize;	/* npot only */
+	GLuint pp_txpitch;	/* npot only */
+	GLuint pp_border_color;
+	GLuint pp_cubic_faces;	/* cube face 1,2,3,4 log2 sizes */
+
+        GLuint pp_txfilter_1;	/*  r300 */
+
+	/* r700 texture states */
+	GLuint SQ_TEX_RESOURCE0;
+	GLuint SQ_TEX_RESOURCE1;
+	GLuint SQ_TEX_RESOURCE2;
+	GLuint SQ_TEX_RESOURCE3;
+	GLuint SQ_TEX_RESOURCE4;
+	GLuint SQ_TEX_RESOURCE5;
+	GLuint SQ_TEX_RESOURCE6;
+
+	GLuint SQ_TEX_SAMPLER0;
+	GLuint SQ_TEX_SAMPLER1;
+	GLuint SQ_TEX_SAMPLER2;
+
+	GLuint TD_PS_SAMPLER0_BORDER_RED;
+	GLuint TD_PS_SAMPLER0_BORDER_GREEN;
+	GLuint TD_PS_SAMPLER0_BORDER_BLUE;
+	GLuint TD_PS_SAMPLER0_BORDER_ALPHA;
+
+	GLboolean border_fallback;
+
+
+};
+
+static INLINE radeonTexObj* radeon_tex_obj(struct gl_texture_object *texObj)
+{
+	return (radeonTexObj*)texObj;
+}
+
+/* occlusion query */
+struct radeon_query_object {
+	struct gl_query_object Base;
+	struct radeon_bo *bo;
+	int curr_offset;
+	GLboolean emitted_begin;
+
+	/* Double linked list of not flushed query objects */
+	struct radeon_query_object *prev, *next;
+};
+
+/* Need refcounting on dma buffers:
+ */
+struct radeon_dma_buffer {
+	int refcount;		/* the number of retained regions in buf */
+	drmBufPtr buf;
+};
+
+struct radeon_aos {
+	struct radeon_bo *bo; /** Buffer object where vertex data is stored */
+	int offset; /** Offset into buffer object, in bytes */
+	int components; /** Number of components per vertex */
+	int stride; /** Stride in dwords (may be 0 for repeating) */
+	int count; /** Number of vertices */
+};
+
+#define DMA_BO_FREE_TIME 100
+
+struct radeon_dma_bo {
+  struct radeon_dma_bo *next, *prev;
+  struct radeon_bo *bo;
+  int expire_counter;
+};
+
+struct radeon_dma {
+        /* Active dma region.  Allocations for vertices and retained
+         * regions come from here.  Also used for emitting random vertices,
+         * these may be flushed by calling flush_current();
+         */
+	struct radeon_dma_bo free;
+	struct radeon_dma_bo wait;
+	struct radeon_dma_bo reserved;
+        size_t current_used; /** Number of bytes allocated and forgotten about */
+        size_t current_vertexptr; /** End of active vertex region */
+        size_t minimum_size;
+
+        /**
+         * If current_vertexptr != current_used then flush must be non-zero.
+         * flush must be called before non-active vertex allocations can be
+         * performed.
+         */
+        void (*flush) (GLcontext *);
+};
+
+/* radeon_swtcl.c
+ */
+struct radeon_swtcl_info {
+
+	GLuint RenderIndex;
+	GLuint vertex_size;
+	GLubyte *verts;
+
+	/* Fallback rasterization functions
+	 */
+	GLuint hw_primitive;
+	GLenum render_primitive;
+	GLuint numverts;
+
+	struct tnl_attr_map vertex_attrs[VERT_ATTRIB_MAX];
+	GLuint vertex_attr_count;
+
+	GLuint emit_prediction;
+};
+
+#define RADEON_MAX_AOS_ARRAYS		16
+struct radeon_tcl_info {
+	struct radeon_aos aos[RADEON_MAX_AOS_ARRAYS];
+	GLuint aos_count;
+	struct radeon_bo *elt_dma_bo; /** Buffer object that contains element indices */
+	int elt_dma_offset; /** Offset into this buffer object, in bytes */
+};
+
+struct radeon_ioctl {
+	GLuint vertex_offset;
+	GLuint vertex_max;
+	struct radeon_bo *bo;
+	GLuint vertex_size;
+};
+
+#define RADEON_MAX_PRIMS 64
+
+struct radeon_prim {
+	GLuint start;
+	GLuint end;
+	GLuint prim;
+};
+
+static INLINE GLuint radeonPackColor(GLuint cpp,
+                                     GLubyte r, GLubyte g,
+                                     GLubyte b, GLubyte a)
+{
+	/* cpp 2 -> 565 packing (a unused), cpp 4 -> 8888 packing, otherwise 0 */
+	if (cpp == 2)
+		return PACK_COLOR_565(r, g, b);
+
+	if (cpp == 4)
+		return PACK_COLOR_8888(a, r, g, b);
+
+	return 0;
+}
+
+#define MAX_CMD_BUF_SZ (16*1024)
+
+#define MAX_DMA_BUF_SZ (64*1024)
+
+struct radeon_store {
+	GLuint statenr;
+	GLuint primnr;
+	char cmd_buf[MAX_CMD_BUF_SZ];
+	int cmd_used;
+	int elts_start;
+};
+
+struct radeon_dri_mirror {
+	__DRIcontextPrivate *context;	/* DRI context */
+	__DRIscreenPrivate *screen;	/* DRI screen */
+
+	drm_context_t hwContext;
+	drm_hw_lock_t *hwLock;
+	int hwLockCount;
+	int fd;
+	int drmMinor;
+};
+
+typedef void (*radeon_tri_func) (radeonContextPtr,
+				 radeonVertex *,
+				 radeonVertex *, radeonVertex *);
+
+typedef void (*radeon_line_func) (radeonContextPtr,
+				  radeonVertex *, radeonVertex *);
+
+typedef void (*radeon_point_func) (radeonContextPtr, radeonVertex *);
+
+#define RADEON_MAX_BOS 32
+struct radeon_state {
+	struct radeon_colorbuffer_state color;
+	struct radeon_depthbuffer_state depth;
+	struct radeon_scissor_state scissor;
+	struct radeon_stencilbuffer_state stencil;
+
+	struct radeon_cs_space_check bos[RADEON_MAX_BOS];
+	int validated_bo_count;
+};
+
+/**
+ * This structure holds the command buffer while it is being constructed.
+ *
+ * The first batch of commands in the buffer is always the state that needs
+ * to be re-emitted when the context is lost. This batch can be skipped
+ * otherwise.
+ */
+struct radeon_cmdbuf {
+	struct radeon_cs_manager    *csm;
+	struct radeon_cs            *cs;
+	int size; /** # of dwords total */
+	unsigned int flushing:1; /** whether we're currently in FlushCmdBufLocked */
+};
+
+struct radeon_context {
+   GLcontext *glCtx;
+   radeonScreenPtr radeonScreen;	/* Screen private DRI data */
+
+   /* Texture object bookkeeping
+    */
+   int                   texture_depth;
+   float                 initialMaxAnisotropy;
+   uint32_t              texture_row_align;
+   uint32_t              texture_rect_row_align;
+   uint32_t              texture_compressed_row_align;
+
+  struct radeon_dma dma;
+  struct radeon_hw_state hw;
+   /* Rasterization and vertex state:
+    */
+   GLuint TclFallback;
+   GLuint Fallback;
+   GLuint NewGLState;
+   DECLARE_RENDERINPUTS(tnl_index_bitset);	/* index of bits for last tnl_install_attrs */
+
+   /* Drawable, cliprect and scissor information */
+   GLuint numClipRects;	/* Cliprects for the draw buffer */
+   drm_clip_rect_t *pClipRects;
+   unsigned int lastStamp;
+   drm_radeon_sarea_t *sarea;	/* Private SAREA data */
+
+   /* Mirrors of some DRI state */
+   struct radeon_dri_mirror dri;
+
+   /* Busy waiting */
+   GLuint do_usleeps;
+   GLuint do_irqs;
+   GLuint irqsEmitted;
+   drm_radeon_irq_wait_t iw;
+
+   /* Derived state - for r300 only */
+   struct radeon_state state;
+
+   struct radeon_swtcl_info swtcl;
+   struct radeon_tcl_info tcl;
+   /* Configuration cache
+    */
+   driOptionCache optionCache;
+
+   struct radeon_cmdbuf cmdbuf;
+
+   struct radeon_debug debug;
+
+  drm_clip_rect_t fboRect;
+  GLboolean constant_cliprect; /* use for FBO or DRI2 rendering */
+  GLboolean front_cliprects;
+
+   /**
+    * Set if rendering has occurred to the drawable's front buffer.
+    *
+    * This is used in the DRI2 case to detect that glFlush should also copy
+    * the contents of the fake front buffer to the real front buffer.
+    */
+   GLboolean front_buffer_dirty;
+
+   /**
+    * Track whether front-buffer rendering is currently enabled
+    *
+    * A separate flag is used to track this in order to support MRT more
+    * easily.
+    */
+   GLboolean is_front_buffer_rendering;
+
+   /**
+    * Track whether front-buffer is the current read target.
+    *
+    * This is closely associated with is_front_buffer_rendering, but may
+    * be set separately.  The DRI2 fake front buffer must be referenced
+    * either way.
+    */
+   GLboolean is_front_buffer_reading;
+
+   struct dri_metaops meta;
+
+   struct {
+	struct radeon_query_object *current;
+	struct radeon_query_object not_flushed_head;
+	struct radeon_state_atom queryobj;
+   } query;
+
+   struct {
+	   void (*get_lock)(radeonContextPtr radeon);
+	   void (*update_viewport_offset)(GLcontext *ctx);
+	   void (*emit_cs_header)(struct radeon_cs *cs, radeonContextPtr rmesa);
+	   void (*swtcl_flush)(GLcontext *ctx, uint32_t offset);
+	   void (*pre_emit_atoms)(radeonContextPtr rmesa);
+	   void (*pre_emit_state)(radeonContextPtr rmesa);
+	   void (*fallback)(GLcontext *ctx, GLuint bit, GLboolean mode);
+	   void (*free_context)(GLcontext *ctx);
+	   void (*emit_query_finish)(radeonContextPtr radeon);
+	   void (*update_scissor)(GLcontext *ctx);
+   } vtbl;
+};
+
+#define RADEON_CONTEXT(glctx) ((radeonContextPtr)((glctx)->DriverCtx)) /* glctx: any GLcontext pointer expression */
+
+static inline __DRIdrawablePrivate* radeon_get_drawable(radeonContextPtr radeon)
+{
+	return radeon->dri.context->driDrawablePriv;
+}
+
+static inline __DRIdrawablePrivate* radeon_get_readable(radeonContextPtr radeon)
+{
+	return radeon->dri.context->driReadablePriv;
+}
+
+/**
+ * Reinterpret the bits of a float as a uint32_t (union-based type pun,
+ * no numeric conversion takes place).
+ */
+static INLINE uint32_t radeonPackFloat32(float fl)
+{
+	union {
+		uint32_t bits;
+		float value;
+	} pun;
+	pun.value = fl;
+	return pun.bits;
+}
+
+/* Pack a float into a 24-bit value: sign in bit 23, biased exponent shifted
+ * to bit 16 and up, top 16 mantissa bits in bits 15..0; 0.0 packs to 0.
+ *
+ * Known caveat (from the original author): probably wrong for some values,
+ * and the exponent is neither range-checked nor masked before the shift.
+ */
+static INLINE uint32_t radeonPackFloat24(float f)
+{
+	float mantissa;
+	int exponent;
+	uint32_t float24 = 0;
+
+	if (f == 0.0)
+		return 0;
+
+	mantissa = frexpf(f, &exponent);
+
+	/* Handle -ve */
+	if (mantissa < 0) {
+		float24 |= (1 << 23);
+		mantissa = mantissa * -1.0;
+	}
+	/* frexpf() mantissa is in [0.5,1), so +62 is a bias of 63 on a 1.x form */
+	exponent += 62;
+	float24 |= (exponent << 16);
+	/* Kill 7 LSB of mantissa */
+	float24 |= (radeonPackFloat32(mantissa) & 0x7FFFFF) >> 7;
+
+	return float24;
+}
+
+GLboolean radeonInitContext(radeonContextPtr radeon,
+			    struct dd_function_table* functions,
+			    const __GLcontextModes * glVisual,
+			    __DRIcontextPrivate * driContextPriv,
+			    void *sharedContextPrivate);
+
+void radeonCleanupContext(radeonContextPtr radeon);
+GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv);
+void radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable);
+GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
+			    __DRIdrawablePrivate * driDrawPriv,
+			    __DRIdrawablePrivate * driReadPriv);
+extern void radeonDestroyContext(__DRIcontextPrivate * driContextPriv);
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_compat.c b/src/mesa/drivers/dri/radeon/radeon_compat.c
deleted file mode 100644
index 46b490d61f..0000000000
--- a/src/mesa/drivers/dri/radeon/radeon_compat.c
+++ /dev/null
@@ -1,301 +0,0 @@
-/**************************************************************************
-
-Copyright 2002 ATI Technologies Inc., Ontario, Canada, and
-               Tungsten Graphics Inc., Austin, Texas.
-
-All Rights Reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a
-copy of this software and associated documentation files (the "Software"),
-to deal in the Software without restriction, including without limitation
-on the rights to use, copy, modify, merge, publish, distribute, sub
-license, and/or sell copies of the Software, and to permit persons to whom
-the Software is furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice (including the next
-paragraph) shall be included in all copies or substantial portions of the
-Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
-ATI, TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
-DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
-OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
-USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell <keith@tungstengraphics.com>
- *
- */
-
-#include "main/glheader.h"
-#include "main/imports.h"
-
-#include "radeon_context.h"
-#include "radeon_state.h"
-#include "radeon_ioctl.h"
-
-
-static struct { 
-	int start; 
-	int len; 
-	const char *name;
-} packet[RADEON_MAX_STATE_PACKETS] = {
-	{ RADEON_PP_MISC,7,"RADEON_PP_MISC" },
-	{ RADEON_PP_CNTL,3,"RADEON_PP_CNTL" },
-	{ RADEON_RB3D_COLORPITCH,1,"RADEON_RB3D_COLORPITCH" },
-	{ RADEON_RE_LINE_PATTERN,2,"RADEON_RE_LINE_PATTERN" },
-	{ RADEON_SE_LINE_WIDTH,1,"RADEON_SE_LINE_WIDTH" },
-	{ RADEON_PP_LUM_MATRIX,1,"RADEON_PP_LUM_MATRIX" },
-	{ RADEON_PP_ROT_MATRIX_0,2,"RADEON_PP_ROT_MATRIX_0" },
-	{ RADEON_RB3D_STENCILREFMASK,3,"RADEON_RB3D_STENCILREFMASK" },
-	{ RADEON_SE_VPORT_XSCALE,6,"RADEON_SE_VPORT_XSCALE" },
-	{ RADEON_SE_CNTL,2,"RADEON_SE_CNTL" },
-	{ RADEON_SE_CNTL_STATUS,1,"RADEON_SE_CNTL_STATUS" },
-	{ RADEON_RE_MISC,1,"RADEON_RE_MISC" },
-	{ RADEON_PP_TXFILTER_0,6,"RADEON_PP_TXFILTER_0" },
-	{ RADEON_PP_BORDER_COLOR_0,1,"RADEON_PP_BORDER_COLOR_0" },
-	{ RADEON_PP_TXFILTER_1,6,"RADEON_PP_TXFILTER_1" },
-	{ RADEON_PP_BORDER_COLOR_1,1,"RADEON_PP_BORDER_COLOR_1" },
-	{ RADEON_PP_TXFILTER_2,6,"RADEON_PP_TXFILTER_2" },
-	{ RADEON_PP_BORDER_COLOR_2,1,"RADEON_PP_BORDER_COLOR_2" },
-	{ RADEON_SE_ZBIAS_FACTOR,2,"RADEON_SE_ZBIAS_FACTOR" },
-	{ RADEON_SE_TCL_OUTPUT_VTX_FMT,11,"RADEON_SE_TCL_OUTPUT_VTX_FMT" },
-	{ RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED,17,"RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED" },
-};
-
-
-static void radeonCompatEmitPacket( radeonContextPtr rmesa, 
-				    struct radeon_state_atom *state )
-{
-   drm_radeon_sarea_t *sarea = rmesa->sarea;
-   drm_radeon_context_regs_t *ctx = &sarea->context_state;
-   drm_radeon_texture_regs_t *tex0 = &sarea->tex_state[0];
-   drm_radeon_texture_regs_t *tex1 = &sarea->tex_state[1];
-   int i;
-   int *buf = state->cmd;
-
-   for ( i = 0 ; i < state->cmd_size ; ) {
-      drm_radeon_cmd_header_t *header = (drm_radeon_cmd_header_t *)&buf[i++];
-
-      if (RADEON_DEBUG & DEBUG_STATE)
-	 fprintf(stderr, "%s %d: %s\n", __FUNCTION__, header->packet.packet_id,
-		 packet[(int)header->packet.packet_id].name);
-
-      switch (header->packet.packet_id) {
-      case RADEON_EMIT_PP_MISC:
-	 ctx->pp_misc = buf[i++]; 
-	 ctx->pp_fog_color = buf[i++];
-	 ctx->re_solid_color = buf[i++];
-	 ctx->rb3d_blendcntl = buf[i++];
-	 ctx->rb3d_depthoffset = buf[i++];
-	 ctx->rb3d_depthpitch = buf[i++];
-	 ctx->rb3d_zstencilcntl = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_CONTEXT;
-	 break;
-      case RADEON_EMIT_PP_CNTL:
-	 ctx->pp_cntl = buf[i++];
-	 ctx->rb3d_cntl = buf[i++];
-	 ctx->rb3d_coloroffset = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_CONTEXT;
-	 break;
-      case RADEON_EMIT_RB3D_COLORPITCH:
-	 ctx->rb3d_colorpitch = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_CONTEXT;
-	 break;
-      case RADEON_EMIT_RE_LINE_PATTERN:
-	 ctx->re_line_pattern = buf[i++];
-	 ctx->re_line_state = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_LINE;
-	 break;
-      case RADEON_EMIT_SE_LINE_WIDTH:
-	 ctx->se_line_width = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_LINE;
-	 break;
-      case RADEON_EMIT_PP_LUM_MATRIX:
-	 ctx->pp_lum_matrix = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_BUMPMAP;
-	 break;
-      case RADEON_EMIT_PP_ROT_MATRIX_0:
-	 ctx->pp_rot_matrix_0 = buf[i++];
-	 ctx->pp_rot_matrix_1 = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_BUMPMAP;
-	 break;
-      case RADEON_EMIT_RB3D_STENCILREFMASK:
-	 ctx->rb3d_stencilrefmask = buf[i++];
-	 ctx->rb3d_ropcntl = buf[i++];
-	 ctx->rb3d_planemask = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_MASKS;
-	 break;
-      case RADEON_EMIT_SE_VPORT_XSCALE:
-	 ctx->se_vport_xscale = buf[i++];
-	 ctx->se_vport_xoffset = buf[i++];
-	 ctx->se_vport_yscale = buf[i++];
-	 ctx->se_vport_yoffset = buf[i++];
-	 ctx->se_vport_zscale = buf[i++];
-	 ctx->se_vport_zoffset = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_VIEWPORT;
-	 break;
-      case RADEON_EMIT_SE_CNTL:
-	 ctx->se_cntl = buf[i++];
-	 ctx->se_coord_fmt = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_CONTEXT | RADEON_UPLOAD_VERTFMT;
-	 break;
-      case RADEON_EMIT_SE_CNTL_STATUS:
-	 ctx->se_cntl_status = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_SETUP;
-	 break;
-      case RADEON_EMIT_RE_MISC:
-	 ctx->re_misc = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_MISC;
-	 break;
-      case RADEON_EMIT_PP_TXFILTER_0:
-	 tex0->pp_txfilter = buf[i++];
-	 tex0->pp_txformat = buf[i++];
-	 tex0->pp_txoffset = buf[i++];
-	 tex0->pp_txcblend = buf[i++];
-	 tex0->pp_txablend = buf[i++];
-	 tex0->pp_tfactor = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_TEX0;
-	 break;
-      case RADEON_EMIT_PP_BORDER_COLOR_0:
-	 tex0->pp_border_color = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_TEX0;
-	 break;
-      case RADEON_EMIT_PP_TXFILTER_1:
-	 tex1->pp_txfilter = buf[i++];
-	 tex1->pp_txformat = buf[i++];
-	 tex1->pp_txoffset = buf[i++];
-	 tex1->pp_txcblend = buf[i++];
-	 tex1->pp_txablend = buf[i++];
-	 tex1->pp_tfactor = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_TEX1;
-	 break;
-      case RADEON_EMIT_PP_BORDER_COLOR_1:
-	 tex1->pp_border_color = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_TEX1;
-	 break;
-
-      case RADEON_EMIT_SE_ZBIAS_FACTOR:
-	 i++;
-	 i++;
-	 break;
-
-      case RADEON_EMIT_PP_TXFILTER_2:
-      case RADEON_EMIT_PP_BORDER_COLOR_2:
-      case RADEON_EMIT_SE_TCL_OUTPUT_VTX_FMT:
-      case RADEON_EMIT_SE_TCL_MATERIAL_EMMISSIVE_RED:
-      default:
-	 /* These states aren't understood by radeon drm 1.1 */
-	 fprintf(stderr, "Tried to emit unsupported state\n");
-	 return;
-      }
-   }
-}
-
-
-
-static void radeonCompatEmitStateLocked( radeonContextPtr rmesa )
-{
-   struct radeon_state_atom *atom;
-
-   if (RADEON_DEBUG & (DEBUG_STATE|DEBUG_PRIMS))
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   if (!rmesa->hw.is_dirty && !rmesa->hw.all_dirty)
-      return;
-
-   foreach(atom, &rmesa->hw.atomlist) {
-      if (rmesa->hw.all_dirty)
-	 atom->dirty = GL_TRUE;
-      if (atom->is_tcl)
-	 atom->dirty = GL_FALSE;
-      if (atom->dirty)
-	 radeonCompatEmitPacket(rmesa, atom);
-   }
- 
-   rmesa->hw.is_dirty = GL_FALSE;
-   rmesa->hw.all_dirty = GL_FALSE;
-}
-
-
-static void radeonCompatEmitPrimitiveLocked( radeonContextPtr rmesa,
-					     GLuint hw_primitive,
-					     GLuint nverts,
-					     drm_clip_rect_t *pbox,
-					     GLuint nbox )
-{
-   int i;
-
-   for ( i = 0 ; i < nbox ; ) {
-      int nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS, nbox );
-      drm_clip_rect_t *b = rmesa->sarea->boxes;
-      drm_radeon_vertex_t vtx;
-      
-      rmesa->sarea->dirty |= RADEON_UPLOAD_CLIPRECTS;
-      rmesa->sarea->nbox = nr - i;
-
-      for ( ; i < nr ; i++) 
-	 *b++ = pbox[i];
-      
-      if (RADEON_DEBUG & DEBUG_IOCTL)
-	 fprintf(stderr, 
-		 "RadeonFlushVertexBuffer: prim %x buf %d verts %d "
-		 "disc %d nbox %d\n",
-		 hw_primitive, 
-		 rmesa->dma.current.buf->buf->idx, 
-		 nverts, 
-		 nr == nbox,
-		 rmesa->sarea->nbox );
-
-      vtx.prim = hw_primitive;
-      vtx.idx = rmesa->dma.current.buf->buf->idx;
-      vtx.count = nverts;
-      vtx.discard = (nr == nbox);      
-
-      drmCommandWrite( rmesa->dri.fd, 
-		       DRM_RADEON_VERTEX,
-		       &vtx, sizeof(vtx));
-   }
-}
-
-
-
-/* No 'start' for 1.1 vertices ioctl: only one vertex prim/buffer!  
- */
-void radeonCompatEmitPrimitive( radeonContextPtr rmesa,
-				GLuint vertex_format,
-				GLuint hw_primitive,
-				GLuint nrverts )
-{
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   LOCK_HARDWARE( rmesa );
-
-   radeonCompatEmitStateLocked( rmesa );
-   rmesa->sarea->vc_format = vertex_format;
-   
-   if (rmesa->state.scissor.enabled) {
-      radeonCompatEmitPrimitiveLocked( rmesa, 
-				       hw_primitive,
-				       nrverts,
-				       rmesa->state.scissor.pClipRects,
-				       rmesa->state.scissor.numClipRects );
-   }
-   else {
-      radeonCompatEmitPrimitiveLocked( rmesa, 
-				       hw_primitive,
-				       nrverts,
-				       rmesa->pClipRects,
-				       rmesa->numClipRects );
-   }
-
-
-   UNLOCK_HARDWARE( rmesa );
-}
-
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.c b/src/mesa/drivers/dri/radeon/radeon_context.c
index 1e992c0b3d..8f4485aee7 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_context.c
@@ -53,6 +53,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "drivers/common/driverfuncs.h"
 
+#include "radeon_common.h"
 #include "radeon_context.h"
 #include "radeon_ioctl.h"
 #include "radeon_state.h"
@@ -61,13 +62,13 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_swtcl.h"
 #include "radeon_tcl.h"
 #include "radeon_maos.h"
+#include "radeon_queryobj.h"
 
-#define need_GL_ARB_multisample
-#define need_GL_ARB_texture_compression
-#define need_GL_ARB_vertex_buffer_object
+#define need_GL_ARB_occlusion_query
 #define need_GL_EXT_blend_minmax
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_secondary_color
+#define need_GL_EXT_framebuffer_object
 #include "extension_helper.h"
 
 #define DRIVER_DATE	"20061018"
@@ -75,58 +76,23 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "vblank.h"
 #include "utils.h"
 #include "xmlpool.h" /* for symbolic values of enum-type options */
-#ifndef RADEON_DEBUG
-int RADEON_DEBUG = (0);
-#endif
-
-
-/* Return various strings for glGetString().
- */
-static const GLubyte *radeonGetString( GLcontext *ctx, GLenum name )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   static char buffer[128];
-   unsigned   offset;
-   GLuint agp_mode = (rmesa->radeonScreen->card_type==RADEON_CARD_PCI) ? 0 :
-      rmesa->radeonScreen->AGPMode;
-
-   switch ( name ) {
-   case GL_VENDOR:
-      return (GLubyte *)"Tungsten Graphics, Inc.";
-
-   case GL_RENDERER:
-      offset = driGetRendererString( buffer, "Radeon", DRIVER_DATE,
-				     agp_mode );
-
-      sprintf( & buffer[ offset ], " %sTCL",
-	       !(rmesa->TclFallback & RADEON_TCL_FALLBACK_TCL_DISABLE)
-	       ? "" : "NO-" );
-
-      return (GLubyte *)buffer;
-
-   default:
-      return NULL;
-   }
-}
-
 
 /* Extension strings exported by the R100 driver.
  */
 const struct dri_extension card_extensions[] =
 {
-    { "GL_ARB_multisample",                GL_ARB_multisample_functions },
     { "GL_ARB_multitexture",               NULL },
+    { "GL_ARB_occlusion_query",		   GL_ARB_occlusion_query_functions},
     { "GL_ARB_texture_border_clamp",       NULL },
-    { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
     { "GL_ARB_texture_env_add",            NULL },
     { "GL_ARB_texture_env_combine",        NULL },
     { "GL_ARB_texture_env_crossbar",       NULL },
     { "GL_ARB_texture_env_dot3",           NULL },
     { "GL_ARB_texture_mirrored_repeat",    NULL },
-    { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
     { "GL_EXT_blend_logic_op",             NULL },
     { "GL_EXT_blend_subtract",             GL_EXT_blend_minmax_functions },
     { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
+    { "GL_EXT_packed_depth_stencil",	   NULL},
     { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
     { "GL_EXT_stencil_wrap",               NULL },
     { "GL_EXT_texture_edge_clamp",         NULL },
@@ -143,6 +109,11 @@ const struct dri_extension card_extensions[] =
     { NULL,                                NULL }
 };
 
+const struct dri_extension mm_extensions[] = {
+  { "GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions },
+  { NULL, NULL }
+};
+
 extern const struct tnl_pipeline_stage _radeon_render_stage;
 extern const struct tnl_pipeline_stage _radeon_tcl_stage;
 
@@ -166,47 +137,85 @@ static const struct tnl_pipeline_stage *radeon_pipeline[] = {
    NULL,
 };
 
+static void r100_get_lock(radeonContextPtr radeon)
+{
+   r100ContextPtr rmesa = (r100ContextPtr)radeon;
+   drm_radeon_sarea_t *sarea = radeon->sarea;
 
+   RADEON_STATECHANGE(rmesa, ctx);
+   if (rmesa->radeon.sarea->tiling_enabled) {
+      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |=
+	 RADEON_COLOR_TILE_ENABLE;
+   } else {
+      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] &=
+	 ~RADEON_COLOR_TILE_ENABLE;
+   }
+   
+   if (sarea->ctx_owner != rmesa->radeon.dri.hwContext) {
+      sarea->ctx_owner = rmesa->radeon.dri.hwContext;
+      
+      if (!radeon->radeonScreen->kernel_mm)
+         radeon_bo_legacy_texture_age(radeon->radeonScreen->bom);
+   }
+}
 
-/* Initialize the driver's misc functions.
- */
-static void radeonInitDriverFuncs( struct dd_function_table *functions )
+static void r100_vtbl_emit_cs_header(struct radeon_cs *cs, radeonContextPtr rmesa)
 {
-    functions->GetString	= radeonGetString;
 }
 
-static const struct dri_debug_control debug_control[] =
+static void r100_vtbl_pre_emit_state(radeonContextPtr radeon)
 {
-    { "fall",  DEBUG_FALLBACKS },
-    { "tex",   DEBUG_TEXTURE },
-    { "ioctl", DEBUG_IOCTL },
-    { "prim",  DEBUG_PRIMS },
-    { "vert",  DEBUG_VERTS },
-    { "state", DEBUG_STATE },
-    { "code",  DEBUG_CODEGEN },
-    { "vfmt",  DEBUG_VFMT },
-    { "vtxf",  DEBUG_VFMT },
-    { "verb",  DEBUG_VERBOSE },
-    { "dri",   DEBUG_DRI },
-    { "dma",   DEBUG_DMA },
-    { "san",   DEBUG_SANITY },
-    { "sync",  DEBUG_SYNC },
-    { NULL,    0 }
-};
+   r100ContextPtr rmesa = (r100ContextPtr)radeon;
+   
+   /* r100 always needs to emit ZBS to avoid TCL lockups */
+   rmesa->hw.zbs.dirty = 1;
+   radeon->hw.is_dirty = 1;
+}
 
+static void r100_vtbl_free_context(GLcontext *ctx)
+{
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   _mesa_vector4f_free( &rmesa->tcl.ObjClean );
+}
+
+static void r100_emit_query_finish(radeonContextPtr radeon)
+{
+   BATCH_LOCALS(radeon);
+   struct radeon_query_object *query = radeon->query.current;
+
+   BEGIN_BATCH_NO_AUTOSTATE(4);
+   OUT_BATCH(CP_PACKET0(RADEON_RB3D_ZPASS_ADDR, 0));
+   OUT_BATCH_RELOC(0, query->bo, query->curr_offset, 0, RADEON_GEM_DOMAIN_GTT, 0);
+   END_BATCH();
+   query->curr_offset += sizeof(uint32_t);
+   assert(query->curr_offset < RADEON_QUERY_PAGE_SIZE);
+   query->emitted_begin = GL_FALSE;
+}
+
+static void r100_init_vtbl(radeonContextPtr radeon)
+{
+   radeon->vtbl.get_lock = r100_get_lock;
+   radeon->vtbl.update_viewport_offset = radeonUpdateViewportOffset;
+   radeon->vtbl.emit_cs_header = r100_vtbl_emit_cs_header;
+   radeon->vtbl.swtcl_flush = r100_swtcl_flush;
+   radeon->vtbl.pre_emit_state = r100_vtbl_pre_emit_state;
+   radeon->vtbl.fallback = radeonFallback;
+   radeon->vtbl.free_context = r100_vtbl_free_context;
+   radeon->vtbl.emit_query_finish = r100_emit_query_finish;
+}
 
 /* Create the device specific context.
  */
 GLboolean
-radeonCreateContext( const __GLcontextModes *glVisual,
+r100CreateContext( const __GLcontextModes *glVisual,
                      __DRIcontextPrivate *driContextPriv,
                      void *sharedContextPrivate)
 {
    __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
    radeonScreenPtr screen = (radeonScreenPtr)(sPriv->private);
    struct dd_function_table functions;
-   radeonContextPtr rmesa;
-   GLcontext *ctx, *shareCtx;
+   r100ContextPtr rmesa;
+   GLcontext *ctx;
    int i;
    int tcl_mode, fthrottle_mode;
 
@@ -215,10 +224,12 @@ radeonCreateContext( const __GLcontextModes *glVisual,
    assert(screen);
 
    /* Allocate the Radeon context */
-   rmesa = (radeonContextPtr) CALLOC( sizeof(*rmesa) );
+   rmesa = (r100ContextPtr) CALLOC( sizeof(*rmesa) );
    if ( !rmesa )
       return GL_FALSE;
 
+   r100_init_vtbl(&rmesa->radeon);
+
    /* init exp fog table data */
    radeonInitStaticFogData();
    
@@ -226,12 +237,12 @@ radeonCreateContext( const __GLcontextModes *glVisual,
     * Do this here so that initialMaxAnisotropy is set before we create
     * the default textures.
     */
-   driParseConfigFiles (&rmesa->optionCache, &screen->optionCache,
+   driParseConfigFiles (&rmesa->radeon.optionCache, &screen->optionCache,
 			screen->driScreen->myNum, "radeon");
-   rmesa->initialMaxAnisotropy = driQueryOptionf(&rmesa->optionCache,
+   rmesa->radeon.initialMaxAnisotropy = driQueryOptionf(&rmesa->radeon.optionCache,
                                                  "def_max_anisotropy");
 
-   if ( driQueryOptionb( &rmesa->optionCache, "hyperz" ) ) {
+   if ( driQueryOptionb( &rmesa->radeon.optionCache, "hyperz" ) ) {
       if ( sPriv->drm_version.minor < 13 )
 	 fprintf( stderr, "DRM version 1.%d too old to support HyperZ, "
 			  "disabling.\n", sPriv->drm_version.minor );
@@ -246,65 +257,18 @@ radeonCreateContext( const __GLcontextModes *glVisual,
     * (the texture functions are especially important)
     */
    _mesa_init_driver_functions( &functions );
-   radeonInitDriverFuncs( &functions );
    radeonInitTextureFuncs( &functions );
+   radeonInitQueryObjFunctions(&functions);
 
-   /* Allocate the Mesa context */
-   if (sharedContextPrivate)
-      shareCtx = ((radeonContextPtr) sharedContextPrivate)->glCtx;
-   else
-      shareCtx = NULL;
-   rmesa->glCtx = _mesa_create_context(glVisual, shareCtx,
-                                       &functions, (void *) rmesa);
-   if (!rmesa->glCtx) {
-      FREE(rmesa);
-      return GL_FALSE;
-   }
-   driContextPriv->driverPrivate = rmesa;
-
-   /* Init radeon context data */
-   rmesa->dri.context = driContextPriv;
-   rmesa->dri.screen = sPriv;
-   rmesa->dri.drawable = NULL;
-   rmesa->dri.readable = NULL;
-   rmesa->dri.hwContext = driContextPriv->hHWContext;
-   rmesa->dri.hwLock = &sPriv->pSAREA->lock;
-   rmesa->dri.fd = sPriv->fd;
-   rmesa->dri.drmMinor = sPriv->drm_version.minor;
-
-   rmesa->radeonScreen = screen;
-   rmesa->sarea = (drm_radeon_sarea_t *)((GLubyte *)sPriv->pSAREA +
-				       screen->sarea_priv_offset);
-
-
-   rmesa->dma.buf0_address = rmesa->radeonScreen->buffers->list[0].address;
-
-   (void) memset( rmesa->texture_heaps, 0, sizeof( rmesa->texture_heaps ) );
-   make_empty_list( & rmesa->swapped );
-
-   rmesa->nr_heaps = screen->numTexHeaps;
-   for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
-      rmesa->texture_heaps[i] = driCreateTextureHeap( i, rmesa,
-	    screen->texSize[i],
-	    12,
-	    RADEON_NR_TEX_REGIONS,
-	    (drmTextureRegionPtr)rmesa->sarea->tex_list[i],
-	    & rmesa->sarea->tex_age[i],
-	    & rmesa->swapped,
-	    sizeof( radeonTexObj ),
-	    (destroy_texture_object_t *) radeonDestroyTexObj );
-
-      driSetTextureSwapCounterLocation( rmesa->texture_heaps[i],
-					& rmesa->c_textureSwaps );
+   if (!radeonInitContext(&rmesa->radeon, &functions,
+			  glVisual, driContextPriv,
+			  sharedContextPrivate)) {
+     FREE(rmesa);
+     return GL_FALSE;
    }
-   rmesa->texture_depth = driQueryOptioni (&rmesa->optionCache,
-					   "texture_depth");
-   if (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
-      rmesa->texture_depth = ( screen->cpp == 4 ) ?
-	 DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
 
-   rmesa->swtcl.RenderIndex = ~0;
-   rmesa->hw.all_dirty = GL_TRUE;
+   rmesa->radeon.swtcl.RenderIndex = ~0;
+   rmesa->radeon.hw.all_dirty = GL_TRUE;
 
    /* Set the maximum texture size small enough that we can guarentee that
     * all texture units can bind a maximal texture and have all of them in
@@ -312,26 +276,20 @@ radeonCreateContext( const __GLcontextModes *glVisual,
     * setting allow larger textures.
     */
 
-   ctx = rmesa->glCtx;
-   ctx->Const.MaxTextureUnits = driQueryOptioni (&rmesa->optionCache,
+   ctx = rmesa->radeon.glCtx;
+   ctx->Const.MaxTextureUnits = driQueryOptioni (&rmesa->radeon.optionCache,
 						 "texture_units");
    ctx->Const.MaxTextureImageUnits = ctx->Const.MaxTextureUnits;
    ctx->Const.MaxTextureCoordUnits = ctx->Const.MaxTextureUnits;
 
-   i = driQueryOptioni( &rmesa->optionCache, "allow_large_textures");
-
-   driCalculateMaxTextureLevels( rmesa->texture_heaps,
-				 rmesa->nr_heaps,
-				 & ctx->Const,
-				 4,
-				 11, /* max 2D texture size is 2048x2048 */
-				 8,  /* 256^3 */
-				 9,  /* \todo: max cube texture size seems to be 512x512(x6) */
-				 11, /* max rect texture size is 2048x2048. */
-				 12,
-				 GL_FALSE,
-				 i );
+   i = driQueryOptioni( &rmesa->radeon.optionCache, "allow_large_textures");
 
+   /* FIXME: When no memory manager is available we should set this 
+    * to some reasonable value based on texture memory pool size */
+   ctx->Const.MaxTextureLevels = 12;
+   ctx->Const.Max3DTextureLevels = 9;
+   ctx->Const.MaxCubeTextureLevels = 12;
+   ctx->Const.MaxTextureRectSize = 2048;
 
    ctx->Const.MaxTextureMaxAnisotropy = 16.0;
 
@@ -359,6 +317,10 @@ radeonCreateContext( const __GLcontextModes *glVisual,
 
    rmesa->boxes = 0;
 
+   ctx->Const.MaxDrawBuffers = 1;
+
+   _mesa_set_mvp_with_dp4( ctx, GL_TRUE );
+
    /* Initialize the software rasterizer and helper modules.
     */
    _swrast_CreateContext( ctx );
@@ -392,38 +354,42 @@ radeonCreateContext( const __GLcontextModes *glVisual,
    }
 
    driInitExtensions( ctx, card_extensions, GL_TRUE );
-   if (rmesa->radeonScreen->drmSupportsCubeMapsR100)
+   if (rmesa->radeon.radeonScreen->kernel_mm)
+     driInitExtensions(ctx, mm_extensions, GL_FALSE);
+   if (rmesa->radeon.radeonScreen->drmSupportsCubeMapsR100)
       _mesa_enable_extension( ctx, "GL_ARB_texture_cube_map" );
-   if (rmesa->glCtx->Mesa_DXTn) {
+   if (rmesa->radeon.glCtx->Mesa_DXTn) {
       _mesa_enable_extension( ctx, "GL_EXT_texture_compression_s3tc" );
       _mesa_enable_extension( ctx, "GL_S3_s3tc" );
    }
-   else if (driQueryOptionb (&rmesa->optionCache, "force_s3tc_enable")) {
+   else if (driQueryOptionb (&rmesa->radeon.optionCache, "force_s3tc_enable")) {
       _mesa_enable_extension( ctx, "GL_EXT_texture_compression_s3tc" );
    }
 
-   if (rmesa->dri.drmMinor >= 9)
+   if (rmesa->radeon.radeonScreen->kernel_mm || rmesa->radeon.dri.drmMinor >= 9)
       _mesa_enable_extension( ctx, "GL_NV_texture_rectangle");
 
+   if (!rmesa->radeon.radeonScreen->kernel_mm)
+      _mesa_disable_extension(ctx, "GL_ARB_occlusion_query");
+
    /* XXX these should really go right after _mesa_init_driver_functions() */
-   radeonInitIoctlFuncs( ctx );
-   radeonInitStateFuncs( ctx );
+   radeon_fbo_init(&rmesa->radeon);
    radeonInitSpanFuncs( ctx );
+   radeonInitIoctlFuncs( ctx );
+   radeonInitStateFuncs( ctx , rmesa->radeon.radeonScreen->kernel_mm );
    radeonInitState( rmesa );
    radeonInitSwtcl( ctx );
 
    _mesa_vector4f_alloc( &rmesa->tcl.ObjClean, 0, 
 			 ctx->Const.MaxArrayLockSize, 32 );
 
-   fthrottle_mode = driQueryOptioni(&rmesa->optionCache, "fthrottle_mode");
-   rmesa->iw.irq_seq = -1;
-   rmesa->irqsEmitted = 0;
-   rmesa->do_irqs = (rmesa->radeonScreen->irq != 0 &&
-		     fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS);
+   fthrottle_mode = driQueryOptioni(&rmesa->radeon.optionCache, "fthrottle_mode");
+   rmesa->radeon.iw.irq_seq = -1;
+   rmesa->radeon.irqsEmitted = 0;
+   rmesa->radeon.do_irqs = (rmesa->radeon.radeonScreen->irq != 0 &&
+			    fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS);
 
-   rmesa->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
-
-   (*sPriv->systemTime->getUST)( & rmesa->swap_ust );
+   rmesa->radeon.do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
 
 
 #if DO_DEBUG
@@ -431,206 +397,21 @@ radeonCreateContext( const __GLcontextModes *glVisual,
 				       debug_control );
 #endif
 
-   tcl_mode = driQueryOptioni(&rmesa->optionCache, "tcl_mode");
-   if (driQueryOptionb(&rmesa->optionCache, "no_rast")) {
+   tcl_mode = driQueryOptioni(&rmesa->radeon.optionCache, "tcl_mode");
+   if (driQueryOptionb(&rmesa->radeon.optionCache, "no_rast")) {
       fprintf(stderr, "disabling 3D acceleration\n");
       FALLBACK(rmesa, RADEON_FALLBACK_DISABLE, 1);
    } else if (tcl_mode == DRI_CONF_TCL_SW ||
-	      !(rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
-      if (rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
-	 rmesa->radeonScreen->chip_flags &= ~RADEON_CHIPSET_TCL;
+	      !(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
+      if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+	 rmesa->radeon.radeonScreen->chip_flags &= ~RADEON_CHIPSET_TCL;
 	 fprintf(stderr, "Disabling HW TCL support\n");
       }
-      TCL_FALLBACK(rmesa->glCtx, RADEON_TCL_FALLBACK_TCL_DISABLE, 1);
+      TCL_FALLBACK(rmesa->radeon.glCtx, RADEON_TCL_FALLBACK_TCL_DISABLE, 1);
    }
 
-   if (rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+   if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
 /*       _tnl_need_dlist_norm_lengths( ctx, GL_FALSE ); */
    }
    return GL_TRUE;
 }
-
-
-/* Destroy the device specific context.
- */
-/* Destroy the Mesa and driver specific context data.
- */
-void radeonDestroyContext( __DRIcontextPrivate *driContextPriv )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   radeonContextPtr rmesa = (radeonContextPtr) driContextPriv->driverPrivate;
-   radeonContextPtr current = ctx ? RADEON_CONTEXT(ctx) : NULL;
-
-   /* check if we're deleting the currently bound context */
-   if (rmesa == current) {
-      RADEON_FIREVERTICES( rmesa );
-      _mesa_make_current(NULL, NULL, NULL);
-   }
-
-   /* Free radeon context resources */
-   assert(rmesa); /* should never be null */
-   if ( rmesa ) {
-      GLboolean   release_texture_heaps;
-
-
-      release_texture_heaps = (rmesa->glCtx->Shared->RefCount == 1);
-      _swsetup_DestroyContext( rmesa->glCtx );
-      _tnl_DestroyContext( rmesa->glCtx );
-      _vbo_DestroyContext( rmesa->glCtx );
-      _swrast_DestroyContext( rmesa->glCtx );
-
-      radeonDestroySwtcl( rmesa->glCtx );
-      radeonReleaseArrays( rmesa->glCtx, ~0 );
-      if (rmesa->dma.current.buf) {
-	 radeonReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
-	 radeonFlushCmdBuf( rmesa, __FUNCTION__ );
-      }
-
-      _mesa_vector4f_free( &rmesa->tcl.ObjClean );
-
-      if (rmesa->state.scissor.pClipRects) {
-	 FREE(rmesa->state.scissor.pClipRects);
-	 rmesa->state.scissor.pClipRects = NULL;
-      }
-
-      if ( release_texture_heaps ) {
-         /* This share group is about to go away, free our private
-          * texture object data.
-          */
-         int i;
-
-         for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
-	    driDestroyTextureHeap( rmesa->texture_heaps[ i ] );
-	    rmesa->texture_heaps[ i ] = NULL;
-         }
-
-	 assert( is_empty_list( & rmesa->swapped ) );
-      }
-
-      /* free the Mesa context */
-      rmesa->glCtx->DriverCtx = NULL;
-      _mesa_destroy_context( rmesa->glCtx );
-
-      /* free the option cache */
-      driDestroyOptionCache (&rmesa->optionCache);
-
-      FREE( rmesa );
-   }
-}
-
-
-
-
-void
-radeonSwapBuffers( __DRIdrawablePrivate *dPriv )
-{
-
-   if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-      radeonContextPtr rmesa;
-      GLcontext *ctx;
-      rmesa = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-      ctx = rmesa->glCtx;
-      if (ctx->Visual.doubleBufferMode) {
-         _mesa_notifySwapBuffers( ctx );  /* flush pending rendering comands */
-
-         if ( rmesa->doPageFlip ) {
-            radeonPageFlip( dPriv );
-         }
-         else {
-	     radeonCopyBuffer( dPriv, NULL );
-         }
-      }
-   }
-   else {
-      /* XXX this shouldn't be an error but we can't handle it for now */
-      _mesa_problem(NULL, "%s: drawable has no context!", __FUNCTION__);
-   }
-}
-
-void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
-			 int x, int y, int w, int h )
-{
-    if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-	radeonContextPtr radeon;
-	GLcontext *ctx;
-
-	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-	ctx = radeon->glCtx;
-
-	if (ctx->Visual.doubleBufferMode) {
-	    drm_clip_rect_t rect;
-	    rect.x1 = x + dPriv->x;
-	    rect.y1 = (dPriv->h - y - h) + dPriv->y;
-	    rect.x2 = rect.x1 + w;
-	    rect.y2 = rect.y1 + h;
-	    _mesa_notifySwapBuffers(ctx);	/* flush pending rendering comands */
-	    radeonCopyBuffer(dPriv, &rect);
-	}
-    } else {
-	/* XXX this shouldn't be an error but we can't handle it for now */
-	_mesa_problem(NULL, "%s: drawable has no context!",
-		      __FUNCTION__);
-    }
-}
-
-/* Make context `c' the current context and bind it to the given
- * drawing and reading surfaces.
- */
-GLboolean
-radeonMakeCurrent( __DRIcontextPrivate *driContextPriv,
-                   __DRIdrawablePrivate *driDrawPriv,
-                   __DRIdrawablePrivate *driReadPriv )
-{
-   if ( driContextPriv ) {
-      radeonContextPtr newCtx = 
-	 (radeonContextPtr) driContextPriv->driverPrivate;
-
-      if (RADEON_DEBUG & DEBUG_DRI)
-	 fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *) newCtx->glCtx);
-
-      newCtx->dri.readable = driReadPriv;
-
-      if ( (newCtx->dri.drawable != driDrawPriv) ||
-           newCtx->lastStamp != driDrawPriv->lastStamp ) {
-	 if (driDrawPriv->swap_interval == (unsigned)-1) {
-	    driDrawPriv->vblFlags = (newCtx->radeonScreen->irq != 0)
-	       ? driGetDefaultVBlankFlags(&newCtx->optionCache)
-	       : VBLANK_FLAG_NO_IRQ;
-
-	    driDrawableInitVBlank( driDrawPriv );
-	 }
-
-	 newCtx->dri.drawable = driDrawPriv;
-
-	 radeonSetCliprects(newCtx);
-	 radeonUpdateViewportOffset( newCtx->glCtx );
-      }
-
-      _mesa_make_current( newCtx->glCtx,
-			  (GLframebuffer *) driDrawPriv->driverPrivate,
-			  (GLframebuffer *) driReadPriv->driverPrivate );
-
-      _mesa_update_state( newCtx->glCtx );
-   } else {
-      if (RADEON_DEBUG & DEBUG_DRI)
-	 fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
-      _mesa_make_current( NULL, NULL, NULL );
-   }
-
-   if (RADEON_DEBUG & DEBUG_DRI)
-      fprintf(stderr, "End %s\n", __FUNCTION__);
-   return GL_TRUE;
-}
-
-/* Force the context `c' to be unbound from its buffer.
- */
-GLboolean
-radeonUnbindContext( __DRIcontextPrivate *driContextPriv )
-{
-   radeonContextPtr rmesa = (radeonContextPtr) driContextPriv->driverPrivate;
-
-   if (RADEON_DEBUG & DEBUG_DRI)
-      fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *) rmesa->glCtx);
-
-   return GL_TRUE;
-}
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.h b/src/mesa/drivers/dri/radeon/radeon_context.h
index 53df766f8c..4e2c52c835 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_context.h
@@ -48,91 +48,23 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "drm.h"
 #include "radeon_drm.h"
 #include "texmem.h"
-
 #include "main/macros.h"
 #include "main/mtypes.h"
 #include "main/colormac.h"
-
-struct radeon_context;
-typedef struct radeon_context radeonContextRec;
-typedef struct radeon_context *radeonContextPtr;
-
-/* This union is used to avoid warnings/miscompilation
-   with float to uint32_t casts due to strict-aliasing */
-typedef union {
-	GLfloat f;
-	uint32_t ui32;
-} float_ui32_type;
-
-#include "radeon_lock.h"
 #include "radeon_screen.h"
-#include "main/mm.h"
-
-#include "math/m_vector.h"
-
-#define TEX_0   0x1
-#define TEX_1   0x2
-#define TEX_2   0x4
-#define TEX_ALL 0x7
-
-/* Rasterizing fallbacks */
-/* See correponding strings in r200_swtcl.c */
-#define RADEON_FALLBACK_TEXTURE		0x0001
-#define RADEON_FALLBACK_DRAW_BUFFER	0x0002
-#define RADEON_FALLBACK_STENCIL		0x0004
-#define RADEON_FALLBACK_RENDER_MODE	0x0008
-#define RADEON_FALLBACK_BLEND_EQ	0x0010
-#define RADEON_FALLBACK_BLEND_FUNC	0x0020
-#define RADEON_FALLBACK_DISABLE 	0x0040
-#define RADEON_FALLBACK_BORDER_MODE	0x0080
-
-/* The blit width for texture uploads
- */
-#define BLIT_WIDTH_BYTES 1024
 
-/* Use the templated vertex format:
- */
-#define COLOR_IS_RGBA
-#define TAG(x) radeon##x
-#include "tnl_dd/t_dd_vertex.h"
-#undef TAG
-
-typedef void (*radeon_tri_func) (radeonContextPtr,
-				 radeonVertex *,
-				 radeonVertex *, radeonVertex *);
-
-typedef void (*radeon_line_func) (radeonContextPtr,
-				  radeonVertex *, radeonVertex *);
+#include "radeon_common.h"
 
-typedef void (*radeon_point_func) (radeonContextPtr, radeonVertex *);
-
-struct radeon_colorbuffer_state {
-	GLuint clear;
-	int roundEnable;
-};
 
-struct radeon_depthbuffer_state {
-	GLuint clear;
-	GLfloat scale;
-};
+struct r100_context;
+typedef struct r100_context r100ContextRec;
+typedef struct r100_context *r100ContextPtr;
 
-struct radeon_scissor_state {
-	drm_clip_rect_t rect;
-	GLboolean enabled;
+#include "radeon_lock.h"
 
-	GLuint numClipRects;	/* Cliprects active */
-	GLuint numAllocedClipRects;	/* Cliprects available */
-	drm_clip_rect_t *pClipRects;
-};
 
-struct radeon_stencilbuffer_state {
-	GLboolean hwBuffer;
-	GLuint clear;		/* rb3d_stencilrefmask value */
-};
 
-struct radeon_stipple_state {
-	GLuint mask[32];
-};
+#define R100_TEX_ALL 0x7
 
 /* used for both tcl_vtx and vc_frmt tex bits (they are identical) */
 #define RADEON_ST_BIT(unit) \
@@ -141,42 +73,6 @@ struct radeon_stipple_state {
 #define RADEON_Q_BIT(unit) \
 (unit == 0 ? RADEON_CP_VC_FRMT_Q0 : (RADEON_CP_VC_FRMT_Q1 >> 2) << (2 * unit))
 
-typedef struct radeon_tex_obj radeonTexObj, *radeonTexObjPtr;
-
-/* Texture object in locally shared texture space.
- */
-struct radeon_tex_obj {
-	driTextureObject base;
-
-	GLuint bufAddr;		/* Offset to start of locally
-				   shared texture block */
-
-	GLuint dirty_state;	/* Flags (1 per texunit) for
-				   whether or not this texobj
-				   has dirty hardware state
-				   (pp_*) that needs to be
-				   brought into the
-				   texunit. */
-
-	drm_radeon_tex_image_t image[6][RADEON_MAX_TEXTURE_LEVELS];
-	/* Six, for the cube faces */
-
-	GLboolean image_override; /* Image overridden by GLX_EXT_tfp */
-
-	GLuint pp_txfilter;	/* hardware register values */
-	GLuint pp_txformat;
-	GLuint pp_txoffset;	/* Image location in texmem.
-				   All cube faces follow. */
-	GLuint pp_txsize;	/* npot only */
-	GLuint pp_txpitch;	/* npot only */
-	GLuint pp_border_color;
-	GLuint pp_cubic_faces;	/* cube face 1,2,3,4 log2 sizes */
-
-	GLboolean border_fallback;
-
-	GLuint tile_bits;	/* hw texture tile bits used on this texture */
-};
-
 struct radeon_texture_env_state {
 	radeonTexObjPtr texobj;
 	GLenum format;
@@ -187,17 +83,6 @@ struct radeon_texture_state {
 	struct radeon_texture_env_state unit[RADEON_MAX_TEXTURE_UNITS];
 };
 
-struct radeon_state_atom {
-	struct radeon_state_atom *next, *prev;
-	const char *name;	/* for debug */
-	int cmd_size;		/* size in bytes */
-	GLuint is_tcl;
-	int *cmd;		/* one or more cmd's */
-	int *lastcmd;		/* one or more cmd's */
-	GLboolean dirty;	/* dirty-mark in emit_state_list */
-	 GLboolean(*check) (GLcontext *);	/* is this state active? */
-};
-
 /* Trying to keep these relatively short as the variables are becoming
  * extravagently long.  Drop the driver name prefix off the front of
  * everything - I think we know which driver we're in by now, and keep the
@@ -410,10 +295,16 @@ struct radeon_state_atom {
 #define SHN_SHININESS      1
 #define SHN_STATE_SIZE     2
 
-struct radeon_hw_state {
-	/* Head of the linked list of state atoms. */
-	struct radeon_state_atom atomlist;
+#define R100_QUERYOBJ_CMD_0  0
+#define R100_QUERYOBJ_DATA_0 1
+#define R100_QUERYOBJ_CMDSIZE  2
+
+#define STP_CMD_0 0
+#define STP_DATA_0 1
+#define STP_CMD_1 2
+#define STP_STATE_SIZE 35
 
+struct r100_hw_state {
 	/* Hardware state, stored as cmdbuf commands:  
 	 *   -- Need to doublebuffer for
 	 *           - eliding noop statechange loops? (except line stipple count)
@@ -437,90 +328,19 @@ struct radeon_hw_state {
 	struct radeon_state_atom fog;
 	struct radeon_state_atom glt;
 	struct radeon_state_atom txr[3];	/* for NPOT */
-
-	int max_state_size;	/* Number of bytes necessary for a full state emit. */
-	GLboolean is_dirty, all_dirty;
-};
-
-struct radeon_state {
-	/* Derived state for internal purposes:
-	 */
-	struct radeon_colorbuffer_state color;
-	struct radeon_depthbuffer_state depth;
-	struct radeon_scissor_state scissor;
-	struct radeon_stencilbuffer_state stencil;
-	struct radeon_stipple_state stipple;
-	struct radeon_texture_state texture;
-};
-
-/* Need refcounting on dma buffers:
- */
-struct radeon_dma_buffer {
-	int refcount;		/* the number of retained regions in buf */
-	drmBufPtr buf;
-};
-
-#define GET_START(rvb) (rmesa->radeonScreen->gart_buffer_offset +			\
-			(rvb)->address - rmesa->dma.buf0_address +	\
-			(rvb)->start)
-
-/* A retained region, eg vertices for indexed vertices.
- */
-struct radeon_dma_region {
-	struct radeon_dma_buffer *buf;
-	char *address;		/* == buf->address */
-	int start, end, ptr;	/* offsets from start of buf */
-	int aos_start;
-	int aos_stride;
-	int aos_size;
-};
-
-struct radeon_dma {
-	/* Active dma region.  Allocations for vertices and retained
-	 * regions come from here.  Also used for emitting random vertices,
-	 * these may be flushed by calling flush_current();
-	 */
-	struct radeon_dma_region current;
-
-	void (*flush) (radeonContextPtr);
-
-	char *buf0_address;	/* start of buf[0], for index calcs */
-	GLuint nr_released_bufs;	/* flush after so many buffers released */
+	struct radeon_state_atom stp;
 };
 
-struct radeon_dri_mirror {
-	__DRIcontextPrivate *context;	/* DRI context */
-	__DRIscreenPrivate *screen;	/* DRI screen */
-
-   /**
-    * DRI drawable bound to this context for drawing.
-    */
-	__DRIdrawablePrivate *drawable;
 
-   /**
-    * DRI drawable bound to this context for reading.
-    */
-	__DRIdrawablePrivate *readable;
-
-	drm_context_t hwContext;
-	drm_hw_lock_t *hwLock;
-	int fd;
-	int drmMinor;
+struct r100_state {
+	struct radeon_texture_state texture;
 };
 
 #define RADEON_CMD_BUF_SZ  (8*1024)
-
-struct radeon_store {
-	GLuint statenr;
-	GLuint primnr;
-	char cmd_buf[RADEON_CMD_BUF_SZ];
-	int cmd_used;
-	int elts_start;
-};
-
+#define R200_ELT_BUF_SZ  (8*1024)
 /* radeon_tcl.c
  */
-struct radeon_tcl_info {
+struct r100_tcl_info {
 	GLuint vertex_format;
 	GLuint hw_primitive;
 
@@ -529,30 +349,18 @@ struct radeon_tcl_info {
 	 */
 	GLvector4f ObjClean;
 
-	struct radeon_dma_region *aos_components[8];
-	GLuint nr_aos_components;
-
 	GLuint *Elts;
 
-	struct radeon_dma_region indexed_verts;
-	struct radeon_dma_region obj;
-	struct radeon_dma_region rgba;
-	struct radeon_dma_region spec;
-	struct radeon_dma_region fog;
-	struct radeon_dma_region tex[RADEON_MAX_TEXTURE_UNITS];
-	struct radeon_dma_region norm;
+        int elt_cmd_offset;
+	int elt_cmd_start;
+        int elt_used;
 };
 
 /* radeon_swtcl.c
  */
-struct radeon_swtcl_info {
-	GLuint RenderIndex;
-	GLuint vertex_size;
+struct r100_swtcl_info {
 	GLuint vertex_format;
 
-	struct tnl_attr_map vertex_attrs[VERT_ATTRIB_MAX];
-	GLuint vertex_attr_count;
-
 	GLubyte *verts;
 
 	/* Fallback rasterization functions
@@ -561,10 +369,6 @@ struct radeon_swtcl_info {
 	radeon_line_func draw_line;
 	radeon_tri_func draw_tri;
 
-	GLuint hw_primitive;
-	GLenum render_primitive;
-	GLuint numverts;
-
    /**
     * Offset of the 4UB color data within a hardware (swtcl) vertex.
     */
@@ -576,22 +380,9 @@ struct radeon_swtcl_info {
 	GLuint specoffset;
 
 	GLboolean needproj;
-
-	struct radeon_dma_region indexed_verts;
 };
 
-struct radeon_ioctl {
-	GLuint vertex_offset;
-	GLuint vertex_size;
-};
 
-#define RADEON_MAX_PRIMS 64
-
-struct radeon_prim {
-	GLuint start;
-	GLuint end;
-	GLuint prim;
-};
 
 /* A maximum total of 20 elements per vertex:  3 floats for position, 3
  * floats for normal, 4 floats for color, 4 bytes for secondary color,
@@ -602,59 +393,18 @@ struct radeon_prim {
  */
 #define RADEON_MAX_VERTEX_SIZE 20
 
-struct radeon_context {
-	GLcontext *glCtx;	/* Mesa context */
+struct r100_context {
+        struct radeon_context radeon;
 
 	/* Driver and hardware state management
 	 */
-	struct radeon_hw_state hw;
-	struct radeon_state state;
-
-	/* Texture object bookkeeping
-	 */
-	unsigned nr_heaps;
-	driTexHeap *texture_heaps[RADEON_NR_TEX_HEAPS];
-	driTextureObject swapped;
-	int texture_depth;
-	float initialMaxAnisotropy;
-
-	/* Rasterization and vertex state:
-	 */
-	GLuint TclFallback;
-	GLuint Fallback;
-	GLuint NewGLState;
-	 DECLARE_RENDERINPUTS(tnl_index_bitset);	/* index of bits for last tnl_install_attrs */
+	struct r100_hw_state hw;
+	struct r100_state state;
 
 	/* Vertex buffers
 	 */
 	struct radeon_ioctl ioctl;
-	struct radeon_dma dma;
 	struct radeon_store store;
-	/* A full state emit as of the first state emit in the main store, in case
-	 * the context is lost.
-	 */
-	struct radeon_store backup_store;
-
-	/* Page flipping
-	 */
-	GLuint doPageFlip;
-
-	/* Busy waiting
-	 */
-	GLuint do_usleeps;
-	GLuint do_irqs;
-	GLuint irqsEmitted;
-	drm_radeon_irq_wait_t iw;
-
-	/* Drawable, cliprect and scissor information
-	 */
-	GLuint numClipRects;	/* Cliprects for the draw buffer */
-	drm_clip_rect_t *pClipRects;
-	unsigned int lastStamp;
-	GLboolean lost_context;
-	GLboolean save_on_next_emit;
-	radeonScreenPtr radeonScreen;	/* Screen private DRI data */
-	drm_radeon_sarea_t *sarea;	/* Private SAREA data */
 
 	/* TCL stuff
 	 */
@@ -667,29 +417,13 @@ struct radeon_context {
 	GLmatrix tmpmat[RADEON_MAX_TEXTURE_UNITS];
 	GLuint last_ReallyEnabled;
 
-	/* VBI
-	 */
-	int64_t swap_ust;
-	int64_t swap_missed_ust;
-
-	GLuint swap_count;
-	GLuint swap_missed_count;
-
 	/* radeon_tcl.c
 	 */
-	struct radeon_tcl_info tcl;
+	struct r100_tcl_info tcl;
 
 	/* radeon_swtcl.c
 	 */
-	struct radeon_swtcl_info swtcl;
-
-	/* Mirrors of some DRI state
-	 */
-	struct radeon_dri_mirror dri;
-
-	/* Configuration cache
-	 */
-	driOptionCache optionCache;
+	struct r100_swtcl_info swtcl;
 
 	GLboolean using_hyperz;
 	GLboolean texmicrotile;
@@ -703,61 +437,19 @@ struct radeon_context {
 	GLuint c_textureSwaps;
 	GLuint c_textureBytes;
 	GLuint c_vertexBuffers;
+
 };
 
-#define RADEON_CONTEXT(ctx)		((radeonContextPtr)(ctx->DriverCtx))
-
-static INLINE GLuint radeonPackColor(GLuint cpp,
-                                     GLubyte r, GLubyte g,
-                                     GLubyte b, GLubyte a)
-{
-	switch (cpp) {
-	case 2:
-		return PACK_COLOR_565(r, g, b);
-	case 4:
-		return PACK_COLOR_8888(a, r, g, b);
-	default:
-		return 0;
-	}
-}
+
+#define R100_CONTEXT(ctx)		((r100ContextPtr)(ctx->DriverCtx))
+
 
 #define RADEON_OLD_PACKETS 1
 
-extern void radeonDestroyContext(__DRIcontextPrivate * driContextPriv);
-extern GLboolean radeonCreateContext(const __GLcontextModes * glVisual,
-				     __DRIcontextPrivate * driContextPriv,
-				     void *sharedContextPrivate);
-extern void radeonSwapBuffers(__DRIdrawablePrivate * dPriv);
-extern void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
-				int x, int y, int w, int h);
-extern GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
-				   __DRIdrawablePrivate * driDrawPriv,
-				   __DRIdrawablePrivate * driReadPriv);
-extern GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv);
-
-/* ================================================================
- * Debugging:
- */
-#define DO_DEBUG		1
-
-#if DO_DEBUG
-extern int RADEON_DEBUG;
-#else
-#define RADEON_DEBUG		0
-#endif
-
-#define DEBUG_TEXTURE	0x0001
-#define DEBUG_STATE	0x0002
-#define DEBUG_IOCTL	0x0004
-#define DEBUG_PRIMS	0x0008
-#define DEBUG_VERTS	0x0010
-#define DEBUG_FALLBACKS	0x0020
-#define DEBUG_VFMT	0x0040
-#define DEBUG_CODEGEN	0x0080
-#define DEBUG_VERBOSE	0x0100
-#define DEBUG_DRI       0x0200
-#define DEBUG_DMA       0x0400
-#define DEBUG_SANITY    0x0800
-#define DEBUG_SYNC      0x1000
+extern GLboolean r100CreateContext( const __GLcontextModes *glVisual,
+				    __DRIcontextPrivate *driContextPriv,
+				    void *sharedContextPrivate);
+  
+
 
 #endif				/* __RADEON_CONTEXT_H__ */
diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_drm.h b/src/mesa/drivers/dri/radeon/radeon_cs_drm.h
new file mode 100644
index 0000000000..ab4eca31a3
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_cs_drm.h
@@ -0,0 +1,246 @@
+/* 
+ * Copyright © 2008 Nicolai Haehnle
+ * Copyright © 2008 Jérôme Glisse
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Aapo Tahkola <aet@rasterburn.org>
+ *      Nicolai Haehnle <prefect_@gmx.net>
+ *      Jérôme Glisse <glisse@freedesktop.org>
+ */
+#ifndef RADEON_CS_H
+#define RADEON_CS_H
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include "drm.h"
+#include "radeon_drm.h"
+
+struct radeon_cs_reloc {
+    struct radeon_bo    *bo;            /* buffer object the relocation refers to */
+    uint32_t            read_domain;    /* read domain bits requested for bo */
+    uint32_t            write_domain;   /* write domain bits requested for bo */
+    uint32_t            flags;          /* flags passed along with the relocation request */
+};
+
+
+#define RADEON_CS_SPACE_OK 0
+#define RADEON_CS_SPACE_OP_TO_BIG 1
+#define RADEON_CS_SPACE_FLUSH 2
+
+struct radeon_cs_space_check {  /* element type of radeon_cs::bos */
+    struct radeon_bo *bo;
+    uint32_t read_domains;
+    uint32_t write_domain;
+    uint32_t new_accounted;     /* NOTE(review): meaning not visible in this header -- confirm in radeon_cs_space_drm.c */
+};
+
+#define MAX_SPACE_BOS (32)
+
+struct radeon_cs_manager;
+
+struct radeon_cs {
+    struct radeon_cs_manager    *csm;               /* manager whose funcs table the radeon_cs_* wrappers use */
+    void                        *relocs;            /* backend-private relocation table */
+    uint32_t                    *packets;           /* command dwords */
+    unsigned                    crelocs;            /* number of entries in relocs */
+    unsigned                    relocs_total_size;
+    unsigned                    cdw;                /* dwords written so far (next write index) */
+    unsigned                    ndw;                /* capacity of packets, in dwords */
+    int                         section;            /* non-zero between cs_begin and cs_end */
+    unsigned                    section_ndw;        /* dword count declared at cs_begin */
+    unsigned                    section_cdw;        /* dwords written since cs_begin */
+    const char                  *section_file;      /* call site of cs_begin, for diagnostics */
+    const char                  *section_func;
+    int                         section_line;
+    struct radeon_cs_space_check bos[MAX_SPACE_BOS]; /* presumably the persistent BO list of radeon_cs_space_* -- confirm */
+    int                         bo_count;           /* presumably the number of used entries in bos -- confirm */
+    void                        (*space_flush_fn)(void *);  /* set by radeon_cs_space_set_flush */
+    void                        *space_flush_data;  /* set by radeon_cs_space_set_flush */
+};
+
+/* cs functions */
+struct radeon_cs_funcs {
+    struct radeon_cs *(*cs_create)(struct radeon_cs_manager *csm,
+                                   uint32_t ndw);
+    int (*cs_write_reloc)(struct radeon_cs *cs,
+                          struct radeon_bo *bo,
+                          uint32_t read_domain,
+                          uint32_t write_domain,
+                          uint32_t flags);
+    int (*cs_begin)(struct radeon_cs *cs,
+                    uint32_t ndw,
+                    const char *file,
+                    const char *func,
+                    int line);
+    int (*cs_end)(struct radeon_cs *cs,
+                  const char *file,
+                  const char *func,
+                  int line);
+    int (*cs_emit)(struct radeon_cs *cs);
+    int (*cs_destroy)(struct radeon_cs *cs);
+    int (*cs_erase)(struct radeon_cs *cs);
+    int (*cs_need_flush)(struct radeon_cs *cs);
+    void (*cs_print)(struct radeon_cs *cs, FILE *file);
+};
+
+struct radeon_cs_manager {
+    struct radeon_cs_funcs  *funcs;     /* backend hook table */
+    int                     fd;         /* file descriptor the backend issues DRM commands on */
+    int32_t vram_limit, gart_limit;     /* written by radeon_cs_set_limit */
+    int32_t vram_write_used, gart_write_used;
+    int32_t read_used;
+};
+
+/* Forward to csm's cs_create hook and return the command stream it yields. */
+static inline struct radeon_cs *radeon_cs_create(struct radeon_cs_manager *csm, uint32_t ndw)
+{
+    return csm->funcs->cs_create(csm, ndw);
+}
+
+/* Forward to the cs_write_reloc hook of the manager that owns cs and return
+ * its result unchanged. */
+static inline int radeon_cs_write_reloc(struct radeon_cs *cs,
+                                        struct radeon_bo *bo,
+                                        uint32_t read_domain,
+                                        uint32_t write_domain,
+                                        uint32_t flags)
+{
+    struct radeon_cs_funcs *hooks = cs->csm->funcs;
+
+    return hooks->cs_write_reloc(cs, bo, read_domain, write_domain, flags);
+}
+
+/* Forward to the owning manager's cs_begin hook; file/func/line name the call site. */
+static inline int radeon_cs_begin(struct radeon_cs *cs, uint32_t ndw,
+                                  const char *file, const char *func, int line)
+{
+    struct radeon_cs_funcs *hooks = cs->csm->funcs;
+
+    return hooks->cs_begin(cs, ndw, file, func, line);
+}
+
+/* Forward to the owning manager's cs_end hook and return its result. */
+static inline int radeon_cs_end(struct radeon_cs *cs,
+                                const char *file, const char *func, int line)
+{
+    struct radeon_cs_funcs *hooks = cs->csm->funcs;
+    return hooks->cs_end(cs, file, func, line);
+}
+
+static inline int radeon_cs_emit(struct radeon_cs *cs)
+{
+    return cs->csm->funcs->cs_emit(cs); /* dispatch to the owning manager's cs_emit hook */
+}
+
+static inline int radeon_cs_destroy(struct radeon_cs *cs)
+{
+    return cs->csm->funcs->cs_destroy(cs); /* dispatch to the owning manager's cs_destroy hook */
+}
+
+static inline int radeon_cs_erase(struct radeon_cs *cs)
+{
+    return cs->csm->funcs->cs_erase(cs); /* dispatch to the owning manager's cs_erase hook */
+}
+
+static inline int radeon_cs_need_flush(struct radeon_cs *cs)
+{
+    return cs->csm->funcs->cs_need_flush(cs); /* dispatch to the owning manager's cs_need_flush hook */
+}
+
+static inline void radeon_cs_print(struct radeon_cs *cs, FILE *file)
+{
+    cs->csm->funcs->cs_print(cs, file); /* dispatch to the owning manager's cs_print hook */
+}
+
+static inline void radeon_cs_set_limit(struct radeon_cs *cs, uint32_t domain, uint32_t limit)
+{
+    /* VRAM has its own limit; any other domain value updates the GART limit. */
+    if (domain != RADEON_GEM_DOMAIN_VRAM)
+        cs->csm->gart_limit = limit;
+    else
+        cs->csm->vram_limit = limit;
+}
+
+static inline void radeon_cs_write_dword(struct radeon_cs *cs, uint32_t dword)
+{
+    cs->packets[cs->cdw] = dword;   /* append; no capacity check is done here */
+    cs->cdw += 1;
+    if (cs->section)
+        cs->section_cdw += 1;       /* counted against the open section */
+}
+
+static inline void radeon_cs_write_qword(struct radeon_cs *cs, uint64_t qword)
+{
+    /* Store the 64-bit value as two consecutive dwords in host byte order. */
+    memcpy(&cs->packets[cs->cdw], &qword, sizeof(qword));
+    cs->cdw += 2;
+    if (cs->section) {
+        cs->section_cdw += 2;
+    }
+}
+
+static inline void radeon_cs_write_table(struct radeon_cs *cs, void *data, uint32_t size)
+{
+    memcpy(&cs->packets[cs->cdw], data, size * 4);  /* size is a dword count */
+    cs->cdw += size;
+    if (cs->section) {
+        cs->section_cdw += size;
+    }
+}
+
+static inline void radeon_cs_space_set_flush(struct radeon_cs *cs, void (*fn)(void *), void *data)
+{
+    cs->space_flush_data = data;    /* stored next to the callback below */
+    cs->space_flush_fn = fn;
+}
+
+
+/*
+ * add a persistent BO to the list
+ * a persistent BO is one that will be referenced across flushes,
+ * i.e. colorbuffer, textures etc.
+ * They get reset when a new "operation" happens, where an operation
+ * is a state emission with a color/textures etc followed by a bunch of vertices.
+ */
+void radeon_cs_space_add_persistent_bo(struct radeon_cs *cs,
+				       struct radeon_bo *bo,
+				       uint32_t read_domains,
+				       uint32_t write_domain);
+
+/* reset the persistent BO list */
+void radeon_cs_space_reset_bos(struct radeon_cs *cs);
+
+/* do a space check with the current persistent BO list */
+int radeon_cs_space_check(struct radeon_cs *cs);
+
+/* do a space check with the current persistent BO list and a temporary BO
+ * a temporary BO is like a DMA buffer, which  gets flushed with the
+ * command buffer */
+int radeon_cs_space_check_with_bo(struct radeon_cs *cs,
+				  struct radeon_bo *bo,
+				  uint32_t read_domains,
+				  uint32_t write_domain);
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
new file mode 100644
index 0000000000..f1addb299e
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
@@ -0,0 +1,409 @@
+/* 
+ * Copyright © 2008 Nicolai Haehnle
+ * Copyright © 2008 Jérôme Glisse
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Aapo Tahkola <aet@rasterburn.org>
+ *      Nicolai Haehnle <prefect_@gmx.net>
+ *      Jérôme Glisse <glisse@freedesktop.org>
+ */
+#include <errno.h>
+
+#include "radeon_bocs_wrapper.h"
+#include "radeon_common.h"
+
+struct cs_manager_legacy {
+    struct radeon_cs_manager    base;   /* first member: the ctor returns this object cast to the base type */
+    struct radeon_context       *ctx;   /* context handed to the ctor */
+    /* hack for scratch stuff */
+    uint32_t                    pending_age;    /* cs_emit passes this field's address to the kernel */
+    uint32_t                    pending_count;  /* set to 1 by cs_emit on R300-class screens */
+
+
+};
+
+struct cs_reloc_legacy {
+    struct radeon_cs_reloc  base;       /* bo, domains and flags of this relocation */
+    uint32_t                cindices;   /* number of entries in indices */
+    uint32_t                *indices;   /* positions in cs->packets that cs_process_relocs patches */
+};
+
+
+/* Allocate a command stream owned by csm; returns NULL when allocation fails. */
+static struct radeon_cs *cs_create(struct radeon_cs_manager *csm, uint32_t ndw)
+{
+    struct radeon_cs *stream = calloc(1, sizeof(*stream));
+
+    if (!stream)
+        return NULL;
+
+    stream->csm = csm;
+    /* capacity is ndw rounded up to a multiple of 1024 dwords */
+    stream->ndw = (ndw + 0x3FF) & (~0x3FF);
+    stream->packets = malloc(4 * stream->ndw);
+    if (!stream->packets) {
+        free(stream);
+        return NULL;
+    }
+    stream->relocs_total_size = 0;
+    return stream;
+}
+
+static int cs_write_reloc(struct radeon_cs *cs,
+                          struct radeon_bo *bo,
+                          uint32_t read_domain,
+                          uint32_t write_domain,
+                          uint32_t flags)
+{
+    struct cs_reloc_legacy *relocs;
+    int i;
+    /* Record that the dword just written (index cs->cdw - 1) must be relocated against bo. */
+    relocs = (struct cs_reloc_legacy *)cs->relocs;
+    /* exactly one of read_domain / write_domain must be set, else -EINVAL */
+    if ((read_domain && write_domain) || (!read_domain && !write_domain)) {
+        /* within one CS a bo can be in a read domain or in a write domain,
+         * but not in both at the same time
+         */
+        return -EINVAL;
+    }
+    if (read_domain == RADEON_GEM_DOMAIN_CPU) {
+        return -EINVAL;
+    }
+    if (write_domain == RADEON_GEM_DOMAIN_CPU) {
+        return -EINVAL;
+    }
+    /* check if bo is already referenced */
+    for(i = 0; i < cs->crelocs; i++) {
+        uint32_t *indices;
+
+        if (relocs[i].base.bo->handle == bo->handle) {
+            /* The arguments were already checked to name exactly one of the
+             * read or write domains, so here we only need to make sure the
+             * new request is of the same kind as the recorded one: a bo
+             * recorded for reading must be requested for reading again, and
+             * likewise for writing.
+             */
+            if (relocs[i].base.read_domain && !read_domain) {
+                return -EINVAL;
+            }
+            if (relocs[i].base.write_domain && !write_domain) {
+                return -EINVAL;
+            }
+            relocs[i].base.read_domain |= read_domain;
+            relocs[i].base.write_domain |= write_domain;
+            /* remember one more packet index to patch for this bo */
+            relocs[i].cindices++;
+            indices = (uint32_t*)realloc(relocs[i].indices,
+                                         relocs[i].cindices * 4);
+            if (indices == NULL) {
+                relocs[i].cindices -= 1;
+                return -ENOMEM;
+            }
+            relocs[i].indices = indices;
+            relocs[i].indices[relocs[i].cindices - 1] = cs->cdw - 1;
+            return 0;
+        }
+    }
+    /* first use of this bo in the CS: append a new entry to the table */
+    relocs = (struct cs_reloc_legacy*)
+             realloc(cs->relocs,
+                     sizeof(struct cs_reloc_legacy) * (cs->crelocs + 1));
+    if (relocs == NULL) {
+        return -ENOMEM;
+    }
+    cs->relocs = relocs;
+    relocs[cs->crelocs].base.bo = bo;
+    relocs[cs->crelocs].base.read_domain = read_domain;
+    relocs[cs->crelocs].base.write_domain = write_domain;
+    relocs[cs->crelocs].base.flags = flags;
+    relocs[cs->crelocs].indices = (uint32_t*)malloc(4);
+    if (relocs[cs->crelocs].indices == NULL) {
+        return -ENOMEM;
+    }
+    relocs[cs->crelocs].indices[0] = cs->cdw - 1;
+    relocs[cs->crelocs].cindices = 1;
+    cs->relocs_total_size += radeon_bo_legacy_relocs_size(bo);
+    cs->crelocs++;
+    radeon_bo_ref(bo); /* one reference per table entry; cs_set_age drops it */
+    return 0;
+}
+
+/* Open a section of exactly ndw dwords, growing cs->packets when it is too
+ * small.  Returns 0, -EPIPE if a section is already open, or -ENOMEM.
+ * file/func/line are remembered for the diagnostics printed by cs_end. */
+static int cs_begin(struct radeon_cs *cs, uint32_t ndw,
+                    const char *file, const char *func, int line)
+{
+    if (cs->section) {
+        fprintf(stderr, "CS already in a section(%s,%s,%d)\n",
+                cs->section_file, cs->section_func, cs->section_line);
+        fprintf(stderr, "CS can't start section(%s,%s,%d)\n",
+                file, func, line);
+        return -EPIPE;
+    }
+    cs->section = 1;
+    cs->section_ndw = ndw;
+    cs->section_cdw = 0;
+    cs->section_file = file;
+    cs->section_func = func;
+    cs->section_line = line;
+
+    if (cs->cdw + ndw > cs->ndw) {
+        uint32_t tmp, *ptr;
+        /* Round the required size up to a multiple of 1024 dwords.  Masking
+         * with ~ndw is only a valid round-up for values of the form 2^k - 1
+         * and could leave the buffer smaller than cdw + ndw. */
+        tmp = (cs->cdw + ndw + 0x3FF) & (~0x3FF);
+        ptr = (uint32_t*)realloc(cs->packets, 4 * tmp);
+        if (ptr == NULL) {
+            return -ENOMEM;
+        }
+        cs->packets = ptr;
+        cs->ndw = tmp;
+    }
+
+    return 0;
+}
+
+/* Close the section opened by cs_begin and check its declared dword count. */
+static int cs_end(struct radeon_cs *cs,
+                  const char *file, const char *func, int line)
+{
+    if (!cs->section) {
+        fprintf(stderr, "CS no section to end at (%s,%s,%d)\n",
+                file, func, line);
+        return -EPIPE;
+    }
+    cs->section = 0;
+    /* everything written since cs_begin must match the size declared there */
+    if (cs->section_cdw == cs->section_ndw) {
+        return 0;
+    }
+    fprintf(stderr, "CS section size missmatch start at (%s,%s,%d) %d vs %d\n",
+            cs->section_file, cs->section_func, cs->section_line,
+            cs->section_ndw, cs->section_cdw);
+    fprintf(stderr, "CS section end at (%s,%s,%d)\n",
+            file, func, line);
+    return -EPIPE;
+}
+
+/*
+ * Patch every recorded relocation slot in cs->packets with the start offset
+ * that radeon_bo_legacy_validate reports for its buffer object.
+ *
+ * Returns 0 on success or the non-zero validate result.  The whole pass is
+ * restarted from the first relocation when validate returns -EAGAIN.
+ * NOTE(review): slots patched before a restart get soffset added a second
+ * time on the next pass -- confirm this is intended.
+ */
+static int cs_process_relocs(struct radeon_cs *cs)
+{
+    struct cs_reloc_legacy *relocs = (struct cs_reloc_legacy *)cs->relocs;
+    int i, j, r;
+
+restart:
+    for (i = 0; i < cs->crelocs; i++) {
+        struct radeon_bo *bo = relocs[i].base.bo;
+
+        for (j = 0; j < relocs[i].cindices; j++) {
+            uint32_t soffset, eoffset;
+            uint32_t *slot = &cs->packets[relocs[i].indices[j]];
+
+            r = radeon_bo_legacy_validate(bo, &soffset, &eoffset);
+            if (r == -EAGAIN) {
+                goto restart;
+            }
+            if (r) {
+                fprintf(stderr, "validated %p [0x%08X, 0x%08X]\n",
+                        bo, soffset, eoffset);
+                return r;
+            }
+            *slot += soffset;
+            if (*slot >= eoffset) {
+                fprintf(stderr, "validated %p [0x%08X, 0x%08X]\n",
+                        bo, soffset, eoffset);
+                fprintf(stderr, "above end: %p 0x%08X 0x%08X\n",
+                        bo, *slot, eoffset);
+                exit(0);
+                return -EINVAL;
+            }
+        }
+    }
+    return 0;
+}
+
+/* Mark each relocated bo pending at the manager's pending_age, then unref it. */
+static int cs_set_age(struct radeon_cs *cs)
+{
+    struct cs_manager_legacy *manager = (struct cs_manager_legacy *)cs->csm;
+    struct cs_reloc_legacy *entry = (struct cs_reloc_legacy *)cs->relocs;
+    unsigned i;
+
+    for (i = 0; i < cs->crelocs; i++, entry++) {
+        radeon_bo_legacy_pending(entry->base.bo, manager->pending_age);
+        radeon_bo_unref(entry->base.bo);
+    }
+    return 0;
+}
+
+static int cs_emit(struct radeon_cs *cs)
+{
+    struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
+    drm_radeon_cmd_buffer_t cmd;
+    drm_r300_cmd_header_t age;
+    uint64_t ull;
+    int r;
+    /* Relocate the packets and submit them with the DRM_RADEON_CMDBUF command. */
+    csm->ctx->vtbl.emit_cs_header(cs, csm->ctx);
+
+    /* append buffer age */
+    if ( IS_R300_CLASS(csm->ctx->radeonScreen) )
+    { 
+      age.scratch.cmd_type = R300_CMD_SCRATCH;
+      /* Scratch register 2 corresponds to what radeonGetAge polls */
+      csm->pending_age = 0;
+      csm->pending_count = 1;
+      ull = (uint64_t) (intptr_t) &csm->pending_age;
+      age.scratch.reg = 2;
+      age.scratch.n_bufs = 1;
+      age.scratch.flags = 0;
+      radeon_cs_write_dword(cs, age.u);
+      radeon_cs_write_qword(cs, ull);
+      radeon_cs_write_dword(cs, 0);
+    }
+    /* patch buffer offsets into the packets before they are submitted */
+    r = cs_process_relocs(cs);
+    if (r) {
+        return 0; /* NOTE(review): relocation failure is reported to the caller as success -- confirm */
+    }
+
+    cmd.buf = (char *)cs->packets;
+    cmd.bufsz = cs->cdw * 4;
+    if (csm->ctx->state.scissor.enabled) {
+        cmd.nbox = csm->ctx->state.scissor.numClipRects;
+        cmd.boxes = (drm_clip_rect_t *) csm->ctx->state.scissor.pClipRects;
+    } else {
+        cmd.nbox = csm->ctx->numClipRects;
+        cmd.boxes = (drm_clip_rect_t *) csm->ctx->pClipRects;
+    }
+    /* NOTE(review): every early return in this function skips cs_set_age, so the bos are not unreferenced -- confirm */
+    //dump_cmdbuf(cs);
+
+    r = drmCommandWrite(cs->csm->fd, DRM_RADEON_CMDBUF, &cmd, sizeof(cmd));
+    if (r) {
+        return r;
+    }
+    if ((!IS_R300_CLASS(csm->ctx->radeonScreen)) &&
+        (!IS_R600_CLASS(csm->ctx->radeonScreen))) { /* +r6/r7 : No irq for r6/r7 yet. */
+	drm_radeon_irq_emit_t emit_cmd;
+	emit_cmd.irq_seq = (int*)&csm->pending_age;
+	r = drmCommandWrite(cs->csm->fd, DRM_RADEON_IRQ_EMIT, &emit_cmd, sizeof(emit_cmd));
+	if (r) {
+		return r;
+	}
+    }
+    cs_set_age(cs); /* marks each bo pending at pending_age and unreferences it */
+    /* reset the manager's *_used counters after a successful submit */
+    cs->csm->read_used = 0;
+    cs->csm->vram_write_used = 0;
+    cs->csm->gart_write_used = 0;
+    return 0;
+}
+
+/* Free the indices array of each of the crelocs entries (not the table itself). */
+static inline void cs_free_reloc(void *relocs_p, int crelocs)
+{
+    struct cs_reloc_legacy *entry = relocs_p;
+    if (entry == NULL)
+        return;
+    while (crelocs-- > 0)
+        free((entry++)->indices);
+}
+
+static int cs_destroy(struct radeon_cs *cs)
+{
+    free(cs->packets);                      /* dword buffer */
+    cs_free_reloc(cs->relocs, cs->crelocs); /* per-entry index arrays */
+    free(cs->relocs);                       /* relocation table itself */
+    free(cs);
+    return 0;
+}
+
+static int cs_erase(struct radeon_cs *cs)
+{
+    cs_free_reloc(cs->relocs, cs->crelocs); /* per-entry index arrays */
+    free(cs->relocs);
+    cs->relocs = NULL;
+    cs->crelocs = 0;
+    cs->relocs_total_size = 0;
+    cs->section = 0;
+    cs->cdw = 0;        /* packets and ndw are kept, so the buffer is reused */
+    return 0;
+}
+
+static int cs_need_flush(struct radeon_cs *cs)
+{
+    (void)cs;
+    /* always 0: flushing on BO usage is now handled by the higher levels */
+    return 0;
+}
+
+static void cs_print(struct radeon_cs *cs, FILE *file)
+{ /* no-op: this backend writes nothing to file */
+}
+
+static struct radeon_cs_funcs radeon_cs_legacy_funcs = {
+    .cs_create      = cs_create,
+    .cs_write_reloc = cs_write_reloc,
+    .cs_begin       = cs_begin,
+    .cs_end         = cs_end,
+    .cs_emit        = cs_emit,
+    .cs_destroy     = cs_destroy,
+    .cs_erase       = cs_erase,
+    .cs_need_flush  = cs_need_flush,
+    .cs_print       = cs_print,
+};
+
+/* Create the legacy CS manager for ctx; returns NULL when allocation fails. */
+struct radeon_cs_manager *radeon_cs_manager_legacy_ctor(struct radeon_context *ctx)
+{
+    struct cs_manager_legacy *manager = calloc(1, sizeof(*manager));
+
+    if (!manager)
+        return NULL;
+
+    manager->ctx = ctx;
+    manager->pending_age = 1;
+    manager->base.fd = ctx->dri.fd;
+    manager->base.funcs = &radeon_cs_legacy_funcs;
+    /* base is the first member, so this is the same address as manager */
+    return &manager->base;
+}
+
+void radeon_cs_manager_legacy_dtor(struct radeon_cs_manager *csm)
+{
+    free(csm); /* releases only the manager object itself */
+}
+
diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_legacy.h b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.h
new file mode 100644
index 0000000000..cafbc9e576
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.h
@@ -0,0 +1,40 @@
+/* 
+ * Copyright © 2008 Nicolai Haehnle
+ * Copyright © 2008 Jérôme Glisse
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Aapo Tahkola <aet@rasterburn.org>
+ *      Nicolai Haehnle <prefect_@gmx.net>
+ *      Jérôme Glisse <glisse@freedesktop.org>
+ */
+#ifndef RADEON_CS_LEGACY_H
+#define RADEON_CS_LEGACY_H
+
+struct radeon_context;
+
+struct radeon_cs_manager *radeon_cs_manager_legacy_ctor(struct radeon_context *ctx);
+void radeon_cs_manager_legacy_dtor(struct radeon_cs_manager *csm);
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_space_drm.c b/src/mesa/drivers/dri/radeon/radeon_cs_space_drm.c
new file mode 100644
index 0000000000..89cbbb5a6b
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_cs_space_drm.c
@@ -0,0 +1,234 @@
+/* 
+ * Copyright © 2009 Red Hat Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ */
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include "radeon_bocs_wrapper.h"
+
+struct rad_sizes {
+    int32_t op_read;
+    int32_t op_gart_write;
+    int32_t op_vram_write;
+};
+
+static inline int radeon_cs_setup_bo(struct radeon_cs_space_check *sc, struct rad_sizes *sizes)
+{
+    uint32_t read_domains, write_domain;
+    struct radeon_bo *bo;
+
+    bo = sc->bo;
+    sc->new_accounted = 0;
+    read_domains = sc->read_domains;
+    write_domain = sc->write_domain;
+
+    /* legacy needs a static check */
+    if (radeon_bo_is_static(bo)) {
+	bo->space_accounted = sc->new_accounted = (read_domains << 16) | write_domain;
+	return 0;
+    }
+
+    /* already accounted this bo */
+    if (write_domain && (write_domain == bo->space_accounted)) {
+	sc->new_accounted = bo->space_accounted;
+	return 0;
+    }
+    if (read_domains && ((read_domains << 16) == bo->space_accounted)) {
+	sc->new_accounted = bo->space_accounted;
+	return 0;
+    }
+
+    if (bo->space_accounted == 0) {
+	if (write_domain == RADEON_GEM_DOMAIN_VRAM)
+	    sizes->op_vram_write += bo->size;
+	else if (write_domain == RADEON_GEM_DOMAIN_GTT)
+	  sizes->op_gart_write += bo->size;
+	else
+	    sizes->op_read += bo->size;
+	sc->new_accounted = (read_domains << 16) | write_domain;
+    } else {
+	uint16_t old_read, old_write;
+	
+	old_read = bo->space_accounted >> 16;
+	old_write = bo->space_accounted & 0xffff;
+	
+	if (write_domain && (old_read & write_domain)) {
+	    sc->new_accounted = write_domain;
+	    /* moving from read to a write domain */
+	    if (write_domain == RADEON_GEM_DOMAIN_VRAM) {
+		sizes->op_read -= bo->size;
+		sizes->op_vram_write += bo->size;
+	    } else if (write_domain == RADEON_GEM_DOMAIN_GTT) {
+		sizes->op_read -= bo->size;
+		sizes->op_gart_write += bo->size;
+	    }
+	} else if (read_domains & old_write) {
+	    sc->new_accounted = bo->space_accounted & 0xffff;
+	} else {
+	    /* rewrite the domains */
+	    if (write_domain != old_write)
+		fprintf(stderr,"WRITE DOMAIN RELOC FAILURE 0x%x %d %d\n", bo->handle, write_domain, old_write);
+	    if (read_domains != old_read)
+		fprintf(stderr,"READ DOMAIN RELOC FAILURE 0x%x %d %d\n", bo->handle, read_domains, old_read);
+	    return RADEON_CS_SPACE_FLUSH;
+	}
+    }
+    return 0;
+}
+
+static int radeon_cs_do_space_check(struct radeon_cs *cs, struct radeon_cs_space_check *new_tmp)
+{
+    struct radeon_cs_manager *csm = cs->csm;
+    int i;
+    struct radeon_bo *bo;
+    struct rad_sizes sizes;
+    int ret;
+
+    /* check the totals for this operation */
+
+    if (cs->bo_count == 0 && !new_tmp)
+	return 0;
+
+    memset(&sizes, 0, sizeof(struct rad_sizes));
+
+    /* prepare */
+    for (i = 0; i < cs->bo_count; i++) {
+	ret = radeon_cs_setup_bo(&cs->bos[i], &sizes);
+	if (ret)
+	    return ret;
+    }
+
+    if (new_tmp) {
+	ret = radeon_cs_setup_bo(new_tmp, &sizes);
+	if (ret)
+	    return ret;
+    }
+	
+    if (sizes.op_read < 0)
+	    sizes.op_read = 0;
+
+    /* check sizes - operation first */
+    if ((sizes.op_read + sizes.op_gart_write > csm->gart_limit) ||
+	(sizes.op_vram_write > csm->vram_limit)) {
+	    return RADEON_CS_SPACE_OP_TO_BIG;
+    }
+    
+    if (((csm->vram_write_used + sizes.op_vram_write) > csm->vram_limit) ||
+	((csm->read_used + csm->gart_write_used + sizes.op_gart_write + sizes.op_read) > csm->gart_limit)) {
+	    return RADEON_CS_SPACE_FLUSH;
+    }
+    
+    csm->gart_write_used += sizes.op_gart_write;
+    csm->vram_write_used += sizes.op_vram_write;
+    csm->read_used += sizes.op_read;
+    /* commit */
+    for (i = 0; i < cs->bo_count; i++) {
+	    bo = cs->bos[i].bo;
+	    bo->space_accounted = cs->bos[i].new_accounted;
+    }
+    if (new_tmp)
+	new_tmp->bo->space_accounted = new_tmp->new_accounted;
+    
+    return RADEON_CS_SPACE_OK;
+}
+
+/* Track bo with the given domains unless an identical entry already exists. */
+void radeon_cs_space_add_persistent_bo(struct radeon_cs *cs, struct radeon_bo *bo, uint32_t read_domains, uint32_t write_domain)
+{
+    struct radeon_cs_space_check *slot;
+    int idx;
+    for (idx = 0; idx < cs->bo_count; idx++) {
+        slot = &cs->bos[idx];
+        if (slot->bo == bo && slot->read_domains == read_domains &&
+            slot->write_domain == write_domain)
+            return;
+    }
+    radeon_bo_ref(bo);
+    slot = &cs->bos[cs->bo_count++];
+    slot->bo = bo;
+    slot->read_domains = read_domains;
+    slot->write_domain = write_domain;
+    slot->new_accounted = 0;
+    assert(cs->bo_count < MAX_SPACE_BOS);
+}
+
+/* Run the space check, flushing and retrying once; 0 on success, -1 otherwise. */
+static int radeon_cs_check_space_internal(struct radeon_cs *cs, struct radeon_cs_space_check *tmp_bo)
+{
+    int attempt;
+
+    for (attempt = 0; ; attempt++) {
+        int status = radeon_cs_do_space_check(cs, tmp_bo);
+
+        if (status == RADEON_CS_SPACE_OP_TO_BIG)
+            return -1;
+        if (status != RADEON_CS_SPACE_FLUSH)
+            return 0;
+        /* The flush callback runs even on the second, final attempt. */
+        (*cs->space_flush_fn)(cs->space_flush_data);
+        if (attempt > 0)
+            return -1;
+    }
+}
+
+/* Space-check cs together with one extra bo; a NULL bo checks cs alone. */
+int radeon_cs_space_check_with_bo(struct radeon_cs *cs,
+				  struct radeon_bo *bo,
+				  uint32_t read_domains, uint32_t write_domain)
+{
+    struct radeon_cs_space_check extra;
+
+    if (bo == NULL)
+        return radeon_cs_check_space_internal(cs, NULL);
+
+    /* Stack-local descriptor; only these four fields are initialised. */
+    extra.bo = bo;
+    extra.read_domains = read_domains;
+    extra.write_domain = write_domain;
+    extra.new_accounted = 0;
+    return radeon_cs_check_space_internal(cs, &extra);
+}
+
+int radeon_cs_space_check(struct radeon_cs *cs)
+{
+    return radeon_cs_check_space_internal(cs, NULL);
+}
+
+/* Unreference every tracked bo, clear its slot, and empty the list. */
+void radeon_cs_space_reset_bos(struct radeon_cs *cs)
+{
+    int idx = 0;
+    while (idx < cs->bo_count) {
+        struct radeon_cs_space_check *slot = &cs->bos[idx++];
+        radeon_bo_unref(slot->bo);
+        slot->bo = NULL;
+        slot->read_domains = slot->write_domain = slot->new_accounted = 0;
+    }
+    cs->bo_count = 0;
+}
+
+
diff --git a/src/mesa/drivers/dri/radeon/radeon_debug.c b/src/mesa/drivers/dri/radeon/radeon_debug.c
new file mode 100644
index 0000000000..3b6f003803
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_debug.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright © 2009 Pauli Nieminen
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Pauli Nieminen <suokkos@gmail.com>
+ */
+
+#include "utils.h"
+
+#include "radeon_debug.h"
+#include "radeon_common_context.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+
+static const struct dri_debug_control debug_control[] = {
+	{"fall", RADEON_FALLBACKS},
+	{"tex", RADEON_TEXTURE},
+	{"ioctl", RADEON_IOCTL},
+	{"verts", RADEON_RENDER},
+	{"render", RADEON_RENDER},
+	{"swrender", RADEON_SWRENDER},
+	{"state", RADEON_STATE},
+	{"shader", RADEON_SHADER},
+	{"vfmt", RADEON_VFMT},
+	{"vtxf", RADEON_VFMT},
+	{"dri", RADEON_DRI},
+	{"dma", RADEON_DMA},
+	{"sanity", RADEON_SANITY},
+	{"sync", RADEON_SYNC},
+	{"pixel", RADEON_PIXEL},
+	{"mem", RADEON_MEMORY},
+	{"cs", RADEON_CS},
+	{"allmsg", ~RADEON_SYNC}, /* avoid the term "sync" because the parser uses strstr */
+	{NULL, 0}
+};
+
+radeon_debug_type_t radeon_enabled_debug_types;
+
+void radeon_init_debug(void)
+{
+	radeon_enabled_debug_types = driParseDebugString(getenv("RADEON_DEBUG"), debug_control);
+
+	radeon_enabled_debug_types |= RADEON_GENERAL;
+}
+
+/* Append one tab to the current context's debug indent string, if room. */
+void _radeon_debug_add_indent(void)
+{
+	GET_CURRENT_CONTEXT(ctx);
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	/* Never writes the last element, leaving room for a terminator. */
+	const size_t capacity = sizeof(radeon->debug.indent)
+		/ sizeof(radeon->debug.indent[0]);
+	if (radeon->debug.indent_depth < capacity - 1)
+		radeon->debug.indent[radeon->debug.indent_depth++] = '\t';
+}
+
+/* Pop one indent level: lower the depth, then terminate the string there so
+ * the tab of the level being left is erased (no-op when already at depth 0). */
+void _radeon_debug_remove_indent(void)
+{
+	GET_CURRENT_CONTEXT(ctx);
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	if (radeon->debug.indent_depth > 0)
+		radeon->debug.indent[--radeon->debug.indent_depth] = '\0';
+}
+
+void _radeon_print(const radeon_debug_type_t type,
+	   const radeon_debug_level_t level,
+	   const char* message,
+	   ...)
+{
+	GET_CURRENT_CONTEXT(ctx);
+	if (ctx) {
+		radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+		// FIXME: Make this multi thread safe
+		if (radeon->debug.indent_depth)
+			fprintf(stderr, "%s", radeon->debug.indent);
+	}
+	va_list values;
+	va_start( values, message );
+	vfprintf(stderr, message, values);
+	va_end( values );
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_debug.h b/src/mesa/drivers/dri/radeon/radeon_debug.h
new file mode 100644
index 0000000000..2a8302293b
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_debug.h
@@ -0,0 +1,169 @@
+/*
+ * Copyright © 2009 Pauli Nieminen
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Pauli Nieminen <suokkos@gmail.com>
+ */
+
+#ifndef RADEON_DEBUG_H_INCLUDED
+#define RADEON_DEBUG_H_INCLUDED
+
+#include <stdlib.h>
+
+typedef enum radeon_debug_levels {
+	RADEON_CRITICAL  = 0, /* Only errors */
+	RADEON_IMPORTANT = 1, /* Important warnings and messages */
+	RADEON_NORMAL    = 2, /* Normal log messages usefull for debugging */
+	RADEON_VERBOSE   = 3, /* Extra details to debugging */
+	RADEON_TRACE     = 4  /* Log about everything that happens */
+} radeon_debug_level_t;
+
+/**
+ * Compile-time option that sets the level of debugging compiled into the DRI
+ * driver. Selecting the critical level is not recommended: the performance
+ * gain is minimal, but you will lose a lot of important warnings in case of
+ * errors.
+ */
+#ifndef RADEON_DEBUG_LEVEL
+#define RADEON_DEBUG_LEVEL RADEON_VERBOSE
+#endif
+
+typedef enum radeon_debug_types {
+	RADEON_TEXTURE   = 0x00001,
+	RADEON_STATE     = 0x00002,
+	RADEON_IOCTL     = 0x00004,
+	RADEON_RENDER    = 0x00008,
+	RADEON_SWRENDER  = 0x00010,
+	RADEON_FALLBACKS = 0x00020,
+	RADEON_VFMT      = 0x00040,
+	RADEON_SHADER    = 0x00080,
+	RADEON_CS        = 0x00100,
+	RADEON_DRI       = 0x00200,
+	RADEON_DMA       = 0x00400,
+	RADEON_SANITY    = 0x00800,
+	RADEON_SYNC      = 0x01000,
+	RADEON_PIXEL     = 0x02000,
+	RADEON_MEMORY    = 0x04000,
+	RADEON_VERTS     = 0x08000,
+	RADEON_GENERAL   = 0x10000   /* Used for errors and warnings */
+} radeon_debug_type_t;
+
+#define RADEON_MAX_INDENT 5
+
+struct radeon_debug {
+       size_t indent_depth;
+       char indent[RADEON_MAX_INDENT];
+};
+
+extern radeon_debug_type_t radeon_enabled_debug_types;
+
+/**
+ * Compatibility layer for old debug code
+ **/
+#define RADEON_DEBUG radeon_enabled_debug_types
+
+static inline int radeon_is_debug_enabled(const radeon_debug_type_t type,
+	   const radeon_debug_level_t level)
+{
+       return RADEON_DEBUG_LEVEL >= level
+		&& (type & radeon_enabled_debug_types);
+}
+/*
+ * define macro for gcc specific __attribute__ if using alternative compiler
+ */
+#ifndef __GNUC__
+#define  __attribute__(x)  /*empty*/
+#endif
+
+
+extern void _radeon_print(const radeon_debug_type_t type,
+	   const radeon_debug_level_t level,
+	   const char* message,
+	   ...)  __attribute__((format(printf,3,4)));
+/**
+ * Print out debug message if channel specified by type is enabled
+ * and compile time debugging level is at least as high as level parameter
+ */
+#define radeon_print(type, level, message, ...) do {		\
+	const radeon_debug_level_t _debug_level = (level);	\
+	const radeon_debug_type_t _debug_type = (type);		\
+	/* Filter on the cached copies: each argument is evaluated once */ \
+	if (radeon_is_debug_enabled(_debug_type, _debug_level)) {	\
+		_radeon_print(_debug_type, _debug_level,	\
+			(message), ## __VA_ARGS__);		\
+	}							\
+} while(0)
+
+/**
+ * printf style function for writing error messages.
+ */
+#define radeon_error(message, ...) do {				\
+	radeon_print(RADEON_GENERAL, RADEON_CRITICAL,		\
+		(message), ## __VA_ARGS__);			\
+} while(0)
+
+/**
+ * printf style function for writing warnings.
+ */
+#define radeon_warning(message, ...) do {			\
+	radeon_print(RADEON_GENERAL, RADEON_IMPORTANT,		\
+		(message), ## __VA_ARGS__);			\
+} while(0)
+
+extern void radeon_init_debug(void);
+extern void _radeon_debug_add_indent(void);
+extern void _radeon_debug_remove_indent(void);
+
+static inline void radeon_debug_add_indent(void)
+{
+       if (RADEON_DEBUG_LEVEL >= RADEON_VERBOSE) {
+	      _radeon_debug_add_indent();
+       }
+}
+static inline void radeon_debug_remove_indent(void)
+{
+       if (RADEON_DEBUG_LEVEL >= RADEON_VERBOSE) {
+	      _radeon_debug_remove_indent();
+       }
+}
+
+/* From http://gcc.gnu.org/onlinedocs/gcc-3.2.3/gcc/Variadic-Macros.html .
+   I suppose we could inline this and use macro to fetch out __LINE__ and stuff in case we run into trouble
+   with other compilers ... GLUE!
+*/
+#define WARN_ONCE(a, ...)      { \
+       static int warn##__LINE__=1; \
+       if(warn##__LINE__){ \
+               radeon_warning("*********************************WARN_ONCE*********************************\n"); \
+               radeon_warning("File %s function %s line %d\n", \
+                       __FILE__, __FUNCTION__, __LINE__); \
+               radeon_warning(  (a), ## __VA_ARGS__);\
+               radeon_warning("***************************************************************************\n"); \
+               warn##__LINE__=0;\
+               } \
+       }
+
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_dma.c b/src/mesa/drivers/dri/radeon/radeon_dma.c
new file mode 100644
index 0000000000..2eefa3f2b1
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_dma.c
@@ -0,0 +1,476 @@
+/**************************************************************************
+
+Copyright (C) 2004 Nicolai Haehnle.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ATI, VA LINUX SYSTEMS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+#include <errno.h>
+#include "radeon_common.h"
+#include "main/simple_list.h"
+
+#if defined(USE_X86_ASM)
+#define COPY_DWORDS( dst, src, nr )					\
+do {									\
+	int __tmp;							\
+	__asm__ __volatile__( "rep ; movsl"				\
+			      : "=%c" (__tmp), "=D" (dst), "=S" (__tmp)	\
+			      : "0" (nr),				\
+			        "D" ((long)dst),			\
+			        "S" ((long)src) );			\
+} while (0)
+#else
+#define COPY_DWORDS( dst, src, nr )		\
+do {						\
+   int j;					\
+   for ( j = 0 ; j < nr ; j++ )			\
+      dst[j] = ((int *)src)[j];			\
+   dst += nr;					\
+} while (0)
+#endif
+
+void radeonEmitVec4(uint32_t *out, const GLvoid * data, int stride, int count)
+{
+	int i;
+
+	if (RADEON_DEBUG & RADEON_VERTS)
+		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+			__FUNCTION__, count, stride, (void *)out, (void *)data);
+
+	if (stride == 4)
+		COPY_DWORDS(out, data, count);
+	else
+		for (i = 0; i < count; i++) {
+			out[0] = *(int *)data;
+			out++;
+			data += stride;
+		}
+}
+
+void radeonEmitVec8(uint32_t *out, const GLvoid * data, int stride, int count)
+{
+	int i;
+
+	if (RADEON_DEBUG & RADEON_VERTS)
+		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+			__FUNCTION__, count, stride, (void *)out, (void *)data);
+
+	if (stride == 8)
+		COPY_DWORDS(out, data, count * 2);
+	else
+		for (i = 0; i < count; i++) {
+			out[0] = *(int *)data;
+			out[1] = *(int *)(data + 4);
+			out += 2;
+			data += stride;
+		}
+}
+
+void radeonEmitVec12(uint32_t *out, const GLvoid * data, int stride, int count)
+{
+	int i;
+
+	if (RADEON_DEBUG & RADEON_VERTS)
+		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+			__FUNCTION__, count, stride, (void *)out, (void *)data);
+
+	if (stride == 12) {
+		COPY_DWORDS(out, data, count * 3);
+    }
+	else
+		for (i = 0; i < count; i++) {
+			out[0] = *(int *)data;
+			out[1] = *(int *)(data + 4);
+			out[2] = *(int *)(data + 8);
+			out += 3;
+			data += stride;
+		}
+}
+
+void radeonEmitVec16(uint32_t *out, const GLvoid * data, int stride, int count)
+{
+	int i;
+
+	if (RADEON_DEBUG & RADEON_VERTS)
+		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+			__FUNCTION__, count, stride, (void *)out, (void *)data);
+
+	if (stride == 16)
+		COPY_DWORDS(out, data, count * 4);
+	else
+		for (i = 0; i < count; i++) {
+			out[0] = *(int *)data;
+			out[1] = *(int *)(data + 4);
+			out[2] = *(int *)(data + 8);
+			out[3] = *(int *)(data + 12);
+			out += 4;
+			data += stride;
+		}
+}
+
+void rcommon_emit_vector(GLcontext * ctx, struct radeon_aos *aos,
+			 const GLvoid * data, int size, int stride, int count)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	uint32_t *out;
+
+	if (stride == 0) {
+		radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * 4, 32);
+		count = 1;
+		aos->stride = 0;
+	} else {
+		radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * count * 4, 32);
+		aos->stride = size;
+	}
+
+	aos->components = size;
+	aos->count = count;
+
+	out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
+	switch (size) {
+	case 1: radeonEmitVec4(out, data, stride, count); break;
+	case 2: radeonEmitVec8(out, data, stride, count); break;
+	case 3: radeonEmitVec12(out, data, stride, count); break;
+	case 4: radeonEmitVec16(out, data, stride, count); break;
+	default:
+		assert(0);
+		break;
+	}
+}
+
+void radeon_init_dma(radeonContextPtr rmesa)
+{
+	make_empty_list(&rmesa->dma.free);
+	make_empty_list(&rmesa->dma.wait);
+	make_empty_list(&rmesa->dma.reserved);
+	rmesa->dma.minimum_size = MAX_DMA_BUF_SZ;
+}
+
+void radeonRefillCurrentDmaRegion(radeonContextPtr rmesa, int size)
+{
+	struct radeon_dma_bo *dma_bo = NULL;
+	/* we set minimum sizes to at least requested size
+	   aligned to next 16 bytes. */
+	if (size > rmesa->dma.minimum_size)
+		rmesa->dma.minimum_size = (size + 15) & (~15);
+
+	radeon_print(RADEON_DMA, RADEON_NORMAL, "%s size %d minimum_size %d\n",
+			__FUNCTION__, size, rmesa->dma.minimum_size);
+
+
+	/* unmap old reserved bo */
+	if (!is_empty_list(&rmesa->dma.reserved))
+		radeon_bo_unmap(first_elem(&rmesa->dma.reserved)->bo);
+
+	if (is_empty_list(&rmesa->dma.free)
+	      || last_elem(&rmesa->dma.free)->bo->size < size) {
+		dma_bo = CALLOC_STRUCT(radeon_dma_bo);
+		assert(dma_bo);
+
+again_alloc:
+		dma_bo->bo = radeon_bo_open(rmesa->radeonScreen->bom,
+					    0, rmesa->dma.minimum_size, 4,
+					    RADEON_GEM_DOMAIN_GTT, 0);
+
+		if (!dma_bo->bo) {
+			rcommonFlushCmdBuf(rmesa, __FUNCTION__);
+			goto again_alloc;
+		}
+		insert_at_head(&rmesa->dma.reserved, dma_bo);
+	} else {
+		/* We push and pop buffers from end of list so we can keep
+		   counter on unused buffers for later freeing them from
+		   begin of list */
+		dma_bo = last_elem(&rmesa->dma.free);
+		assert(dma_bo->bo->cref == 1);
+		remove_from_list(dma_bo);
+		insert_at_head(&rmesa->dma.reserved, dma_bo);
+	}
+
+	rmesa->dma.current_used = 0;
+	rmesa->dma.current_vertexptr = 0;
+	
+	if (radeon_cs_space_check_with_bo(rmesa->cmdbuf.cs,
+					  first_elem(&rmesa->dma.reserved)->bo,
+					  RADEON_GEM_DOMAIN_GTT, 0))
+		fprintf(stderr,"failure to revalidate BOs - badness\n");
+
+	if (is_empty_list(&rmesa->dma.reserved)) {
+        /* Cmd buff have been flushed in radeon_revalidate_bos */
+		goto again_alloc;
+	}
+
+	radeon_bo_map(first_elem(&rmesa->dma.reserved)->bo, 1);
+}
+
+/* Allocates a region from rmesa->dma.current.  If there isn't enough
+ * space in current, grab a new buffer (and discard what was left of current)
+ */
+void radeonAllocDmaRegion(radeonContextPtr rmesa,
+			  struct radeon_bo **pbo, int *poffset,
+			  int bytes, int alignment)
+{
+	if (RADEON_DEBUG & RADEON_IOCTL)
+		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
+
+	if (rmesa->dma.flush)
+		rmesa->dma.flush(rmesa->glCtx);
+
+	assert(rmesa->dma.current_used == rmesa->dma.current_vertexptr);
+
+	alignment--;
+	rmesa->dma.current_used = (rmesa->dma.current_used + alignment) & ~alignment;
+
+	if (is_empty_list(&rmesa->dma.reserved)
+		|| rmesa->dma.current_used + bytes > first_elem(&rmesa->dma.reserved)->bo->size)
+		radeonRefillCurrentDmaRegion(rmesa, bytes);
+
+	*poffset = rmesa->dma.current_used;
+	*pbo = first_elem(&rmesa->dma.reserved)->bo;
+	radeon_bo_ref(*pbo);
+
+	/* Always align to at least 16 bytes */
+	rmesa->dma.current_used = (rmesa->dma.current_used + bytes + 15) & ~15;
+	rmesa->dma.current_vertexptr = rmesa->dma.current_used;
+
+	assert(rmesa->dma.current_used <= first_elem(&rmesa->dma.reserved)->bo->size);
+}
+
+/* Unreference and free every entry on the free, wait and reserved DMA lists. */
+void radeonFreeDmaRegions(radeonContextPtr rmesa)
+{
+	struct radeon_dma_bo *dma_bo, *temp;	/* both are set by foreach_s */
+	if (RADEON_DEBUG & RADEON_DMA)
+		fprintf(stderr, "%s\n", __FUNCTION__);
+
+	foreach_s(dma_bo, temp, &rmesa->dma.free) {
+		remove_from_list(dma_bo);
+		radeon_bo_unref(dma_bo->bo);
+		FREE(dma_bo);
+	}
+
+	foreach_s(dma_bo, temp, &rmesa->dma.wait) {
+		remove_from_list(dma_bo);
+		radeon_bo_unref(dma_bo->bo);
+		FREE(dma_bo);
+	}
+
+	foreach_s(dma_bo, temp, &rmesa->dma.reserved) {
+		remove_from_list(dma_bo);
+		radeon_bo_unmap(dma_bo->bo);
+		radeon_bo_unref(dma_bo->bo);
+		FREE(dma_bo);
+	}
+}
+
+/* Lower current_used by return_bytes when a reserved DMA buffer exists. */
+void radeonReturnDmaRegion(radeonContextPtr rmesa, int return_bytes)
+{
+	if (!is_empty_list(&rmesa->dma.reserved)) {
+		if (RADEON_DEBUG & RADEON_IOCTL)
+			fprintf(stderr, "%s %d\n", __FUNCTION__, return_bytes);
+		rmesa->dma.current_used -= return_bytes;
+		rmesa->dma.current_vertexptr = rmesa->dma.current_used;
+	}
+}
+
+static int radeon_bo_is_idle(struct radeon_bo* bo)
+{
+	uint32_t domain;
+	int ret = radeon_bo_is_busy(bo, &domain);
+	if (ret == -EINVAL) {
+		WARN_ONCE("Your libdrm or kernel doesn't have support for busy query.\n"
+			"This may cause small performance drop for you.\n");
+	}
+	return ret != -EBUSY;
+}
+
+void radeonReleaseDmaRegions(radeonContextPtr rmesa)
+{
+	struct radeon_dma_bo *dma_bo;
+	struct radeon_dma_bo *temp;
+	const int expire_at = ++rmesa->dma.free.expire_counter + DMA_BO_FREE_TIME;
+	const int time = rmesa->dma.free.expire_counter;
+
+	if (RADEON_DEBUG & RADEON_DMA) {
+		size_t free = 0,
+		       wait = 0,
+		       reserved = 0;
+		foreach(dma_bo, &rmesa->dma.free)
+			++free;
+
+		foreach(dma_bo, &rmesa->dma.wait)
+			++wait;
+
+		foreach(dma_bo, &rmesa->dma.reserved)
+			++reserved;
+
+		fprintf(stderr, "%s: free %zu, wait %zu, reserved %zu, minimum_size: %zu\n",
+		      __FUNCTION__, free, wait, reserved, rmesa->dma.minimum_size);
+	}
+
+	if (!rmesa->radeonScreen->driScreen->dri2.enabled) {
+		/* request updated cs processing information from kernel */
+		legacy_track_pending(rmesa->radeonScreen->bom, 0);
+	}
+	/* move waiting bos to free list.
+	   wait list provides gpu time to handle data before reuse */
+	foreach_s(dma_bo, temp, &rmesa->dma.wait) {
+		if (dma_bo->expire_counter == time) {
+			WARN_ONCE("Leaking dma buffer object!\n");
+			radeon_bo_unref(dma_bo->bo);
+			remove_from_list(dma_bo);
+			FREE(dma_bo);
+			continue;
+		}
+		/* free objects that are too small to be used because of large request */
+		if (dma_bo->bo->size < rmesa->dma.minimum_size) {
+		   radeon_bo_unref(dma_bo->bo);
+		   remove_from_list(dma_bo);
+		   FREE(dma_bo);
+		   continue;
+		}
+		if (!radeon_bo_is_idle(dma_bo->bo))
+			continue;
+		remove_from_list(dma_bo);
+		dma_bo->expire_counter = expire_at;
+		insert_at_tail(&rmesa->dma.free, dma_bo);
+	}
+
+	/* unmap the last dma region */
+	if (!is_empty_list(&rmesa->dma.reserved))
+		radeon_bo_unmap(first_elem(&rmesa->dma.reserved)->bo);
+	/* move reserved to wait list */
+	foreach_s(dma_bo, temp, &rmesa->dma.reserved) {
+		/* free objects that are too small to be used because of large request */
+		if (dma_bo->bo->size < rmesa->dma.minimum_size) {
+		   radeon_bo_unref(dma_bo->bo);
+		   remove_from_list(dma_bo);
+		   FREE(dma_bo);
+		   continue;
+		}
+		remove_from_list(dma_bo);
+		dma_bo->expire_counter = expire_at;
+		insert_at_tail(&rmesa->dma.wait, dma_bo);
+	}
+
+	/* free bos that have been unused for some time */
+	foreach_s(dma_bo, temp, &rmesa->dma.free) {
+		if (dma_bo->expire_counter != time)
+			break;
+		remove_from_list(dma_bo);
+	        radeon_bo_unref(dma_bo->bo);
+		FREE(dma_bo);
+	}
+
+}
+
+
+/* Flush vertices in the current dma region.
+ */
+void rcommon_flush_last_swtcl_prim( GLcontext *ctx  )
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	struct radeon_dma *dma = &rmesa->dma;
+		
+
+	if (RADEON_DEBUG & RADEON_IOCTL)
+		fprintf(stderr, "%s\n", __FUNCTION__);
+	dma->flush = NULL;
+
+	if (!is_empty_list(&dma->reserved)) {
+	    GLuint current_offset = dma->current_used;
+
+	    assert (dma->current_used +
+		    rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+		    dma->current_vertexptr);
+
+	    if (dma->current_used != dma->current_vertexptr) {
+		    dma->current_used = dma->current_vertexptr;
+
+		    rmesa->vtbl.swtcl_flush(ctx, current_offset);
+	    }
+	    rmesa->swtcl.numverts = 0;
+	}
+}
+/* Alloc space in the current dma region.
+ */
+void *
+rcommonAllocDmaLowVerts( radeonContextPtr rmesa, int nverts, int vsize )
+{
+	GLuint bytes = vsize * nverts;
+	void *head;
+	if (RADEON_DEBUG & RADEON_IOCTL)
+		fprintf(stderr, "%s\n", __FUNCTION__);
+	if(is_empty_list(&rmesa->dma.reserved)
+	      ||rmesa->dma.current_vertexptr + bytes > first_elem(&rmesa->dma.reserved)->bo->size) {
+		if (rmesa->dma.flush) {
+			rmesa->dma.flush(rmesa->glCtx);
+		}
+
+                radeonRefillCurrentDmaRegion(rmesa, bytes);
+
+		return NULL;
+	}
+
+        if (!rmesa->dma.flush) {
+		/* if cmdbuf flushed DMA restart */
+                rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+                rmesa->dma.flush = rcommon_flush_last_swtcl_prim;
+        }
+
+	ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
+        ASSERT( rmesa->dma.flush == rcommon_flush_last_swtcl_prim );
+        ASSERT( rmesa->dma.current_used +
+                rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+                rmesa->dma.current_vertexptr );
+
+	head = (first_elem(&rmesa->dma.reserved)->bo->ptr + rmesa->dma.current_vertexptr);
+	rmesa->dma.current_vertexptr += bytes;
+	rmesa->swtcl.numverts += nverts;
+	return head;
+}
+
+/* Call the DMA flush hook if set, then drop each AOS slot's bo reference.
+ * newinputs is not read here. */
+void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs )
+{
+   radeonContextPtr radeon = RADEON_CONTEXT( ctx );
+   int slot;
+   if (RADEON_DEBUG & RADEON_IOCTL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   if (radeon->dma.flush)
+      radeon->dma.flush(radeon->glCtx);
+
+   for (slot = 0; slot < radeon->tcl.aos_count; slot++) {
+      if (!radeon->tcl.aos[slot].bo)
+         continue;
+      radeon_bo_unref(radeon->tcl.aos[slot].bo);
+      radeon->tcl.aos[slot].bo = NULL;
+   }
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_dma.h b/src/mesa/drivers/dri/radeon/radeon_dma.h
new file mode 100644
index 0000000000..74e653fd18
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_dma.h
@@ -0,0 +1,58 @@
+/**************************************************************************
+
+Copyright (C) 2004 Nicolai Haehnle.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ATI, VA LINUX SYSTEMS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+#ifndef RADEON_DMA_H
+#define RADEON_DMA_H
+/* Prototypes only: the includer must already provide GLcontext, GLvoid, GLuint, uint32_t, radeonContextPtr, struct radeon_aos and struct radeon_bo. */
+void radeonEmitVec4(uint32_t *out, const GLvoid * data, int stride, int count);
+void radeonEmitVec8(uint32_t *out, const GLvoid * data, int stride, int count);
+void radeonEmitVec12(uint32_t *out, const GLvoid * data, int stride, int count);
+void radeonEmitVec16(uint32_t *out, const GLvoid * data, int stride, int count);
+
+void rcommon_emit_vector(GLcontext * ctx, struct radeon_aos *aos,
+			 const GLvoid * data, int size, int stride, int count);
+
+/* DMA region management (radeonReturnDmaRegion used to be declared twice here). */
+void radeon_init_dma(radeonContextPtr rmesa);
+void radeonRefillCurrentDmaRegion(radeonContextPtr rmesa, int size);
+void radeonReturnDmaRegion(radeonContextPtr rmesa, int return_bytes);
+void radeonAllocDmaRegion(radeonContextPtr rmesa,
+			  struct radeon_bo **pbo, int *poffset,
+			  int bytes, int alignment);
+void radeonReleaseDmaRegions(radeonContextPtr rmesa);
+
+void rcommon_flush_last_swtcl_prim(GLcontext *ctx);
+
+void *rcommonAllocDmaLowVerts(radeonContextPtr rmesa, int nverts, int vsize);
+void radeonFreeDmaRegions(radeonContextPtr rmesa);
+void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs );
+#endif /* RADEON_DMA_H */
diff --git a/src/mesa/drivers/dri/radeon/radeon_fbo.c b/src/mesa/drivers/dri/radeon/radeon_fbo.c
new file mode 100644
index 0000000000..d83b166742
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_fbo.c
@@ -0,0 +1,599 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Red Hat Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "main/imports.h"
+#include "main/macros.h"
+#include "main/mtypes.h"
+#include "main/fbobject.h"
+#include "main/framebuffer.h"
+#include "main/renderbuffer.h"
+#include "main/context.h"
+#include "main/texformat.h"
+#include "main/texrender.h"
+#include "drivers/common/meta.h"
+
+#include "radeon_common.h"
+#include "radeon_mipmap_tree.h"
+/* DBG(): forwards its arguments to _mesa_printf, but only while the RADEON_TEXTURE bit of RADEON_DEBUG is set. */
+#define FILE_DEBUG_FLAG RADEON_TEXTURE
+#define DBG(...) do {                                           \
+        if (RADEON_DEBUG & FILE_DEBUG_FLAG)                      \
+                _mesa_printf(__VA_ARGS__);                      \
+} while(0)
+/* Driver.NewFramebuffer hook: forwards straight to _mesa_new_framebuffer(). */
+static struct gl_framebuffer *
+radeon_new_framebuffer(GLcontext *ctx, GLuint name)
+{
+  return _mesa_new_framebuffer(ctx, name);
+}
+
+/* gl_renderbuffer::Delete hook: drop the buffer-object reference, then free the wrapper. */
+static void
+radeon_delete_renderbuffer(struct gl_renderbuffer *rb)
+{
+  struct radeon_renderbuffer *wrapper = radeon_renderbuffer(rb);
+
+  ASSERT(wrapper);
+  /* the wrapper is tested again here, independently of the ASSERT above */
+  if (wrapper != NULL && wrapper->bo != NULL)
+    radeon_bo_unref(wrapper->bo);
+  _mesa_free(wrapper);
+}
+/* gl_renderbuffer::GetPointer hook: returns NULL for every (x, y); no argument is inspected. */
+static void *
+radeon_get_pointer(GLcontext *ctx, struct gl_renderbuffer *rb,
+		   GLint x, GLint y)
+{
+  return NULL;
+}
+
+/**
+ * Called via glRenderbufferStorageEXT() to set the format and allocate
+ * storage for a user-created renderbuffer.  Returns GL_FALSE for an
+ * unexpected internalFormat or when no buffer object could be opened.
+ */
+static GLboolean
+radeon_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
+                                 GLenum internalFormat,
+                                 GLuint width, GLuint height)
+{
+  struct radeon_context *radeon = RADEON_CONTEXT(ctx);
+  struct radeon_renderbuffer *rrb = radeon_renderbuffer(rb);
+  GLboolean software_buffer = GL_FALSE; /* never set: the software path below is unused */
+  int cpp; /* bytes per pixel of the chosen hardware format */
+
+   ASSERT(rb->Name != 0);
+  switch (internalFormat) {
+   case GL_R3_G3_B2:
+   case GL_RGB4:
+   case GL_RGB5:
+      rb->_ActualFormat = GL_RGB5;
+      rb->DataType = GL_UNSIGNED_BYTE;
+      rb->RedBits = 5;
+      rb->GreenBits = 6;
+      rb->BlueBits = 5;
+      cpp = 2;
+      break;
+   case GL_RGB:
+   case GL_RGB8:
+   case GL_RGB10:
+   case GL_RGB12:
+   case GL_RGB16:
+      rb->_ActualFormat = GL_RGB8;
+      rb->DataType = GL_UNSIGNED_BYTE;
+      rb->RedBits = 8;
+      rb->GreenBits = 8;
+      rb->BlueBits = 8;
+      rb->AlphaBits = 0;
+      cpp = 4;
+      break;
+   case GL_RGBA:
+   case GL_RGBA2:
+   case GL_RGBA4:
+   case GL_RGB5_A1:
+   case GL_RGBA8:
+   case GL_RGB10_A2:
+   case GL_RGBA12:
+   case GL_RGBA16:
+      rb->_ActualFormat = GL_RGBA8;
+      rb->DataType = GL_UNSIGNED_BYTE;
+      rb->RedBits = 8;
+      rb->GreenBits = 8;
+      rb->BlueBits = 8;
+      rb->AlphaBits = 8;
+      cpp = 4;
+      break;
+   case GL_STENCIL_INDEX:
+   case GL_STENCIL_INDEX1_EXT:
+   case GL_STENCIL_INDEX4_EXT:
+   case GL_STENCIL_INDEX8_EXT:
+   case GL_STENCIL_INDEX16_EXT:
+      /* alloc a depth+stencil buffer */
+      rb->_ActualFormat = GL_DEPTH24_STENCIL8_EXT;
+      rb->DataType = GL_UNSIGNED_INT_24_8_EXT;
+      rb->StencilBits = 8;
+      cpp = 4;
+      break;
+   case GL_DEPTH_COMPONENT16:
+      rb->_ActualFormat = GL_DEPTH_COMPONENT16;
+      rb->DataType = GL_UNSIGNED_SHORT;
+      rb->DepthBits = 16;
+      cpp = 2;
+      break;
+   case GL_DEPTH_COMPONENT:
+   case GL_DEPTH_COMPONENT24:
+   case GL_DEPTH_COMPONENT32:
+      rb->_ActualFormat = GL_DEPTH_COMPONENT24;
+      rb->DataType = GL_UNSIGNED_INT;
+      rb->DepthBits = 24;
+      cpp = 4;
+      break;
+   case GL_DEPTH_STENCIL_EXT:
+   case GL_DEPTH24_STENCIL8_EXT:
+      rb->_ActualFormat = GL_DEPTH24_STENCIL8_EXT;
+      rb->DataType = GL_UNSIGNED_INT_24_8_EXT;
+      rb->DepthBits = 24;
+      rb->StencilBits = 8;
+      cpp = 4;
+      break;
+   default:
+      _mesa_problem(ctx,
+                    "Unexpected format in radeon_alloc_renderbuffer_storage");
+      return GL_FALSE;
+   }
+
+  if (ctx->Driver.Flush)
+	  ctx->Driver.Flush(ctx); /* +r6/r7 */
+  /* drop the old storage and clear the pointer so a failure below leaves no stale reference */
+  if (rrb->bo)
+    radeon_bo_unref(rrb->bo);
+  rrb->bo = NULL;
+
+   if (software_buffer) {
+      return _mesa_soft_renderbuffer_storage(ctx, rb, internalFormat,
+                                             width, height);
+   }
+   else {
+     uint32_t size;
+     uint32_t pitch = ((cpp * width + 63) & ~63) / cpp; /* pixels per row; row bytes rounded up to a multiple of 64 */
+     fprintf(stderr,"Allocating %u x %u radeon RBO (pitch %u)\n", width,
+	  height, (unsigned) pitch);
+     size = pitch * height * cpp;
+     rrb->pitch = pitch * cpp;
+     rrb->cpp = cpp;
+     rrb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+			      0,
+			      size,
+			      0,
+			      RADEON_GEM_DOMAIN_VRAM,
+			      0);
+     if (!rrb->bo)
+       return GL_FALSE; /* the buffer object could not be opened */
+     rb->Width = width;
+     rb->Height = height;
+     return GL_TRUE;
+   }
+}
+
+
+/**
+ * AllocStorage hook for window-system renderbuffers (Name == 0).
+ * Nothing is allocated here: only the size and format fields of rb are
+ * refreshed.  Always returns GL_TRUE.
+ */
+static GLboolean
+radeon_alloc_window_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
+                           GLenum internalFormat, GLuint width, GLuint height)
+{
+   ASSERT(rb->Name == 0);
+
+   rb->_ActualFormat = internalFormat;
+   rb->Height = height;
+   rb->Width = width;
+   return GL_TRUE;
+}
+
+
+/* Driver.ResizeBuffers hook: resize fb through core Mesa, then re-run AllocStorage on
+ * the two window-system color renderbuffers whose size differs.  User FBOs (Name != 0) stop early. */
+static void
+radeon_resize_buffers(GLcontext *ctx, struct gl_framebuffer *fb,
+		     GLuint width, GLuint height)
+{
+   struct radeon_framebuffer *radeon_fb = (struct radeon_framebuffer*)fb;
+   int i;
+
+   _mesa_resize_framebuffer(ctx, fb, width, height);
+   fb->Initialized = GL_TRUE; /* XXX remove someday */
+   if (fb->Name != 0)
+      return;
+   /* Make sure all window system renderbuffers are up to date */
+   for (i = 0; i < 2; i++) {
+      struct gl_renderbuffer *rb;
+      /* test the slot itself: &slot->base taken from a NULL slot is not a reliable NULL test */
+      if (!radeon_fb->color_rb[i])
+         continue;
+      rb = &radeon_fb->color_rb[i]->base;
+      /* only resize if size is changing */
+      if (rb->Width != width || rb->Height != height)
+	 rb->AllocStorage(ctx, rb, rb->InternalFormat, width, height);
+   }
+}
+
+
+/** Dummy function for gl_renderbuffer::AllocStorage(): reports a problem via _mesa_problem() and returns GL_FALSE. */
+static GLboolean
+radeon_nop_alloc_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
+			 GLenum internalFormat, GLuint width, GLuint height)
+{
+   _mesa_problem(ctx, "radeon_nop_alloc_storage should never be called.");
+   return GL_FALSE;
+}
+/* Create a name-0 radeon_renderbuffer for format/driDrawPriv; NULL on allocation failure or unknown format. */
+struct radeon_renderbuffer *
+radeon_create_renderbuffer(GLenum format, __DRIdrawablePrivate *driDrawPriv)
+{
+    struct radeon_renderbuffer *rrb;
+
+    rrb = CALLOC_STRUCT(radeon_renderbuffer);
+    if (!rrb)
+	return NULL;
+    /* initialise the embedded gl_renderbuffer with name 0 */
+    _mesa_init_renderbuffer(&rrb->base, 0);
+    rrb->base.ClassID = RADEON_RB_CLASS;
+
+    /* XXX format junk */
+    switch (format) {
+	case GL_RGB5:
+	    rrb->base._ActualFormat = GL_RGB5;
+	    rrb->base._BaseFormat = GL_RGBA;
+	    rrb->base.RedBits = 5;
+	    rrb->base.GreenBits = 6;
+	    rrb->base.BlueBits = 5;
+	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+	    break;
+	case GL_RGB8:
+	    rrb->base._ActualFormat = GL_RGB8;
+	    rrb->base._BaseFormat = GL_RGB;
+	    rrb->base.RedBits = 8;
+	    rrb->base.GreenBits = 8;
+	    rrb->base.BlueBits = 8;
+	    rrb->base.AlphaBits = 0;
+	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+	    break;
+	case GL_RGBA8:
+	    rrb->base._ActualFormat = GL_RGBA8;
+	    rrb->base._BaseFormat = GL_RGBA;
+	    rrb->base.RedBits = 8;
+	    rrb->base.GreenBits = 8;
+	    rrb->base.BlueBits = 8;
+	    rrb->base.AlphaBits = 8;
+	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+	    break;
+	case GL_STENCIL_INDEX8_EXT:
+	    rrb->base._ActualFormat = GL_STENCIL_INDEX8_EXT;
+	    rrb->base._BaseFormat = GL_STENCIL_INDEX;
+	    rrb->base.StencilBits = 8;
+	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+	    break;
+	case GL_DEPTH_COMPONENT16:
+	    rrb->base._ActualFormat = GL_DEPTH_COMPONENT16;
+	    rrb->base._BaseFormat = GL_DEPTH_COMPONENT;
+	    rrb->base.DepthBits = 16;
+	    rrb->base.DataType = GL_UNSIGNED_SHORT;
+	    break;
+	case GL_DEPTH_COMPONENT24:
+	    rrb->base._ActualFormat = GL_DEPTH_COMPONENT24;
+	    rrb->base._BaseFormat = GL_DEPTH_COMPONENT;
+	    rrb->base.DepthBits = 24;
+	    rrb->base.DataType = GL_UNSIGNED_INT;
+	    break;
+	case GL_DEPTH24_STENCIL8_EXT:
+	    rrb->base._ActualFormat = GL_DEPTH24_STENCIL8_EXT;
+	    rrb->base._BaseFormat = GL_DEPTH_STENCIL_EXT;
+	    rrb->base.DepthBits = 24;
+	    rrb->base.StencilBits = 8;
+	    rrb->base.DataType = GL_UNSIGNED_INT_24_8_EXT;
+	    break;
+	default:
+	    fprintf(stderr, "%s: Unknown format 0x%04x\n", __FUNCTION__, format);
+	    _mesa_delete_renderbuffer(&rrb->base);
+	    return NULL;
+    }
+
+    rrb->dPriv = driDrawPriv;
+    rrb->base.InternalFormat = format;
+    /* window renderbuffers use the field-only AllocStorage (radeon_alloc_window_storage) */
+    rrb->base.Delete = radeon_delete_renderbuffer;
+    rrb->base.AllocStorage = radeon_alloc_window_storage;
+    rrb->base.GetPointer = radeon_get_pointer;
+    /* no buffer object is attached by this function */
+    rrb->bo = NULL;
+    return rrb;
+}
+
+/* Driver.NewRenderbuffer hook: allocate a radeon_renderbuffer with CALLOC_STRUCT,
+ * initialise its base with the given name and install the radeon callbacks. */
+static struct gl_renderbuffer *
+radeon_new_renderbuffer(GLcontext * ctx, GLuint name)
+{
+  struct radeon_renderbuffer *wrapper = CALLOC_STRUCT(radeon_renderbuffer);
+  struct gl_renderbuffer *rb;
+
+  if (wrapper == NULL)
+    return NULL;
+  rb = &wrapper->base;
+  _mesa_init_renderbuffer(rb, name);
+  rb->ClassID = RADEON_RB_CLASS;
+  rb->GetPointer = radeon_get_pointer;
+  rb->AllocStorage = radeon_alloc_renderbuffer_storage;
+  rb->Delete = radeon_delete_renderbuffer;
+  return rb;
+}
+
+/* Driver.BindFramebuffer hook: only draw-side binds update the draw buffer; fbread is unused. */
+static void
+radeon_bind_framebuffer(GLcontext * ctx, GLenum target,
+                       struct gl_framebuffer *fb, struct gl_framebuffer *fbread)
+{
+   /* don't need to do anything if target == GL_READ_FRAMEBUFFER_EXT */
+   if (target != GL_FRAMEBUFFER_EXT && target != GL_DRAW_FRAMEBUFFER_EXT)
+      return;
+
+   radeon_draw_buffer(ctx, fb);
+}
+/* Driver.FramebufferRenderbuffer hook: flush, attach rb through core Mesa, then refresh the draw buffer. */
+static void
+radeon_framebuffer_renderbuffer(GLcontext * ctx,
+                               struct gl_framebuffer *fb,
+                               GLenum attachment, struct gl_renderbuffer *rb)
+{
+
+	if (ctx->Driver.Flush)
+		ctx->Driver.Flush(ctx); /* +r6/r7 */
+
+   _mesa_framebuffer_renderbuffer(ctx, fb, attachment, rb);
+   radeon_draw_buffer(ctx, fb);
+}
+
+/* Fill rrb from texImage's format and size; GL_FALSE if the format is still unsupported after one re-choose. */
+static GLboolean
+radeon_update_wrapper(GLcontext *ctx, struct radeon_renderbuffer *rrb,
+		     struct gl_texture_image *texImage)
+{
+	int retry = 0;
+restart:
+	if (texImage->TexFormat == &_mesa_texformat_argb8888) {
+		rrb->cpp = 4;
+		rrb->base._ActualFormat = GL_RGBA8;
+		rrb->base._BaseFormat = GL_RGBA;
+		rrb->base.DataType = GL_UNSIGNED_BYTE;
+		DBG("Render to RGBA8 texture OK\n");
+	}
+	else if (texImage->TexFormat == &_mesa_texformat_rgb565) {
+		rrb->cpp = 2;
+		rrb->base._ActualFormat = GL_RGB5;
+		rrb->base._BaseFormat = GL_RGB;
+		rrb->base.DataType = GL_UNSIGNED_BYTE;
+		DBG("Render to RGB5 texture OK\n");
+	}
+	else if (texImage->TexFormat == &_mesa_texformat_argb1555) {
+		rrb->cpp = 2;
+		rrb->base._ActualFormat = GL_RGB5_A1;
+		rrb->base._BaseFormat = GL_RGBA;
+		rrb->base.DataType = GL_UNSIGNED_BYTE;
+		DBG("Render to ARGB1555 texture OK\n");
+	}
+	else if (texImage->TexFormat == &_mesa_texformat_argb4444) {
+		rrb->cpp = 2;
+		rrb->base._ActualFormat = GL_RGBA4;
+		rrb->base._BaseFormat = GL_RGBA;
+		rrb->base.DataType = GL_UNSIGNED_BYTE;
+		DBG("Render to ARGB4444 texture OK\n");
+	}
+	else if (texImage->TexFormat == &_mesa_texformat_z16) {
+		rrb->cpp = 2;
+		rrb->base._ActualFormat = GL_DEPTH_COMPONENT16;
+		rrb->base._BaseFormat = GL_DEPTH_COMPONENT;
+		rrb->base.DataType = GL_UNSIGNED_SHORT;
+		DBG("Render to DEPTH16 texture OK\n");
+	}
+	else if (texImage->TexFormat == &_mesa_texformat_s8_z24) {
+		rrb->cpp = 4;
+		rrb->base._ActualFormat = GL_DEPTH24_STENCIL8_EXT;
+		rrb->base._BaseFormat = GL_DEPTH_STENCIL_EXT;
+		rrb->base.DataType = GL_UNSIGNED_INT_24_8_EXT;
+		DBG("Render to DEPTH_STENCIL texture OK\n");
+	}
+	else {
+		/* try redoing the FBO */
+		if (retry == 1) {
+			DBG("Render to texture BAD FORMAT %d\n",
+			    texImage->TexFormat->MesaFormat);
+			return GL_FALSE;
+		}
+		texImage->TexFormat = radeonChooseTextureFormat(ctx, texImage->InternalFormat, 0,
+								texImage->TexFormat->DataType,
+								1);
+
+		retry++;
+		goto restart;
+	}
+	/* rows are treated as tightly packed: pitch is width times bytes per pixel */
+	rrb->pitch = texImage->Width * rrb->cpp;
+	rrb->base.InternalFormat = rrb->base._ActualFormat;
+	rrb->base.Width = texImage->Width;
+	rrb->base.Height = texImage->Height;
+	rrb->base.RedBits = texImage->TexFormat->RedBits;
+	rrb->base.GreenBits = texImage->TexFormat->GreenBits;
+	rrb->base.BlueBits = texImage->TexFormat->BlueBits;
+	rrb->base.AlphaBits = texImage->TexFormat->AlphaBits;
+	rrb->base.DepthBits = texImage->TexFormat->DepthBits;
+	rrb->base.StencilBits = texImage->TexFormat->StencilBits;
+	/* texture wrappers get the dummy AllocStorage, which only reports a problem */
+	rrb->base.Delete = radeon_delete_renderbuffer;
+	rrb->base.AllocStorage = radeon_nop_alloc_storage;
+
+	return GL_TRUE;
+}
+
+
+/* Build a radeon_renderbuffer that wraps texImage.  Returns NULL after raising
+ * GL_OUT_OF_MEMORY when allocation fails, and NULL when
+ * radeon_update_wrapper() rejects the image. */
+static struct radeon_renderbuffer *
+radeon_wrap_texture(GLcontext * ctx, struct gl_texture_image *texImage)
+{
+   /* not significant, but distinct for debugging */
+   const GLuint wrapper_name = ~0;
+   struct radeon_renderbuffer *wrapper = CALLOC_STRUCT(radeon_renderbuffer);
+
+   if (wrapper == NULL) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glFramebufferTexture");
+      return NULL;
+   }
+
+   _mesa_init_renderbuffer(&wrapper->base, wrapper_name);
+   wrapper->base.ClassID = RADEON_RB_CLASS;
+   if (radeon_update_wrapper(ctx, wrapper, texImage))
+      return wrapper;
+
+   /* the image was rejected: discard the wrapper again */
+   _mesa_free(wrapper);
+   return NULL;
+}
+/* Driver.RenderTexture hook: make the attachment's renderbuffer wrapper share the
+ * buffer object of the texture image's miptree and record the image's offset in it.
+ * Images with a border or without a miptree are handed to _mesa_render_texture(). */
+static void
+radeon_render_texture(GLcontext * ctx,
+                     struct gl_framebuffer *fb,
+                     struct gl_renderbuffer_attachment *att)
+{
+   struct gl_texture_image *newImage
+      = att->Texture->Image[att->CubeMapFace][att->TextureLevel];
+   struct radeon_renderbuffer *rrb = radeon_renderbuffer(att->Renderbuffer);
+   radeon_texture_image *radeon_image;
+   GLuint imageOffset;
+
+   ASSERT(newImage);
+
+   radeon_image = (radeon_texture_image *)newImage;
+   if (!radeon_image->mt || newImage->Border != 0) {
+      /* Fallback on drawing to a texture without a miptree (a texture with
+       * a border won't have one); mt is dereferenced unconditionally below.
+       */
+      _mesa_reference_renderbuffer(&att->Renderbuffer, NULL);
+      _mesa_render_texture(ctx, fb, att);
+      return;
+   }
+   else if (!rrb) {
+      rrb = radeon_wrap_texture(ctx, newImage);
+      if (rrb) {
+         /* bind the wrapper to the attachment point */
+         _mesa_reference_renderbuffer(&att->Renderbuffer, &rrb->base);
+      }
+      else {
+         /* fallback to software rendering */
+         _mesa_render_texture(ctx, fb, att);
+         return;
+      }
+   }
+
+   if (!radeon_update_wrapper(ctx, rrb, newImage)) {
+       _mesa_reference_renderbuffer(&att->Renderbuffer, NULL);
+       _mesa_render_texture(ctx, fb, att);
+       return;
+   }
+
+   DBG("Begin render texture tid %x tex=%u w=%d h=%d refcount=%d\n",
+       _glthread_GetID(),
+       att->Texture->Name, newImage->Width, newImage->Height,
+       rrb->base.RefCount);
+
+   /* point the renderbuffer's region to the texture image region */
+   if (rrb->bo != radeon_image->mt->bo) {
+      if (rrb->bo)
+  	radeon_bo_unref(rrb->bo);
+      rrb->bo = radeon_image->mt->bo;
+      radeon_bo_ref(rrb->bo);
+   }
+
+   /* compute offset of the particular 2D image within the texture region */
+   imageOffset = radeon_miptree_image_offset(radeon_image->mt,
+                                            att->CubeMapFace,
+                                            att->TextureLevel);
+   if (att->Texture->Target == GL_TEXTURE_3D) {
+      /* NOTE(review): offsets[] has 6 entries but is indexed by att->Zoffset -- confirm the bound */
+      GLuint offsets[6];
+      radeon_miptree_depth_offsets(radeon_image->mt, att->TextureLevel,
+				   offsets);
+      imageOffset += offsets[att->Zoffset];
+   }
+
+   /* store that offset in the region */
+   rrb->draw_offset = imageOffset;
+   /* update drawing region, etc */
+   radeon_draw_buffer(ctx, fb);
+}
+
+static void
+radeon_finish_render_texture(GLcontext * ctx,
+                            struct gl_renderbuffer_attachment *att)
+{
+   /* Driver.FinishRenderTexture hook: empty body, neither argument is used */
+}
+static void
+radeon_validate_framebuffer(GLcontext *ctx, struct gl_framebuffer *fb)
+{ /* Driver.ValidateFramebuffer hook: empty body, nothing is checked or modified here */
+}
+/* Install this file's framebuffer-object entry points in the context's Driver function table. */
+void radeon_fbo_init(struct radeon_context *radeon)
+{
+  radeon->glCtx->Driver.NewFramebuffer = radeon_new_framebuffer;
+  radeon->glCtx->Driver.NewRenderbuffer = radeon_new_renderbuffer;
+  radeon->glCtx->Driver.BindFramebuffer = radeon_bind_framebuffer;
+  radeon->glCtx->Driver.FramebufferRenderbuffer = radeon_framebuffer_renderbuffer;
+  radeon->glCtx->Driver.RenderTexture = radeon_render_texture;
+  radeon->glCtx->Driver.FinishRenderTexture = radeon_finish_render_texture;
+  radeon->glCtx->Driver.ResizeBuffers = radeon_resize_buffers;
+  radeon->glCtx->Driver.ValidateFramebuffer = radeon_validate_framebuffer;
+  radeon->glCtx->Driver.BlitFramebuffer = _mesa_meta_blit_framebuffer; /* the only hook here that is not a radeon_* function */
+}
+
+/* Point rb at bo: the new buffer object is referenced before the previous one is released. */
+void radeon_renderbuffer_set_bo(struct radeon_renderbuffer *rb,
+				struct radeon_bo *bo)
+{
+  struct radeon_bo *previous = rb->bo;
+
+  rb->bo = bo;
+  radeon_bo_ref(bo);
+  if (previous != NULL)
+    radeon_bo_unref(previous);
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_ioctl.c b/src/mesa/drivers/dri/radeon/radeon_ioctl.c
index 09acf6b4f8..a0106d00fa 100644
--- a/src/mesa/drivers/dri/radeon/radeon_ioctl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_ioctl.c
@@ -35,7 +35,21 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
 #include <sched.h>
-#include <errno.h> 
+#include <errno.h>
+
+#include "main/attrib.h"
+#include "main/enable.h"
+#include "main/blend.h"
+#include "main/bufferobj.h"
+#include "main/buffers.h"
+#include "main/depth.h"
+#include "main/shaders.h"
+#include "main/texstate.h"
+#include "main/varray.h"
+#include "glapi/dispatch.h"
+#include "swrast/swrast.h"
+#include "main/stencil.h"
+#include "main/matrix.h"
 
 #include "main/glheader.h"
 #include "main/imports.h"
@@ -43,6 +57,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "swrast/swrast.h"
 
 #include "radeon_context.h"
+#include "radeon_common.h"
 #include "radeon_state.h"
 #include "radeon_ioctl.h"
 #include "radeon_tcl.h"
@@ -58,75 +73,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define RADEON_IDLE_RETRY           16
 
 
-static void radeonWaitForIdle( radeonContextPtr rmesa );
-static int radeonFlushCmdBufLocked( radeonContextPtr rmesa, 
-				    const char * caller );
-
-static void print_state_atom( struct radeon_state_atom *state )
-{
-   int i;
-
-   fprintf(stderr, "emit %s/%d\n", state->name, state->cmd_size);
-
-   if (RADEON_DEBUG & DEBUG_VERBOSE) 
-      for (i = 0 ; i < state->cmd_size ; i++) 
-	 fprintf(stderr, "\t%s[%d]: %x\n", state->name, i, state->cmd[i]);
-
-}
-
-static void radeonSaveHwState( radeonContextPtr rmesa )
-{
-   struct radeon_state_atom *atom;
-   char * dest = rmesa->backup_store.cmd_buf;
-
-   if (RADEON_DEBUG & DEBUG_STATE)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-   
-   rmesa->backup_store.cmd_used = 0;
-
-   foreach( atom, &rmesa->hw.atomlist ) {
-      if ( atom->check( rmesa->glCtx ) ) {
-	 int size = atom->cmd_size * 4;
-	 memcpy( dest, atom->cmd, size);
-	 dest += size;
-	 rmesa->backup_store.cmd_used += size;
-	 if (RADEON_DEBUG & DEBUG_STATE)
-	    print_state_atom( atom );
-      }
-   }
-
-   assert( rmesa->backup_store.cmd_used <= RADEON_CMD_BUF_SZ );
-   if (RADEON_DEBUG & DEBUG_STATE)
-      fprintf(stderr, "Returning to radeonEmitState\n");
-}
-
-/* At this point we were in FlushCmdBufLocked but we had lost our context, so
- * we need to unwire our current cmdbuf, hook the one with the saved state in
- * it, flush it, and then put the current one back.  This is so commands at the
- * start of a cmdbuf can rely on the state being kept from the previous one.
- */
-static void radeonBackUpAndEmitLostStateLocked( radeonContextPtr rmesa )
-{
-   GLuint nr_released_bufs;
-   struct radeon_store saved_store;
-
-   if (rmesa->backup_store.cmd_used == 0)
-      return;
-
-   if (RADEON_DEBUG & DEBUG_STATE)
-      fprintf(stderr, "Emitting backup state on lost context\n");
-
-   rmesa->lost_context = GL_FALSE;
-
-   nr_released_bufs = rmesa->dma.nr_released_bufs;
-   saved_store = rmesa->store;
-   rmesa->dma.nr_released_bufs = 0;
-   rmesa->store = rmesa->backup_store;
-   radeonFlushCmdBufLocked( rmesa, __FUNCTION__ );
-   rmesa->dma.nr_released_bufs = nr_released_bufs;
-   rmesa->store = saved_store;
-}
-
 /* =============================================================
  * Kernel command buffer handling
  */
@@ -134,965 +80,382 @@ static void radeonBackUpAndEmitLostStateLocked( radeonContextPtr rmesa )
 /* The state atoms will be emitted in the order they appear in the atom list,
  * so this step is important.
  */
-void radeonSetUpAtomList( radeonContextPtr rmesa )
+void radeonSetUpAtomList( r100ContextPtr rmesa )
 {
-   int i, mtu = rmesa->glCtx->Const.MaxTextureUnits;
-
-   make_empty_list(&rmesa->hw.atomlist);
-   rmesa->hw.atomlist.name = "atom-list";
-
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.ctx);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.set);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.lin);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.msk);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.vpt);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.tcl);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.msc);
+   int i, mtu = rmesa->radeon.glCtx->Const.MaxTextureUnits;
+
+   make_empty_list(&rmesa->radeon.hw.atomlist);
+   rmesa->radeon.hw.atomlist.name = "atom-list";
+
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.ctx);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.set);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.lin);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.msk);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.vpt);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.tcl);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.msc);
    for (i = 0; i < mtu; ++i) {
-       insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.tex[i]);
-       insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.txr[i]);
-       insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.cube[i]);
+       insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.tex[i]);
+       insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.txr[i]);
+       insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.cube[i]);
    }
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.zbs);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.mtl);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.zbs);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.mtl);
    for (i = 0; i < 3 + mtu; ++i)
-      insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.mat[i]);
+      insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.mat[i]);
    for (i = 0; i < 8; ++i)
-      insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.lit[i]);
+      insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.lit[i]);
    for (i = 0; i < 6; ++i)
-      insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.ucp[i]);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.eye);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.grd);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.fog);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.glt);
+      insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.ucp[i]);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.eye);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.grd);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.fog);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.glt);
 }
 
-void radeonEmitState( radeonContextPtr rmesa )
+static void radeonEmitScissor(r100ContextPtr rmesa)
 {
-   struct radeon_state_atom *atom;
-   char *dest;
-
-   if (RADEON_DEBUG & (DEBUG_STATE|DEBUG_PRIMS))
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   if (rmesa->save_on_next_emit) {
-      radeonSaveHwState(rmesa);
-      rmesa->save_on_next_emit = GL_FALSE;
-   }
-
-   /* this code used to return here but now it emits zbs */
-
-   /* To avoid going across the entire set of states multiple times, just check
-    * for enough space for the case of emitting all state, and inline the
-    * radeonAllocCmdBuf code here without all the checks.
-    */
-   radeonEnsureCmdBufSpace(rmesa, rmesa->hw.max_state_size);
-   dest = rmesa->store.cmd_buf + rmesa->store.cmd_used;
-
-   /* We always always emit zbs, this is due to a bug found by keithw in
-      the hardware and rediscovered after Erics changes by me.
-      if you ever touch this code make sure you emit zbs otherwise
-      you get tcl lockups on at least M7/7500 class of chips - airlied */
-   rmesa->hw.zbs.dirty=1;
-
-   if (RADEON_DEBUG & DEBUG_STATE) {
-      foreach(atom, &rmesa->hw.atomlist) {
-	 if (atom->dirty || rmesa->hw.all_dirty) {
-	    if (atom->check(rmesa->glCtx))
-	       print_state_atom(atom);
-	    else
-	       fprintf(stderr, "skip state %s\n", atom->name);
-	 }
-      }
-   }
-
-   foreach(atom, &rmesa->hw.atomlist) {
-      if (rmesa->hw.all_dirty)
-	 atom->dirty = GL_TRUE;
-      if (!(rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) &&
-	   atom->is_tcl)
-	 atom->dirty = GL_FALSE;
-      if (atom->dirty) {
-	 if (atom->check(rmesa->glCtx)) {
-	    int size = atom->cmd_size * 4;
-	    memcpy(dest, atom->cmd, size);
-	    dest += size;
-	    rmesa->store.cmd_used += size;
-	    atom->dirty = GL_FALSE;
-	 }
-      }
-   }
-
-   assert(rmesa->store.cmd_used <= RADEON_CMD_BUF_SZ);
- 
-   rmesa->hw.is_dirty = GL_FALSE;
-   rmesa->hw.all_dirty = GL_FALSE;
+    BATCH_LOCALS(&rmesa->radeon);
+    if (!rmesa->radeon.radeonScreen->kernel_mm) {
+       return;
+    }
+    if (rmesa->radeon.state.scissor.enabled) {
+        BEGIN_BATCH(6);
+        OUT_BATCH(CP_PACKET0(RADEON_PP_CNTL, 0));
+        OUT_BATCH(rmesa->hw.ctx.cmd[CTX_PP_CNTL] | RADEON_SCISSOR_ENABLE);
+        OUT_BATCH(CP_PACKET0(RADEON_RE_TOP_LEFT, 0));
+        OUT_BATCH((rmesa->radeon.state.scissor.rect.y1 << 16) |
+                  rmesa->radeon.state.scissor.rect.x1);
+        OUT_BATCH(CP_PACKET0(RADEON_RE_WIDTH_HEIGHT, 0));
+        OUT_BATCH(((rmesa->radeon.state.scissor.rect.y2) << 16) |
+                  (rmesa->radeon.state.scissor.rect.x2));
+        END_BATCH();
+    } else {
+        BEGIN_BATCH(2);
+        OUT_BATCH(CP_PACKET0(RADEON_PP_CNTL, 0));
+        OUT_BATCH(rmesa->hw.ctx.cmd[CTX_PP_CNTL] & ~RADEON_SCISSOR_ENABLE);
+        END_BATCH();
+    }
 }
 
 /* Fire a section of the retained (indexed_verts) buffer as a regular
- * primtive.  
+ * primitive.
  */
-extern void radeonEmitVbufPrim( radeonContextPtr rmesa,
+extern void radeonEmitVbufPrim( r100ContextPtr rmesa,
 				GLuint vertex_format,
 				GLuint primitive,
 				GLuint vertex_nr )
 {
-   drm_radeon_cmd_header_t *cmd;
-
+   BATCH_LOCALS(&rmesa->radeon);
 
    assert(!(primitive & RADEON_CP_VC_CNTL_PRIM_WALK_IND));
-   
-   radeonEmitState( rmesa );
-
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s cmd_used/4: %d\n", __FUNCTION__,
-	      rmesa->store.cmd_used/4);
-   
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, VBUF_BUFSZ,
-						       __FUNCTION__ );
+
+   radeonEmitState(&rmesa->radeon);
+   radeonEmitScissor(rmesa);
+
 #if RADEON_OLD_PACKETS
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
-   cmd[1].i = RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM | (3 << 16);
-   cmd[2].i = rmesa->ioctl.vertex_offset;
-   cmd[3].i = vertex_nr;
-   cmd[4].i = vertex_format;
-   cmd[5].i = (primitive | 
-	       RADEON_CP_VC_CNTL_PRIM_WALK_LIST |
-	       RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
-	       RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
-	       (vertex_nr << RADEON_CP_VC_CNTL_NUM_SHIFT));
-
-   if (RADEON_DEBUG & DEBUG_PRIMS)
-      fprintf(stderr, "%s: header 0x%x offt 0x%x vfmt 0x%x vfcntl %x \n",
-	      __FUNCTION__,
-	      cmd[1].i, cmd[2].i, cmd[4].i, cmd[5].i);
+   BEGIN_BATCH(8);
+   OUT_BATCH_PACKET3_CLIP(RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM, 3);
+   if (!rmesa->radeon.radeonScreen->kernel_mm) {
+     OUT_BATCH_RELOC(rmesa->ioctl.vertex_offset, rmesa->ioctl.bo, rmesa->ioctl.vertex_offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
+   } else {
+     OUT_BATCH(rmesa->ioctl.vertex_offset);
+   }
+
+   OUT_BATCH(vertex_nr);
+   OUT_BATCH(vertex_format);
+   OUT_BATCH(primitive |  RADEON_CP_VC_CNTL_PRIM_WALK_LIST |
+	     RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
+	     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
+	     (vertex_nr << RADEON_CP_VC_CNTL_NUM_SHIFT));
+
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+     radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+			   rmesa->ioctl.bo,
+			   RADEON_GEM_DOMAIN_GTT,
+			   0, 0);
+   }
+
+   END_BATCH();
+
 #else
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
-   cmd[1].i = RADEON_CP_PACKET3_3D_DRAW_VBUF | (1 << 16);
-   cmd[2].i = vertex_format;
-   cmd[3].i = (primitive | 
-	       RADEON_CP_VC_CNTL_PRIM_WALK_LIST |
-	       RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
-	       RADEON_CP_VC_CNTL_MAOS_ENABLE |
-	       RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
-	       (vertex_nr << RADEON_CP_VC_CNTL_NUM_SHIFT));
-
-
-   if (RADEON_DEBUG & DEBUG_PRIMS)
-      fprintf(stderr, "%s: header 0x%x vfmt 0x%x vfcntl %x \n",
-	      __FUNCTION__,
-	      cmd[1].i, cmd[2].i, cmd[3].i);
+   BEGIN_BATCH(4);
+   OUT_BATCH_PACKET3_CLIP(RADEON_CP_PACKET3_3D_DRAW_VBUF, 1);
+   OUT_BATCH(vertex_format);
+   OUT_BATCH(primitive |
+	     RADEON_CP_VC_CNTL_PRIM_WALK_LIST |
+	     RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
+	     RADEON_CP_VC_CNTL_MAOS_ENABLE |
+	     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
+	     (vertex_nr << RADEON_CP_VC_CNTL_NUM_SHIFT));
+   END_BATCH();
 #endif
 }
 
-
-void radeonFlushElts( radeonContextPtr rmesa )
+void radeonFlushElts( GLcontext *ctx )
 {
-   int *cmd = (int *)(rmesa->store.cmd_buf + rmesa->store.elts_start);
-   int dwords;
-#if RADEON_OLD_PACKETS
-   int nr = (rmesa->store.cmd_used - (rmesa->store.elts_start + 24)) / 2;
-#else
-   int nr = (rmesa->store.cmd_used - (rmesa->store.elts_start + 16)) / 2;
-#endif
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&rmesa->radeon);
+   int nr;
+   uint32_t *cmd = (uint32_t *)(rmesa->radeon.cmdbuf.cs->packets + rmesa->tcl.elt_cmd_start);
+   int dwords = (rmesa->radeon.cmdbuf.cs->section_ndw - rmesa->radeon.cmdbuf.cs->section_cdw);
 
-   if (RADEON_DEBUG & DEBUG_IOCTL)
+   if (RADEON_DEBUG & RADEON_IOCTL)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
-   assert( rmesa->dma.flush == radeonFlushElts );
-   rmesa->dma.flush = NULL;
+   assert( rmesa->radeon.dma.flush == radeonFlushElts );
+   rmesa->radeon.dma.flush = NULL;
 
-   /* Cope with odd number of elts:
-    */
-   rmesa->store.cmd_used = (rmesa->store.cmd_used + 2) & ~2;
-   dwords = (rmesa->store.cmd_used - rmesa->store.elts_start) / 4;
+   nr = rmesa->tcl.elt_used;
 
 #if RADEON_OLD_PACKETS
-   cmd[1] |= (dwords - 3) << 16;
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+     dwords -= 2;
+   }
+#endif
+
+#if RADEON_OLD_PACKETS
+   cmd[1] |= (dwords + 3) << 16;
    cmd[5] |= nr << RADEON_CP_VC_CNTL_NUM_SHIFT;
 #else
-   cmd[1] |= (dwords - 3) << 16;
+   cmd[1] |= (dwords + 2) << 16;
    cmd[3] |= nr << RADEON_CP_VC_CNTL_NUM_SHIFT;
 #endif
 
-   if (RADEON_DEBUG & DEBUG_SYNC) {
+   rmesa->radeon.cmdbuf.cs->cdw += dwords;
+   rmesa->radeon.cmdbuf.cs->section_cdw += dwords;
+
+#if RADEON_OLD_PACKETS
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+      radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+			    rmesa->ioctl.bo,
+			    RADEON_GEM_DOMAIN_GTT,
+			    0, 0);
+   }
+#endif
+
+   END_BATCH();
+
+   if (RADEON_DEBUG & RADEON_SYNC) {
       fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
-      radeonFinish( rmesa->glCtx );
+      radeonFinish( rmesa->radeon.glCtx );
    }
-}
 
+}
 
-GLushort *radeonAllocEltsOpenEnded( radeonContextPtr rmesa,
+GLushort *radeonAllocEltsOpenEnded( r100ContextPtr rmesa,
 				    GLuint vertex_format,
 				    GLuint primitive,
 				    GLuint min_nr )
 {
-   drm_radeon_cmd_header_t *cmd;
    GLushort *retval;
+   int align_min_nr;
+   BATCH_LOCALS(&rmesa->radeon);
 
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s %d\n", __FUNCTION__, min_nr);
+   if (RADEON_DEBUG & RADEON_IOCTL)
+      fprintf(stderr, "%s %d prim %x\n", __FUNCTION__, min_nr, primitive);
 
    assert((primitive & RADEON_CP_VC_CNTL_PRIM_WALK_IND));
-   
-   radeonEmitState( rmesa );
-   
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa,
-						       ELTS_BUFSZ(min_nr),
-						       __FUNCTION__ );
+
+   radeonEmitState(&rmesa->radeon);
+   radeonEmitScissor(rmesa);
+
+   rmesa->tcl.elt_cmd_start = rmesa->radeon.cmdbuf.cs->cdw;
+
+   /* round up min_nr to align the state */
+   align_min_nr = (min_nr + 1) & ~1;
+
 #if RADEON_OLD_PACKETS
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
-   cmd[1].i = RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM;
-   cmd[2].i = rmesa->ioctl.vertex_offset;
-   cmd[3].i = 0xffff;
-   cmd[4].i = vertex_format;
-   cmd[5].i = (primitive | 
-	       RADEON_CP_VC_CNTL_PRIM_WALK_IND |
-	       RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
-	       RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE);
-
-   retval = (GLushort *)(cmd+6);
-#else   
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
-   cmd[1].i = RADEON_CP_PACKET3_3D_DRAW_INDX;
-   cmd[2].i = vertex_format;
-   cmd[3].i = (primitive | 
-	       RADEON_CP_VC_CNTL_PRIM_WALK_IND |
-	       RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
-	       RADEON_CP_VC_CNTL_MAOS_ENABLE |
-	       RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE);
-
-   retval = (GLushort *)(cmd+4);
+   BEGIN_BATCH_NO_AUTOSTATE(2+ELTS_BUFSZ(align_min_nr)/4);
+   OUT_BATCH_PACKET3_CLIP(RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM, 0);
+   if (!rmesa->radeon.radeonScreen->kernel_mm) {
+     OUT_BATCH_RELOC(rmesa->ioctl.vertex_offset, rmesa->ioctl.bo, rmesa->ioctl.vertex_offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
+   } else {
+     OUT_BATCH(rmesa->ioctl.vertex_offset);
+   }
+   OUT_BATCH(rmesa->ioctl.vertex_max);
+   OUT_BATCH(vertex_format);
+   OUT_BATCH(primitive |
+	     RADEON_CP_VC_CNTL_PRIM_WALK_IND |
+	     RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
+	     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE);
+#else
+   BEGIN_BATCH_NO_AUTOSTATE(ELTS_BUFSZ(align_min_nr)/4);
+   OUT_BATCH_PACKET3_CLIP(RADEON_CP_PACKET3_DRAW_INDX, 0);
+   OUT_BATCH(vertex_format);
+   OUT_BATCH(primitive |
+	     RADEON_CP_VC_CNTL_PRIM_WALK_IND |
+	     RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
+	     RADEON_CP_VC_CNTL_MAOS_ENABLE |
+	     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE);
 #endif
 
-   if (RADEON_DEBUG & DEBUG_PRIMS)
-      fprintf(stderr, "%s: header 0x%x vfmt 0x%x prim %x \n",
-	      __FUNCTION__,
-	      cmd[1].i, vertex_format, primitive);
 
-   assert(!rmesa->dma.flush);
-   rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-   rmesa->dma.flush = radeonFlushElts;
+   rmesa->tcl.elt_cmd_offset = rmesa->radeon.cmdbuf.cs->cdw;
+   rmesa->tcl.elt_used = min_nr;
 
-   rmesa->store.elts_start = ((char *)cmd) - rmesa->store.cmd_buf;
+   retval = (GLushort *)(rmesa->radeon.cmdbuf.cs->packets + rmesa->tcl.elt_cmd_offset);
 
-   return retval;
-}
+   if (RADEON_DEBUG & RADEON_RENDER)
+      fprintf(stderr, "%s: header prim %x \n",
+	      __FUNCTION__, primitive);
 
+   assert(!rmesa->radeon.dma.flush);
+   rmesa->radeon.glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+   rmesa->radeon.dma.flush = radeonFlushElts;
 
+   return retval;
+}
 
-void radeonEmitVertexAOS( radeonContextPtr rmesa,
+void radeonEmitVertexAOS( r100ContextPtr rmesa,
 			  GLuint vertex_size,
+			  struct radeon_bo *bo,
 			  GLuint offset )
 {
 #if RADEON_OLD_PACKETS
-   rmesa->ioctl.vertex_size = vertex_size;
    rmesa->ioctl.vertex_offset = offset;
+   rmesa->ioctl.bo = bo;	/* old-packet path only records bo/offset; nothing is emitted here */
 #else
-   drm_radeon_cmd_header_t *cmd;
+   BATCH_LOCALS(&rmesa->radeon);
 
-   if (RADEON_DEBUG & (DEBUG_PRIMS|DEBUG_IOCTL))
+   if (RADEON_DEBUG & (RADEON_PRIMS|RADEON_IOCTL))	/* RADEON_IOCTL: matches the other converted debug checks */
       fprintf(stderr, "%s:  vertex_size 0x%x offset 0x%x \n",
 	      __FUNCTION__, vertex_size, offset);
 
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, VERT_AOS_BUFSZ,
-						  __FUNCTION__ );
+   BEGIN_BATCH(7);
+   OUT_BATCH_PACKET3(RADEON_CP_PACKET3_3D_LOAD_VBPNTR, 2);
+   OUT_BATCH(1);	/* array count; radeonEmitAOS emits nr in this slot */
+   OUT_BATCH(vertex_size | (vertex_size << 8));	/* same value in bits 0-7 and 8-15 */
+   OUT_BATCH_RELOC(offset, bo, offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
+   END_BATCH();
 
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
-   cmd[1].i = RADEON_CP_PACKET3_3D_LOAD_VBPNTR | (2 << 16);
-   cmd[2].i = 1;
-   cmd[3].i = vertex_size | (vertex_size << 8);
-   cmd[4].i = offset;
 #endif
 }
-		       
 
-void radeonEmitAOS( radeonContextPtr rmesa,
-		    struct radeon_dma_region **component,
+
+void radeonEmitAOS( r100ContextPtr rmesa,
 		    GLuint nr,
 		    GLuint offset )
 {
 #if RADEON_OLD_PACKETS
    assert( nr == 1 );
-   assert( component[0]->aos_size == component[0]->aos_stride );
-   rmesa->ioctl.vertex_size = component[0]->aos_size;
-   rmesa->ioctl.vertex_offset = 
-      (component[0]->aos_start + offset * component[0]->aos_stride * 4);
+   rmesa->ioctl.bo = rmesa->radeon.tcl.aos[0].bo;
+   rmesa->ioctl.vertex_offset =
+     (rmesa->radeon.tcl.aos[0].offset + offset * rmesa->radeon.tcl.aos[0].stride * 4);
+   rmesa->ioctl.vertex_max = rmesa->radeon.tcl.aos[0].count;
 #else
-   drm_radeon_cmd_header_t *cmd;
-   int sz = AOS_BUFSZ(nr);
+   BATCH_LOCALS(&rmesa->radeon);
+   uint32_t voffset;
+   /* was AOS_BUFSZ(nr); now 1 dword for nr, 3 per pair of arrays, 2 for an odd last array */
+   int sz = 1 + (nr >> 1) * 3 + (nr & 1) * 2;
    int i;
-   int *tmp;
 
-   if (RADEON_DEBUG & DEBUG_IOCTL)
+   if (RADEON_DEBUG & RADEON_IOCTL)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
-
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, sz,
-						  __FUNCTION__ );
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
-   cmd[1].i = RADEON_CP_PACKET3_3D_LOAD_VBPNTR | (((sz / sizeof(int))-3) << 16);
-   cmd[2].i = nr;
-   tmp = &cmd[0].i;
-   cmd += 3;
-
-   for (i = 0 ; i < nr ; i++) {
-      if (i & 1) {
-	 cmd[0].i |= ((component[i]->aos_stride << 24) | 
-		      (component[i]->aos_size << 16));
-	 cmd[2].i = (component[i]->aos_start + 
-		     offset * component[i]->aos_stride * 4);
-	 cmd += 3;
-      }
-      else {
-	 cmd[0].i = ((component[i]->aos_stride << 8) | 
-		     (component[i]->aos_size << 0));
-	 cmd[1].i = (component[i]->aos_start + 
-		     offset * component[i]->aos_stride * 4);
+   BEGIN_BATCH(sz+2+(nr * 2));
+   OUT_BATCH_PACKET3(RADEON_CP_PACKET3_3D_LOAD_VBPNTR, sz - 1);
+   OUT_BATCH(nr);
+
+   if (!rmesa->radeon.radeonScreen->kernel_mm) {
+      for (i = 0; i + 1 < nr; i += 2) {
+	 OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
+		   (rmesa->radeon.tcl.aos[i].stride << 8) |
+		   (rmesa->radeon.tcl.aos[i + 1].components << 16) |
+		   (rmesa->radeon.tcl.aos[i + 1].stride << 24));
+
+	 voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+	 OUT_BATCH_RELOC(voffset,
+			 rmesa->radeon.tcl.aos[i].bo,
+			 voffset,
+			 RADEON_GEM_DOMAIN_GTT,
+			 0, 0);
+	 voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+	 OUT_BATCH_RELOC(voffset,
+			 rmesa->radeon.tcl.aos[i+1].bo,
+			 voffset,
+			 RADEON_GEM_DOMAIN_GTT,
+			 0, 0);
       }
-   }
 
-   if (RADEON_DEBUG & DEBUG_VERTS) {
-      fprintf(stderr, "%s:\n", __FUNCTION__);
-      for (i = 0 ; i < sz ; i++)
-	 fprintf(stderr, "   %d: %x\n", i, tmp[i]);
-   }
-#endif
-}
-
-/* using already shifted color_fmt! */
-void radeonEmitBlit( radeonContextPtr rmesa, /* FIXME: which drmMinor is required? */
-		   GLuint color_fmt,
-		   GLuint src_pitch,
-		   GLuint src_offset,
-		   GLuint dst_pitch,
-		   GLuint dst_offset,
-		   GLint srcx, GLint srcy,
-		   GLint dstx, GLint dsty,
-		   GLuint w, GLuint h )
-{
-   drm_radeon_cmd_header_t *cmd;
-
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s src %x/%x %d,%d dst: %x/%x %d,%d sz: %dx%d\n",
-	      __FUNCTION__, 
-	      src_pitch, src_offset, srcx, srcy,
-	      dst_pitch, dst_offset, dstx, dsty,
-	      w, h);
-
-   assert( (src_pitch & 63) == 0 );
-   assert( (dst_pitch & 63) == 0 );
-   assert( (src_offset & 1023) == 0 ); 
-   assert( (dst_offset & 1023) == 0 ); 
-   assert( w < (1<<16) );
-   assert( h < (1<<16) );
-
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, 8 * sizeof(int),
-						  __FUNCTION__ );
-
-
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
-   cmd[1].i = RADEON_CP_PACKET3_CNTL_BITBLT_MULTI | (5 << 16);
-   cmd[2].i = (RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
-	       RADEON_GMC_DST_PITCH_OFFSET_CNTL |
-	       RADEON_GMC_BRUSH_NONE |
-	       color_fmt |
-	       RADEON_GMC_SRC_DATATYPE_COLOR |
-	       RADEON_ROP3_S |
-	       RADEON_DP_SRC_SOURCE_MEMORY |
-	       RADEON_GMC_CLR_CMP_CNTL_DIS |
-	       RADEON_GMC_WR_MSK_DIS );
-
-   cmd[3].i = ((src_pitch/64)<<22) | (src_offset >> 10);
-   cmd[4].i = ((dst_pitch/64)<<22) | (dst_offset >> 10);
-   cmd[5].i = (srcx << 16) | srcy;
-   cmd[6].i = (dstx << 16) | dsty; /* dst */
-   cmd[7].i = (w << 16) | h;
-}
-
-
-void radeonEmitWait( radeonContextPtr rmesa, GLuint flags )
-{
-   drm_radeon_cmd_header_t *cmd;
-
-   assert( !(flags & ~(RADEON_WAIT_2D|RADEON_WAIT_3D)) );
-
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, 1 * sizeof(int),
-					   __FUNCTION__ );
-   cmd[0].i = 0;
-   cmd[0].wait.cmd_type = RADEON_CMD_WAIT;
-   cmd[0].wait.flags = flags;
-}
-
-
-static int radeonFlushCmdBufLocked( radeonContextPtr rmesa, 
-				    const char * caller )
-{
-   int ret, i;
-   drm_radeon_cmd_buffer_t cmd;
-
-   if (rmesa->lost_context)
-      radeonBackUpAndEmitLostStateLocked(rmesa);
-
-   if (RADEON_DEBUG & DEBUG_IOCTL) {
-      fprintf(stderr, "%s from %s\n", __FUNCTION__, caller); 
-
-      if (RADEON_DEBUG & DEBUG_VERBOSE) 
-	 for (i = 0 ; i < rmesa->store.cmd_used ; i += 4 )
-	    fprintf(stderr, "%d: %x\n", i/4, 
-		    *(int *)(&rmesa->store.cmd_buf[i]));
-   }
-
-   if (RADEON_DEBUG & DEBUG_DMA)
-      fprintf(stderr, "%s: Releasing %d buffers\n", __FUNCTION__,
-	      rmesa->dma.nr_released_bufs);
-
-
-   if (RADEON_DEBUG & DEBUG_SANITY) {
-      if (rmesa->state.scissor.enabled) 
-	 ret = radeonSanityCmdBuffer( rmesa, 
-				      rmesa->state.scissor.numClipRects,
-				      rmesa->state.scissor.pClipRects);
-      else
-	 ret = radeonSanityCmdBuffer( rmesa, 
-				      rmesa->numClipRects,
-				      rmesa->pClipRects);
-      if (ret) {
-	 fprintf(stderr, "drmSanityCommandWrite: %d\n", ret);	 
-	 goto out;
+      if (nr & 1) {
+	 OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
+		   (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
+	 voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+	 OUT_BATCH_RELOC(voffset,
+			 rmesa->radeon.tcl.aos[nr - 1].bo,
+			 voffset,
+			 RADEON_GEM_DOMAIN_GTT,
+			 0, 0);
       }
-   }
-
-
-   cmd.bufsz = rmesa->store.cmd_used;
-   cmd.buf = rmesa->store.cmd_buf;
-
-   if (rmesa->state.scissor.enabled) {
-      cmd.nbox = rmesa->state.scissor.numClipRects;
-      cmd.boxes = rmesa->state.scissor.pClipRects;
    } else {
-      cmd.nbox = rmesa->numClipRects;
-      cmd.boxes = rmesa->pClipRects;
-   }
-
-   ret = drmCommandWrite( rmesa->dri.fd,
-			  DRM_RADEON_CMDBUF,
-			  &cmd, sizeof(cmd) );
-
-   if (ret)
-      fprintf(stderr, "drmCommandWrite: %d\n", ret);
-
-   if (RADEON_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "\nSyncing in %s\n\n", __FUNCTION__);
-      radeonWaitForIdleLocked( rmesa );
-   }
-
- out:
-   rmesa->store.primnr = 0;
-   rmesa->store.statenr = 0;
-   rmesa->store.cmd_used = 0;
-   rmesa->dma.nr_released_bufs = 0;
-   rmesa->save_on_next_emit = 1;
-
-   return ret;
-}
-
-
-/* Note: does not emit any commands to avoid recursion on
- * radeonAllocCmdBuf.
- */
-void radeonFlushCmdBuf( radeonContextPtr rmesa, const char *caller )
-{
-   int ret;
-
-	      
-   LOCK_HARDWARE( rmesa );
-
-   ret = radeonFlushCmdBufLocked( rmesa, caller );
-
-   UNLOCK_HARDWARE( rmesa );
-
-   if (ret) {
-      fprintf(stderr, "drm_radeon_cmd_buffer_t: %d (exiting)\n", ret);
-      exit(ret);
-   }
-}
-
-/* =============================================================
- * Hardware vertex buffer handling
- */
-
-
-void radeonRefillCurrentDmaRegion( radeonContextPtr rmesa )
-{
-   struct radeon_dma_buffer *dmabuf;
-   int fd = rmesa->dri.fd;
-   int index = 0;
-   int size = 0;
-   drmDMAReq dma;
-   int ret;
-
-   if (RADEON_DEBUG & (DEBUG_IOCTL|DEBUG_DMA))
-      fprintf(stderr, "%s\n", __FUNCTION__);  
-
-   if (rmesa->dma.flush) {
-      rmesa->dma.flush( rmesa );
-   }
-
-   if (rmesa->dma.current.buf)
-      radeonReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
-
-   if (rmesa->dma.nr_released_bufs > 4)
-      radeonFlushCmdBuf( rmesa, __FUNCTION__ );
-
-   dma.context = rmesa->dri.hwContext;
-   dma.send_count = 0;
-   dma.send_list = NULL;
-   dma.send_sizes = NULL;
-   dma.flags = 0;
-   dma.request_count = 1;
-   dma.request_size = RADEON_BUFFER_SIZE;
-   dma.request_list = &index;
-   dma.request_sizes = &size;
-   dma.granted_count = 0;
-
-   LOCK_HARDWARE(rmesa);	/* no need to validate */
-
-   ret = drmDMA( fd, &dma );
-      
-   if (ret != 0) {
-      /* Free some up this way?
-       */
-      if (rmesa->dma.nr_released_bufs) {
-	 radeonFlushCmdBufLocked( rmesa, __FUNCTION__ );
-      }
-      
-      if (RADEON_DEBUG & DEBUG_DMA)
-	 fprintf(stderr, "Waiting for buffers\n");
-
-      radeonWaitForIdleLocked( rmesa );
-      ret = drmDMA( fd, &dma );
-
-      if ( ret != 0 ) {
-	 UNLOCK_HARDWARE( rmesa );
-	 fprintf( stderr, "Error: Could not get dma buffer... exiting\n" );
-	 exit( -1 );
-      }
-   }
-
-   UNLOCK_HARDWARE(rmesa);
-
-   if (RADEON_DEBUG & DEBUG_DMA)
-      fprintf(stderr, "Allocated buffer %d\n", index);
-
-   dmabuf = CALLOC_STRUCT( radeon_dma_buffer );
-   dmabuf->buf = &rmesa->radeonScreen->buffers->list[index];
-   dmabuf->refcount = 1;
-
-   rmesa->dma.current.buf = dmabuf;
-   rmesa->dma.current.address = dmabuf->buf->address;
-   rmesa->dma.current.end = dmabuf->buf->total;
-   rmesa->dma.current.start = 0;
-   rmesa->dma.current.ptr = 0;
-
-   rmesa->c_vertexBuffers++;
-}
-
-void radeonReleaseDmaRegion( radeonContextPtr rmesa,
-			     struct radeon_dma_region *region,
-			     const char *caller )
-{
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s from %s\n", __FUNCTION__, caller); 
-   
-   if (!region->buf)
-      return;
-
-   if (rmesa->dma.flush)
-      rmesa->dma.flush( rmesa );
-
-   if (--region->buf->refcount == 0) {
-      drm_radeon_cmd_header_t *cmd;
-
-      if (RADEON_DEBUG & (DEBUG_IOCTL|DEBUG_DMA))
-	 fprintf(stderr, "%s -- DISCARD BUF %d\n", __FUNCTION__,
-		 region->buf->buf->idx);  
-      
-      cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, sizeof(*cmd), 
-						     __FUNCTION__ );
-      cmd->dma.cmd_type = RADEON_CMD_DMA_DISCARD;
-      cmd->dma.buf_idx = region->buf->buf->idx;
-      FREE(region->buf);
-      rmesa->dma.nr_released_bufs++;
-   }
-
-   region->buf = NULL;
-   region->start = 0;
-}
-
-/* Allocates a region from rmesa->dma.current.  If there isn't enough
- * space in current, grab a new buffer (and discard what was left of current)
- */
-void radeonAllocDmaRegion( radeonContextPtr rmesa, 
-			   struct radeon_dma_region *region,
-			   int bytes,
-			   int alignment )
-{
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
-
-   if (rmesa->dma.flush)
-      rmesa->dma.flush( rmesa );
-
-   if (region->buf)
-      radeonReleaseDmaRegion( rmesa, region, __FUNCTION__ );
-
-   alignment--;
-   rmesa->dma.current.start = rmesa->dma.current.ptr = 
-      (rmesa->dma.current.ptr + alignment) & ~alignment;
-
-   if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
-      radeonRefillCurrentDmaRegion( rmesa );
-
-   region->start = rmesa->dma.current.start;
-   region->ptr = rmesa->dma.current.start;
-   region->end = rmesa->dma.current.start + bytes;
-   region->address = rmesa->dma.current.address;
-   region->buf = rmesa->dma.current.buf;
-   region->buf->refcount++;
-
-   rmesa->dma.current.ptr += bytes; /* bug - if alignment > 7 */
-   rmesa->dma.current.start = 
-      rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;  
-}
-
-/* ================================================================
- * SwapBuffers with client-side throttling
- */
-
-static uint32_t radeonGetLastFrame (radeonContextPtr rmesa) 
-{
-   drm_radeon_getparam_t gp;
-   int ret;
-   uint32_t frame;
-
-   gp.param = RADEON_PARAM_LAST_FRAME;
-   gp.value = (int *)&frame;
-   ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_GETPARAM,
-			      &gp, sizeof(gp) );
-
-   if ( ret ) {
-      fprintf( stderr, "%s: drm_radeon_getparam_t: %d\n", __FUNCTION__, ret );
-      exit(1);
-   }
-
-   return frame;
-}
-
-static void radeonEmitIrqLocked( radeonContextPtr rmesa )
-{
-   drm_radeon_irq_emit_t ie;
-   int ret;
-
-   ie.irq_seq = &rmesa->iw.irq_seq;
-   ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_IRQ_EMIT, 
-			      &ie, sizeof(ie) );
-   if ( ret ) {
-      fprintf( stderr, "%s: drm_radeon_irq_emit_t: %d\n", __FUNCTION__, ret );
-      exit(1);
-   }
-}
-
-
-static void radeonWaitIrq( radeonContextPtr rmesa )
-{
-   int ret;
-
-   do {
-      ret = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_IRQ_WAIT,
-			     &rmesa->iw, sizeof(rmesa->iw) );
-   } while (ret && (errno == EINTR || errno == EBUSY));
-
-   if ( ret ) {
-      fprintf( stderr, "%s: drmRadeonIrqWait: %d\n", __FUNCTION__, ret );
-      exit(1);
-   }
-}
-
-
-static void radeonWaitForFrameCompletion( radeonContextPtr rmesa )
-{
-   drm_radeon_sarea_t *sarea = rmesa->sarea;
-
-   if (rmesa->do_irqs) {
-      if (radeonGetLastFrame(rmesa) < sarea->last_frame) {
-	 if (!rmesa->irqsEmitted) {
-	    while (radeonGetLastFrame (rmesa) < sarea->last_frame)
-	       ;
-	 }
-	 else {
-	    UNLOCK_HARDWARE( rmesa ); 
-	    radeonWaitIrq( rmesa );	
-	    LOCK_HARDWARE( rmesa ); 
-	 }
-	 rmesa->irqsEmitted = 10;
+      for (i = 0; i + 1 < nr; i += 2) {
+	 OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
+		   (rmesa->radeon.tcl.aos[i].stride << 8) |
+		   (rmesa->radeon.tcl.aos[i + 1].components << 16) |
+		   (rmesa->radeon.tcl.aos[i + 1].stride << 24));
+
+	 voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+	 OUT_BATCH(voffset);
+	 voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+	 OUT_BATCH(voffset);
       }
 
-      if (rmesa->irqsEmitted) {
-	 radeonEmitIrqLocked( rmesa );
-	 rmesa->irqsEmitted--;
+      if (nr & 1) {
+	 OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
+		   (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
+	 voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+	 OUT_BATCH(voffset);
       }
-   } 
-   else {
-      while (radeonGetLastFrame (rmesa) < sarea->last_frame) {
-	 UNLOCK_HARDWARE( rmesa ); 
-	 if (rmesa->do_usleeps) 
-	    DO_USLEEP( 1 );
-	 LOCK_HARDWARE( rmesa ); 
+      for (i = 0; i + 1 < nr; i += 2) {
+	 voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+			       rmesa->radeon.tcl.aos[i+0].bo,
+			       RADEON_GEM_DOMAIN_GTT,
+			       0, 0);
+	 voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+			       rmesa->radeon.tcl.aos[i+1].bo,
+			       RADEON_GEM_DOMAIN_GTT,
+			       0, 0);
       }
-   }
-}
-
-/* Copy the back color buffer to the front color buffer.
- */
-void radeonCopyBuffer( __DRIdrawablePrivate *dPriv,
-		       const drm_clip_rect_t	  *rect)
-{
-   radeonContextPtr rmesa;
-   GLint nbox, i, ret;
-   GLboolean   missed_target;
-   int64_t ust;
-   __DRIscreenPrivate *psp;
-
-   assert(dPriv);
-   assert(dPriv->driContextPriv);
-   assert(dPriv->driContextPriv->driverPrivate);
-
-   rmesa = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-
-   if ( RADEON_DEBUG & DEBUG_IOCTL ) {
-      fprintf( stderr, "\n%s( %p )\n\n", __FUNCTION__, (void *) rmesa->glCtx );
-   }
-
-   RADEON_FIREVERTICES( rmesa );
-   LOCK_HARDWARE( rmesa );
-
-   /* Throttle the frame rate -- only allow one pending swap buffers
-    * request at a time.
-    */
-   radeonWaitForFrameCompletion( rmesa );
-   if (!rect)
-   {
-       UNLOCK_HARDWARE( rmesa );
-       driWaitForVBlank( dPriv, & missed_target );
-       LOCK_HARDWARE( rmesa );
-   }
-
-   nbox = dPriv->numClipRects; /* must be in locked region */
-
-   for ( i = 0 ; i < nbox ; ) {
-      GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS , nbox );
-      drm_clip_rect_t *box = dPriv->pClipRects;
-      drm_clip_rect_t *b = rmesa->sarea->boxes;
-      GLint n = 0;
-
-      for ( ; i < nr ; i++ ) {
-
-	  *b = box[i];
-
-	  if (rect)
-	  {
-	      if (rect->x1 > b->x1)
-		  b->x1 = rect->x1;
-	      if (rect->y1 > b->y1)
-		  b->y1 = rect->y1;
-	      if (rect->x2 < b->x2)
-		  b->x2 = rect->x2;
-	      if (rect->y2 < b->y2)
-		  b->y2 = rect->y2;
-
-	      if (b->x1 >= b->x2 || b->y1 >= b->y2)
-		  continue;
-	  }
-
-	  b++;
-	  n++;
-      }
-      rmesa->sarea->nbox = n;
-
-      if (!n)
-	 continue;
-
-      ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_SWAP );
-
-      if ( ret ) {
-	 fprintf( stderr, "DRM_RADEON_SWAP_BUFFERS: return = %d\n", ret );
-	 UNLOCK_HARDWARE( rmesa );
-	 exit( 1 );
+      if (nr & 1) {
+	 voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+			       rmesa->radeon.tcl.aos[nr-1].bo,
+			       RADEON_GEM_DOMAIN_GTT,
+			       0, 0);
       }
    }
+   END_BATCH();
 
-   UNLOCK_HARDWARE( rmesa );
-   if (!rect)
-   {
-       psp = dPriv->driScreenPriv;
-       rmesa->swap_count++;
-       (*psp->systemTime->getUST)( & ust );
-       if ( missed_target ) {
-	   rmesa->swap_missed_count++;
-	   rmesa->swap_missed_ust = ust - rmesa->swap_ust;
-       }
-
-       rmesa->swap_ust = ust;
-       rmesa->hw.all_dirty = GL_TRUE;
-   }
-}
-
-void radeonPageFlip( __DRIdrawablePrivate *dPriv )
-{
-   radeonContextPtr rmesa;
-   GLint ret;
-   GLboolean   missed_target;
-   __DRIscreenPrivate *psp;
-
-   assert(dPriv);
-   assert(dPriv->driContextPriv);
-   assert(dPriv->driContextPriv->driverPrivate);
-
-   rmesa = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-   psp = dPriv->driScreenPriv;
-
-   if ( RADEON_DEBUG & DEBUG_IOCTL ) {
-      fprintf(stderr, "%s: pfCurrentPage: %d\n", __FUNCTION__,
-	      rmesa->sarea->pfCurrentPage);
-   }
-
-   RADEON_FIREVERTICES( rmesa );
-   LOCK_HARDWARE( rmesa );
-
-   /* Need to do this for the perf box placement:
-    */
-   if (dPriv->numClipRects)
-   {
-      drm_clip_rect_t *box = dPriv->pClipRects;
-      drm_clip_rect_t *b = rmesa->sarea->boxes;
-      b[0] = box[0];
-      rmesa->sarea->nbox = 1;
-   }
-
-   /* Throttle the frame rate -- only allow a few pending swap buffers
-    * request at a time.
-    */
-   radeonWaitForFrameCompletion( rmesa );
-   UNLOCK_HARDWARE( rmesa );
-   driWaitForVBlank( dPriv, & missed_target );
-   if ( missed_target ) {
-      rmesa->swap_missed_count++;
-      (void) (*psp->systemTime->getUST)( & rmesa->swap_missed_ust );
-   }
-   LOCK_HARDWARE( rmesa );
-
-   ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_FLIP );
-
-   UNLOCK_HARDWARE( rmesa );
-
-   if ( ret ) {
-      fprintf( stderr, "DRM_RADEON_FLIP: return = %d\n", ret );
-      exit( 1 );
-   }
-
-   rmesa->swap_count++;
-   (void) (*psp->systemTime->getUST)( & rmesa->swap_ust );
-
-   /* Get ready for drawing next frame.  Update the renderbuffers'
-    * flippedOffset/Pitch fields so we draw into the right place.
-    */
-   driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
-                        rmesa->sarea->pfCurrentPage);
-
-   radeonUpdateDrawBuffer(rmesa->glCtx);
+#endif
 }
 
-
 /* ================================================================
  * Buffer clear
  */
 #define RADEON_MAX_CLEARS	256
 
-static void radeonClear( GLcontext *ctx, GLbitfield mask )
+static void radeonKernelClear(GLcontext *ctx, GLuint flags)
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
-   drm_radeon_sarea_t *sarea = rmesa->sarea;
+     r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   drm_radeon_sarea_t *sarea = rmesa->radeon.sarea;
    uint32_t clear;
-   GLuint flags = 0;
-   GLuint color_mask = 0;
    GLint ret, i;
    GLint cx, cy, cw, ch;
 
-   if ( RADEON_DEBUG & DEBUG_IOCTL ) {
-      fprintf( stderr, "radeonClear\n");
-   }
-
-   {
-      LOCK_HARDWARE( rmesa );
-      UNLOCK_HARDWARE( rmesa );
-      if ( dPriv->numClipRects == 0 ) 
-	 return;
-   }
-   
-   radeonFlush( ctx ); 
-
-   if ( mask & BUFFER_BIT_FRONT_LEFT ) {
-      flags |= RADEON_FRONT;
-      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
-      mask &= ~BUFFER_BIT_FRONT_LEFT;
-   }
-
-   if ( mask & BUFFER_BIT_BACK_LEFT ) {
-      flags |= RADEON_BACK;
-      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
-      mask &= ~BUFFER_BIT_BACK_LEFT;
-   }
-
-   if ( mask & BUFFER_BIT_DEPTH ) {
-      flags |= RADEON_DEPTH;
-      mask &= ~BUFFER_BIT_DEPTH;
-   }
-
-   if ( (mask & BUFFER_BIT_STENCIL) && rmesa->state.stencil.hwBuffer ) {
-      flags |= RADEON_STENCIL;
-      mask &= ~BUFFER_BIT_STENCIL;
-   }
-
-   if ( mask ) {
-      if (RADEON_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr, "%s: swrast clear, mask: %x\n", __FUNCTION__, mask);
-      _swrast_Clear( ctx, mask );
-   }
-
-   if ( !flags ) 
-      return;
-
-   if (rmesa->using_hyperz) {
-      flags |= RADEON_USE_COMP_ZBUF;
-/*      if (rmesa->radeonScreen->chipset & RADEON_CHIPSET_TCL) 
-         flags |= RADEON_USE_HIERZ; */
-      if (!(rmesa->state.stencil.hwBuffer) ||
-	 ((flags & RADEON_DEPTH) && (flags & RADEON_STENCIL) &&
-	    ((rmesa->state.stencil.clear & RADEON_STENCIL_WRITE_MASK) == RADEON_STENCIL_WRITE_MASK))) {
-	  flags |= RADEON_CLEAR_FASTZ;
-      }
-   }
-
-   LOCK_HARDWARE( rmesa );
+   LOCK_HARDWARE( &rmesa->radeon );
 
    /* compute region after locking: */
    cx = ctx->DrawBuffer->_Xmin;
@@ -1112,7 +475,7 @@ static void radeonClear( GLcontext *ctx, GLbitfield mask )
 
       gp.param = RADEON_PARAM_LAST_CLEAR;
       gp.value = (int *)&clear;
-      ret = drmCommandWriteRead( rmesa->dri.fd,
+      ret = drmCommandWriteRead( rmesa->radeon.dri.fd,
 				 DRM_RADEON_GETPARAM, &gp, sizeof(gp) );
 
       if ( ret ) {
@@ -1124,20 +487,20 @@ static void radeonClear( GLcontext *ctx, GLbitfield mask )
 	 break;
       }
 
-      if ( rmesa->do_usleeps ) {
-	 UNLOCK_HARDWARE( rmesa );
+      if ( rmesa->radeon.do_usleeps ) {
+	 UNLOCK_HARDWARE( &rmesa->radeon );
 	 DO_USLEEP( 1 );
-	 LOCK_HARDWARE( rmesa );
+	 LOCK_HARDWARE( &rmesa->radeon );
       }
    }
 
    /* Send current state to the hardware */
-   radeonFlushCmdBufLocked( rmesa, __FUNCTION__ );
+   rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
 
    for ( i = 0 ; i < dPriv->numClipRects ; ) {
       GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS, dPriv->numClipRects );
       drm_clip_rect_t *box = dPriv->pClipRects;
-      drm_clip_rect_t *b = rmesa->sarea->boxes;
+      drm_clip_rect_t *b = rmesa->radeon.sarea->boxes;
       drm_radeon_clear_t clear;
       drm_radeon_clear_rect_t depth_boxes[RADEON_NR_SAREA_CLIPRECTS];
       GLint n = 0;
@@ -1172,105 +535,107 @@ static void radeonClear( GLcontext *ctx, GLbitfield mask )
 	 }
       }
 
-      rmesa->sarea->nbox = n;
+      rmesa->radeon.sarea->nbox = n;
 
       clear.flags       = flags;
-      clear.clear_color = rmesa->state.color.clear;
-      clear.clear_depth = rmesa->state.depth.clear;
+      clear.clear_color = rmesa->radeon.state.color.clear;
+      clear.clear_depth = rmesa->radeon.state.depth.clear;
       clear.color_mask  = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
-      clear.depth_mask  = rmesa->state.stencil.clear;
+      clear.depth_mask  = rmesa->radeon.state.stencil.clear;
       clear.depth_boxes = depth_boxes;
 
       n--;
-      b = rmesa->sarea->boxes;
+      b = rmesa->radeon.sarea->boxes;
       for ( ; n >= 0 ; n-- ) {
 	 depth_boxes[n].f[CLEAR_X1] = (float)b[n].x1;
 	 depth_boxes[n].f[CLEAR_Y1] = (float)b[n].y1;
 	 depth_boxes[n].f[CLEAR_X2] = (float)b[n].x2;
 	 depth_boxes[n].f[CLEAR_Y2] = (float)b[n].y2;
-	 depth_boxes[n].f[CLEAR_DEPTH] = 
-	    (float)rmesa->state.depth.clear;
+	 depth_boxes[n].f[CLEAR_DEPTH] =
+	    (float)rmesa->radeon.state.depth.clear;
       }
 
-      ret = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_CLEAR,
+      ret = drmCommandWrite( rmesa->radeon.dri.fd, DRM_RADEON_CLEAR,
 			     &clear, sizeof(drm_radeon_clear_t));
 
       if ( ret ) {
-	 UNLOCK_HARDWARE( rmesa );
+	 UNLOCK_HARDWARE( &rmesa->radeon );
 	 fprintf( stderr, "DRM_RADEON_CLEAR: return = %d\n", ret );
 	 exit( 1 );
       }
    }
-
-   UNLOCK_HARDWARE( rmesa );
-   rmesa->hw.all_dirty = GL_TRUE;
+   UNLOCK_HARDWARE( &rmesa->radeon );
 }
 
-
-void radeonWaitForIdleLocked( radeonContextPtr rmesa )
+static void radeonClear( GLcontext *ctx, GLbitfield mask )
 {
-    int fd = rmesa->dri.fd;
-    int to = 0;
-    int ret, i = 0;
-
-    rmesa->c_drawWaits++;
-
-    do {
-        do {
-            ret = drmCommandNone( fd, DRM_RADEON_CP_IDLE);
-        } while ( ret && errno == EBUSY && i++ < RADEON_IDLE_RETRY );
-    } while ( ( ret == -EBUSY ) && ( to++ < RADEON_TIMEOUT ) );
-
-    if ( ret < 0 ) {
-	UNLOCK_HARDWARE( rmesa );
-	fprintf( stderr, "Error: Radeon timed out... exiting\n" );
-	exit( -1 );
-    }
-}
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   GLuint flags = 0;
+   GLuint color_mask = 0;
+   GLuint orig_mask = mask;
 
+   if ( RADEON_DEBUG & RADEON_IOCTL ) {
+      fprintf( stderr, "radeonClear\n");
+   }
 
-static void radeonWaitForIdle( radeonContextPtr rmesa )
-{
-   LOCK_HARDWARE(rmesa);
-   radeonWaitForIdleLocked( rmesa );
-   UNLOCK_HARDWARE(rmesa);
-}
+   {
+      LOCK_HARDWARE( &rmesa->radeon );
+      UNLOCK_HARDWARE( &rmesa->radeon );
+      if ( dPriv->numClipRects == 0 )
+	 return;
+   }
 
+   radeon_firevertices(&rmesa->radeon);
 
-void radeonFlush( GLcontext *ctx )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   if ( mask & BUFFER_BIT_FRONT_LEFT ) {
+      flags |= RADEON_FRONT;
+      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
+      mask &= ~BUFFER_BIT_FRONT_LEFT;
+   }
 
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
+   if ( mask & BUFFER_BIT_BACK_LEFT ) {
+      flags |= RADEON_BACK;
+      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
+      mask &= ~BUFFER_BIT_BACK_LEFT;
+   }
 
-   if (rmesa->dma.flush)
-      rmesa->dma.flush( rmesa );
+   if ( mask & BUFFER_BIT_DEPTH ) {
+      flags |= RADEON_DEPTH;
+      mask &= ~BUFFER_BIT_DEPTH;
+   }
 
-   radeonEmitState( rmesa );
-   
-   if (rmesa->store.cmd_used)
-      radeonFlushCmdBuf( rmesa, __FUNCTION__ );
-}
+   if ( (mask & BUFFER_BIT_STENCIL) ) {
+      flags |= RADEON_STENCIL;
+      mask &= ~BUFFER_BIT_STENCIL;
+   }
 
-/* Make sure all commands have been sent to the hardware and have
- * completed processing.
- */
-void radeonFinish( GLcontext *ctx )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   radeonFlush( ctx );
-
-   if (rmesa->do_irqs) {
-      LOCK_HARDWARE( rmesa );
-      radeonEmitIrqLocked( rmesa );
-      UNLOCK_HARDWARE( rmesa );
-      radeonWaitIrq( rmesa );
+   if ( mask ) {
+      if (RADEON_DEBUG & RADEON_FALLBACKS)
+	 fprintf(stderr, "%s: swrast clear, mask: %x\n", __FUNCTION__, mask);
+      _swrast_Clear( ctx, mask );
    }
-   else
-      radeonWaitForIdle( rmesa );
-}
 
+   if ( !flags )
+      return;
+
+   if (rmesa->using_hyperz) {
+      flags |= RADEON_USE_COMP_ZBUF;
+/*      if (rmesa->radeon.radeonScreen->chipset & RADEON_CHIPSET_TCL)
+         flags |= RADEON_USE_HIERZ; */
+      if (((flags & RADEON_DEPTH) && (flags & RADEON_STENCIL) &&
+	    ((rmesa->radeon.state.stencil.clear & RADEON_STENCIL_WRITE_MASK) == RADEON_STENCIL_WRITE_MASK))) {
+	  flags |= RADEON_CLEAR_FASTZ;
+      }
+   }
+
+   if (rmesa->radeon.radeonScreen->kernel_mm)
+     radeonUserClear(ctx, orig_mask);
+   else {
+      radeonKernelClear(ctx, flags);
+      rmesa->radeon.hw.all_dirty = GL_TRUE;
+   }
+}
 
 void radeonInitIoctlFuncs( GLcontext *ctx )
 {
diff --git a/src/mesa/drivers/dri/radeon/radeon_ioctl.h b/src/mesa/drivers/dri/radeon/radeon_ioctl.h
index 4e3a44df07..deb53ae313 100644
--- a/src/mesa/drivers/dri/radeon/radeon_ioctl.h
+++ b/src/mesa/drivers/dri/radeon/radeon_ioctl.h
@@ -38,31 +38,32 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "main/simple_list.h"
 #include "radeon_lock.h"
+#include "radeon_bocs_wrapper.h"
 
-
-extern void radeonEmitState( radeonContextPtr rmesa );
-extern void radeonEmitVertexAOS( radeonContextPtr rmesa,
+extern void radeonEmitVertexAOS( r100ContextPtr rmesa,
 				 GLuint vertex_size,
+				 struct radeon_bo *bo,
 				 GLuint offset );
 
-extern void radeonEmitVbufPrim( radeonContextPtr rmesa,
+extern void radeonEmitVbufPrim( r100ContextPtr rmesa,
 				GLuint vertex_format,
 				GLuint primitive,
 				GLuint vertex_nr );
 
-extern void radeonFlushElts( radeonContextPtr rmesa );
+extern void radeonFlushElts( GLcontext *ctx );
+			    
 
-extern GLushort *radeonAllocEltsOpenEnded( radeonContextPtr rmesa,
+extern GLushort *radeonAllocEltsOpenEnded( r100ContextPtr rmesa,
 					   GLuint vertex_format,
 					   GLuint primitive,
 					   GLuint min_nr );
 
-extern void radeonEmitAOS( radeonContextPtr rmesa,
-			   struct radeon_dma_region **regions,
+
+extern void radeonEmitAOS( r100ContextPtr rmesa,
 			   GLuint n,
 			   GLuint offset );
 
-extern void radeonEmitBlit( radeonContextPtr rmesa,
+extern void radeonEmitBlit( r100ContextPtr rmesa,
 			    GLuint color_fmt,
 			    GLuint src_pitch,
 			    GLuint src_offset,
@@ -72,30 +73,15 @@ extern void radeonEmitBlit( radeonContextPtr rmesa,
 			    GLint dstx, GLint dsty,
 			    GLuint w, GLuint h );
 
-extern void radeonEmitWait( radeonContextPtr rmesa, GLuint flags );
-
-extern void radeonFlushCmdBuf( radeonContextPtr rmesa, const char * );
-extern void radeonRefillCurrentDmaRegion( radeonContextPtr rmesa );
+extern void radeonEmitWait( r100ContextPtr rmesa, GLuint flags );
 
-extern void radeonAllocDmaRegion( radeonContextPtr rmesa,
-				  struct radeon_dma_region *region,
-				  int bytes, 
-				  int alignment );
+extern void radeonFlushCmdBuf( r100ContextPtr rmesa, const char * );
 
-extern void radeonReleaseDmaRegion( radeonContextPtr rmesa,
-				    struct radeon_dma_region *region,
-				    const char *caller );
-
-extern void radeonCopyBuffer( __DRIdrawablePrivate *drawable,
-			      const drm_clip_rect_t	 *rect);
-extern void radeonPageFlip( __DRIdrawablePrivate *drawable );
 extern void radeonFlush( GLcontext *ctx );
 extern void radeonFinish( GLcontext *ctx );
-extern void radeonWaitForIdleLocked( radeonContextPtr rmesa );
-extern void radeonWaitForVBlank( radeonContextPtr rmesa );
 extern void radeonInitIoctlFuncs( GLcontext *ctx );
-extern void radeonGetAllParams( radeonContextPtr rmesa );
-extern void radeonSetUpAtomList( radeonContextPtr rmesa );
+extern void radeonGetAllParams( r100ContextPtr rmesa );
+extern void radeonSetUpAtomList( r100ContextPtr rmesa );
 
 /* ================================================================
  * Helper macros:
@@ -105,33 +91,33 @@ extern void radeonSetUpAtomList( radeonContextPtr rmesa );
  */
 #define RADEON_NEWPRIM( rmesa )			\
 do {						\
-   if ( rmesa->dma.flush )			\
-      rmesa->dma.flush( rmesa );	\
+   if ( rmesa->radeon.dma.flush )			\
+      rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	\
 } while (0)
 
 /* Can accomodate several state changes and primitive changes without
  * actually firing the buffer.
  */
+
 #define RADEON_STATECHANGE( rmesa, ATOM )			\
 do {								\
    RADEON_NEWPRIM( rmesa );					\
    rmesa->hw.ATOM.dirty = GL_TRUE;				\
-   rmesa->hw.is_dirty = GL_TRUE;				\
+   rmesa->radeon.hw.is_dirty = GL_TRUE;				\
 } while (0)
 
-#define RADEON_DB_STATE( ATOM )			        \
+#define RADEON_DB_STATE( ATOM )				\
    memcpy( rmesa->hw.ATOM.lastcmd, rmesa->hw.ATOM.cmd,	\
 	   rmesa->hw.ATOM.cmd_size * 4)
 
-static INLINE int RADEON_DB_STATECHANGE( 
-   radeonContextPtr rmesa,
-   struct radeon_state_atom *atom )
+static INLINE int RADEON_DB_STATECHANGE(r100ContextPtr rmesa,
+					struct radeon_state_atom *atom )
 {
    if (memcmp(atom->cmd, atom->lastcmd, atom->cmd_size*4)) {
-      int *tmp;
+      GLuint *tmp;
       RADEON_NEWPRIM( rmesa );
       atom->dirty = GL_TRUE;
-      rmesa->hw.is_dirty = GL_TRUE;
+      rmesa->radeon.hw.is_dirty = GL_TRUE;
       tmp = atom->cmd; 
       atom->cmd = atom->lastcmd;
       atom->lastcmd = tmp;
@@ -141,62 +127,55 @@ static INLINE int RADEON_DB_STATECHANGE(
       return 0;
 }
 
-
-/* Fire the buffered vertices no matter what.
- */
-#define RADEON_FIREVERTICES( rmesa )			\
-do {							\
-   if ( rmesa->store.cmd_used || rmesa->dma.flush ) {	\
-      radeonFlush( rmesa->glCtx );			\
-   }							\
-} while (0)
-
 /* Command lengths.  Note that any time you ensure ELTS_BUFSZ or VBUF_BUFSZ
  * are available, you will also be adding an rmesa->state.max_state_size because
  * r200EmitState is called from within r200EmitVbufPrim and r200FlushElts.
  */
 #if RADEON_OLD_PACKETS
-#define AOS_BUFSZ(nr)	((3 + ((nr / 2) * 3) + ((nr & 1) * 2)) * sizeof(int))
+#define AOS_BUFSZ(nr)	((3 + (((nr) / 2) * 3) + (((nr) & 1) * 2)) + (nr) * 2)
 #define VERT_AOS_BUFSZ	(0)
 #define ELTS_BUFSZ(nr)	(24 + nr * 2)
-#define VBUF_BUFSZ	(6 * sizeof(int))
+#define VBUF_BUFSZ	(8)
 #else
-#define AOS_BUFSZ(nr)	((3 + ((nr / 2) * 3) + ((nr & 1) * 2)) * sizeof(int))
-#define VERT_AOS_BUFSZ	(5 * sizeof(int))
+#define AOS_BUFSZ(nr)	((3 + (((nr) / 2) * 3) + (((nr) & 1) * 2) + (nr) * 2))
+#define VERT_AOS_BUFSZ	(5)
 #define ELTS_BUFSZ(nr)	(16 + nr * 2)
-#define VBUF_BUFSZ	(4 * sizeof(int))
+#define VBUF_BUFSZ	(4)
 #endif
+#define SCISSOR_BUFSZ	(8)
+#define INDEX_BUFSZ	(7)
 
-/* Ensure that a minimum amount of space is available in the command buffer.
- * This is used to ensure atomicity of state updates with the rendering requests
- * that rely on them.
- *
- * An alternative would be to implement a "soft lock" such that when the buffer
- * wraps at an inopportune time, we grab the lock, flush the current buffer,
- * and hang on to the lock until the critical section is finished and we flush
- * the buffer again and unlock.
- */
-static INLINE void radeonEnsureCmdBufSpace( radeonContextPtr rmesa,
-					      int bytes )
-{
-   if (rmesa->store.cmd_used + bytes > RADEON_CMD_BUF_SZ)
-      radeonFlushCmdBuf( rmesa, __FUNCTION__ );
-   assert( bytes <= RADEON_CMD_BUF_SZ );
-}
 
-/* Alloc space in the command buffer
- */
-static INLINE char *radeonAllocCmdBuf( radeonContextPtr rmesa,
-					 int bytes, const char *where )
+static inline uint32_t cmdpacket3(int cmd_type)
 {
-   if (rmesa->store.cmd_used + bytes > RADEON_CMD_BUF_SZ)
-      radeonFlushCmdBuf( rmesa, __FUNCTION__ );
+  drm_radeon_cmd_header_t cmd;
+
+  cmd.i = 0;
+  cmd.header.cmd_type = cmd_type;
+
+  return (uint32_t)cmd.i;
 
-   {
-      char *head = rmesa->store.cmd_buf + rmesa->store.cmd_used;
-      rmesa->store.cmd_used += bytes;
-      return head;
-   }
 }
 
+#define OUT_BATCH_PACKET3(packet, num_extra) do {	      \
+    if (!b_l_rmesa->radeonScreen->kernel_mm) {		      \
+      OUT_BATCH(cmdpacket3(RADEON_CMD_PACKET3));				      \
+      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
+    } else {						      \
+      OUT_BATCH(CP_PACKET2);				      \
+      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
+    }							      \
+  } while(0)
+
+#define OUT_BATCH_PACKET3_CLIP(packet, num_extra) do {	      \
+    if (!b_l_rmesa->radeonScreen->kernel_mm) {		      \
+      OUT_BATCH(cmdpacket3(RADEON_CMD_PACKET3_CLIP));	      \
+      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
+    } else {						      \
+      OUT_BATCH(CP_PACKET2);				      \
+      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
+    }							      \
+  } while(0)
+
+
 #endif /* __RADEON_IOCTL_H__ */
diff --git a/src/mesa/drivers/dri/radeon/radeon_lighting.c b/src/mesa/drivers/dri/radeon/radeon_lighting.c
index 6d9ccfa24d..ba444f2b10 100644
--- a/src/mesa/drivers/dri/radeon/radeon_lighting.c
+++ b/src/mesa/drivers/dri/radeon/radeon_lighting.c
@@ -195,7 +195,7 @@ void radeonUpdateMaterial( GLcontext *ctx )
    if (ctx->Light.ColorMaterialEnabled)
       mask &= ~ctx->Light.ColorMaterialBitmask;
 
-   if (RADEON_DEBUG & DEBUG_STATE)
+   if (RADEON_DEBUG & RADEON_STATE)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
       
@@ -234,7 +234,7 @@ void radeonUpdateMaterial( GLcontext *ctx )
       check_twoside_fallback( ctx );
       update_global_ambient( ctx );
    }
-   else if (RADEON_DEBUG & (DEBUG_PRIMS|DEBUG_STATE))
+   else if (RADEON_DEBUG & (RADEON_PRIMS|RADEON_STATE))
       fprintf(stderr, "%s: Elided noop material call\n", __FUNCTION__);
 }
 
@@ -246,7 +246,7 @@ void radeonUpdateMaterial( GLcontext *ctx )
  *       _VP_inf_norm
  *       _h_inf_norm
  *       _Position
- *       _NormDirection
+ *       _NormSpotDirection
  *       _ModelViewInvScale
  *       _NeedEyeCoords
  *       _EyeZDir
@@ -308,9 +308,9 @@ void radeonUpdateLighting( GLcontext *ctx )
 	       fcmd[LIT_DIRECTION_W] = 0;
 	    } else {
 	       COPY_4V( &fcmd[LIT_POSITION_X], l->_Position );
-	       fcmd[LIT_DIRECTION_X] = -l->_NormDirection[0];
-	       fcmd[LIT_DIRECTION_Y] = -l->_NormDirection[1];
-	       fcmd[LIT_DIRECTION_Z] = -l->_NormDirection[2];
+	       fcmd[LIT_DIRECTION_X] = -l->_NormSpotDirection[0];
+	       fcmd[LIT_DIRECTION_Y] = -l->_NormSpotDirection[1];
+	       fcmd[LIT_DIRECTION_Z] = -l->_NormSpotDirection[2];
 	       fcmd[LIT_DIRECTION_W] = 0;
 	    }
 
@@ -624,7 +624,7 @@ static void radeonLightingSpaceChange( GLcontext *ctx )
    GLboolean tmp;
    RADEON_STATECHANGE( rmesa, tcl );
 
-   if (RADEON_DEBUG & DEBUG_STATE)
+   if (RADEON_DEBUG & RADEON_STATE)
       fprintf(stderr, "%s %d\n", __FUNCTION__, ctx->_NeedEyeCoords);
 
    if (ctx->_NeedEyeCoords)
diff --git a/src/mesa/drivers/dri/radeon/radeon_lock.c b/src/mesa/drivers/dri/radeon/radeon_lock.c
index 64bb3ca103..02de8e5fd1 100644
--- a/src/mesa/drivers/dri/radeon/radeon_lock.c
+++ b/src/mesa/drivers/dri/radeon/radeon_lock.c
@@ -41,30 +41,13 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "main/glheader.h"
 #include "main/mtypes.h"
-#include "radeon_context.h"
+#include "main/colormac.h"
+#include "dri_util.h"
+#include "radeon_screen.h"
+#include "radeon_common.h"
 #include "radeon_lock.h"
-#include "radeon_tex.h"
-#include "radeon_state.h"
-#include "radeon_ioctl.h"
-
 #include "drirenderbuffer.h"
 
-#if DEBUG_LOCKING
-char *prevLockFile = NULL;
-int prevLockLine = 0;
-#endif
-
-/* Turn on/off page flipping according to the flags in the sarea:
- */
-static void radeonUpdatePageFlipping(radeonContextPtr rmesa)
-{
-	rmesa->doPageFlip = rmesa->sarea->pfState;
-	if (rmesa->glCtx->WinSysDrawBuffer) {
-		driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
-				     rmesa->sarea->pfCurrentPage);
-	}
-}
-
 /* Update the hardware state.  This is called if another context has
  * grabbed the hardware lock, which includes the X server.  This
  * function also updates the driver's window state after the X server
@@ -75,10 +58,11 @@ static void radeonUpdatePageFlipping(radeonContextPtr rmesa)
  */
 void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
 {
-	__DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
-	__DRIdrawablePrivate *const readable = rmesa->dri.readable;
+	__DRIdrawablePrivate *const drawable = radeon_get_drawable(rmesa);
+	__DRIdrawablePrivate *const readable = radeon_get_readable(rmesa);
 	__DRIscreenPrivate *sPriv = rmesa->dri.screen;
-	drm_radeon_sarea_t *sarea = rmesa->sarea;
+
+	assert(drawable != NULL);
 
 	drmGetLock(rmesa->dri.fd, rmesa->dri.hwContext, flags);
 
@@ -96,29 +80,89 @@ void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
 	}
 
 	if (rmesa->lastStamp != drawable->lastStamp) {
-		radeonUpdatePageFlipping(rmesa);
-		radeonSetCliprects(rmesa);
-		radeonUpdateViewportOffset(rmesa->glCtx);
-		driUpdateFramebufferSize(rmesa->glCtx, drawable);
+		radeon_window_moved(rmesa);
+		rmesa->lastStamp = drawable->lastStamp;
 	}
 
-	RADEON_STATECHANGE(rmesa, ctx);
-	if (rmesa->sarea->tiling_enabled) {
-		rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |=
-		    RADEON_COLOR_TILE_ENABLE;
-	} else {
-		rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] &=
-		    ~RADEON_COLOR_TILE_ENABLE;
-	}
+	rmesa->vtbl.get_lock(rmesa);
+}
+#ifndef NDEBUG
+struct lock_debug {
+	const char* function;
+	const char* file;
+	int line;
+};
+
+static struct lock_debug ldebug = {0};
+#endif
+
+#if 0
+/** TODO: use atomic operations for reference counting **/
+/** gcc 4.2 has builtin functions for this **/
+#define ATOMIC_INC_AND_FETCH(atomic) __sync_add_and_fetch(&atomic, 1)
+#define ATOMIC_DEC_AND_FETCH(atomic) __sync_sub_and_fetch(&atomic, 1)
+#else
+#define ATOMIC_INC_AND_FETCH(atomic) (++atomic)
+#define ATOMIC_DEC_AND_FETCH(atomic) (--atomic)
+#endif
+
 
-	if (sarea->ctx_owner != rmesa->dri.hwContext) {
-		int i;
-		sarea->ctx_owner = rmesa->dri.hwContext;
+void radeon_lock_hardware(radeonContextPtr radeon
+#ifndef NDEBUG
+		,const char* function
+		,const char* file
+		,const int line
+#endif
+		)
+{
+	char ret = 0;
+	struct radeon_framebuffer *rfb = NULL;
+	struct radeon_renderbuffer *rrb = NULL;
+
+	if (radeon_get_drawable(radeon)) {
+		rfb = radeon_get_drawable(radeon)->driverPrivate;
+
+		if (rfb)
+			rrb = radeon_get_renderbuffer(&rfb->base,
+						      rfb->base._ColorDrawBufferIndexes[0]);
+	}
 
-		for (i = 0; i < rmesa->nr_heaps; i++) {
-			DRI_AGE_TEXTURES(rmesa->texture_heaps[i]);
+	if (!radeon->radeonScreen->driScreen->dri2.enabled) {
+		if (ATOMIC_INC_AND_FETCH(radeon->dri.hwLockCount) > 1)
+		{
+#ifndef NDEBUG
+			if ( RADEON_DEBUG & RADEON_SANITY )
+				fprintf(stderr, "*** %d times of recursive call to %s ***\n"
+						"Original call was from %s (file: %s line: %d)\n"
+						"Now call is coming from %s (file: %s line: %d)\n"
+						, radeon->dri.hwLockCount, __FUNCTION__
+						, ldebug.function, ldebug.file, ldebug.line
+						, function, file, line
+					   );
+#endif
+			return;
 		}
+		DRM_CAS(radeon->dri.hwLock, radeon->dri.hwContext,
+			 (DRM_LOCK_HELD | radeon->dri.hwContext), ret );
+		if (ret)
+			radeonGetLock(radeon, 0);
+#ifndef NDEBUG
+		ldebug.function = function;
+		ldebug.file = file;
+		ldebug.line = line;
+#endif
 	}
+}
 
-	rmesa->lost_context = GL_TRUE;
+void radeon_unlock_hardware(radeonContextPtr radeon)
+{
+	if (!radeon->radeonScreen->driScreen->dri2.enabled) {
+		if (ATOMIC_DEC_AND_FETCH(radeon->dri.hwLockCount) > 0)
+		{
+			return;
+		}
+		DRM_UNLOCK( radeon->dri.fd,
+			    radeon->dri.hwLock,
+			    radeon->dri.hwContext );
+	}
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_lock.h b/src/mesa/drivers/dri/radeon/radeon_lock.h
index 86e96aa7d2..da5a5b4371 100644
--- a/src/mesa/drivers/dri/radeon/radeon_lock.h
+++ b/src/mesa/drivers/dri/radeon/radeon_lock.h
@@ -39,74 +39,31 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *   Kevin E. Martin <martin@valinux.com>
  */
 
-#ifndef __RADEON_LOCK_H__
-#define __RADEON_LOCK_H__
+#ifndef COMMON_LOCK_H
+#define COMMON_LOCK_H
 
-extern void radeonGetLock(radeonContextPtr rmesa, GLuint flags);
-
-/* Turn DEBUG_LOCKING on to find locking conflicts.
- */
-#define DEBUG_LOCKING	0
-
-#if DEBUG_LOCKING
-extern char *prevLockFile;
-extern int prevLockLine;
-
-#define DEBUG_LOCK()							\
-   do {									\
-      prevLockFile = (__FILE__);					\
-      prevLockLine = (__LINE__);					\
-   } while (0)
-
-#define DEBUG_RESET()							\
-   do {									\
-      prevLockFile = 0;							\
-      prevLockLine = 0;							\
-   } while (0)
-
-#define DEBUG_CHECK_LOCK()						\
-   do {									\
-      if ( prevLockFile ) {						\
-	 fprintf( stderr,						\
-		  "LOCK SET!\n\tPrevious %s:%d\n\tCurrent: %s:%d\n",	\
-		  prevLockFile, prevLockLine, __FILE__, __LINE__ );	\
-	 exit( 1 );							\
-      }									\
-   } while (0)
-
-#else
+#include "main/colormac.h"
+#include "radeon_screen.h"
+#include "radeon_common.h"
 
-#define DEBUG_LOCK()
-#define DEBUG_RESET()
-#define DEBUG_CHECK_LOCK()
+extern void radeonGetLock(radeonContextPtr rmesa, GLuint flags);
 
+void radeon_lock_hardware(radeonContextPtr rmesa
+#ifndef NDEBUG
+		,const char* function
+		,const char* file
+		,const int line
 #endif
-
-/*
- * !!! We may want to separate locks from locks with validation.  This
- * could be used to improve performance for those things commands that
- * do not do any drawing !!!
- */
+		);
+void radeon_unlock_hardware(radeonContextPtr rmesa);
 
 /* Lock the hardware and validate our state.
  */
-#define LOCK_HARDWARE( rmesa )					\
-   do {								\
-      char __ret = 0;						\
-      DEBUG_CHECK_LOCK();					\
-      DRM_CAS( (rmesa)->dri.hwLock, (rmesa)->dri.hwContext,		\
-	       (DRM_LOCK_HELD | (rmesa)->dri.hwContext), __ret );	\
-      if ( __ret )						\
-	 radeonGetLock( (rmesa), 0 );				\
-      DEBUG_LOCK();						\
-   } while (0)
-
-#define UNLOCK_HARDWARE( rmesa )					\
-   do {									\
-      DRM_UNLOCK( (rmesa)->dri.fd,					\
-		  (rmesa)->dri.hwLock,					\
-		  (rmesa)->dri.hwContext );				\
-      DEBUG_RESET();							\
-   } while (0)
+#ifdef NDEBUG
+#define LOCK_HARDWARE( rmesa )	radeon_lock_hardware(rmesa)
+#else
+#define LOCK_HARDWARE( rmesa )	radeon_lock_hardware(rmesa, __FUNCTION__, __FILE__, __LINE__)
+#endif
+#define UNLOCK_HARDWARE( rmesa )  radeon_unlock_hardware(rmesa)
 
-#endif				/* __RADEON_LOCK_H__ */
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_maos.h b/src/mesa/drivers/dri/radeon/radeon_maos.h
index b8935e84a0..b88eb198d5 100644
--- a/src/mesa/drivers/dri/radeon/radeon_maos.h
+++ b/src/mesa/drivers/dri/radeon/radeon_maos.h
@@ -38,6 +38,5 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_context.h"
 
 extern void radeonEmitArrays( GLcontext *ctx, GLuint inputs );
-extern void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs );
 
 #endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c b/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c
index de3c3a15a7..08e1c5d00d 100644
--- a/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c
+++ b/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c
@@ -40,7 +40,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "swrast_setup/swrast_setup.h"
 #include "math/m_translate.h"
 #include "tnl/tnl.h"
-#include "tnl/tcontext.h"
 
 #include "radeon_context.h"
 #include "radeon_ioctl.h"
@@ -49,160 +48,35 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_maos.h"
 #include "radeon_tcl.h"
 
-#if 0
-/* Usage:
- *   - from radeon_tcl_render
- *   - call radeonEmitArrays to ensure uptodate arrays in dma
- *   - emit primitives (new type?) which reference the data
- *       -- need to use elts for lineloop, quads, quadstrip/flat
- *       -- other primitives are all well-formed (need tristrip-1,fake-poly)
- *
- */
-static void emit_ubyte_rgba3( GLcontext *ctx,
-		       struct radeon_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
-{
-   int i;
-   radeon_color_t *out = (radeon_color_t *)(rvb->start + rvb->address);
-
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d out %p\n",
-	      __FUNCTION__, count, stride, (void *)out);
-
-   for (i = 0; i < count; i++) {
-      out->red   = *data;
-      out->green = *(data+1);
-      out->blue  = *(data+2);
-      out->alpha = 0xFF;
-      out++;
-      data += stride;
-   }
-}
-
-static void emit_ubyte_rgba4( GLcontext *ctx,
-			      struct radeon_dma_region *rvb,
-			      char *data,
-			      int stride,
-			      int count )
+static void emit_vecfog(GLcontext *ctx, struct radeon_aos *aos,
+			GLvoid *data, int stride, int count)
 {
    int i;
-   int *out = (int *)(rvb->address + rvb->start);
+   uint32_t *out;
+   int size = 1;
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
 
-   if (RADEON_DEBUG & DEBUG_VERTS)
+   if (RADEON_DEBUG & RADEON_VERTS)
       fprintf(stderr, "%s count %d stride %d\n",
 	      __FUNCTION__, count, stride);
 
-   if (stride == 4)
-       COPY_DWORDS( out, data, count );
-   else
-      for (i = 0; i < count; i++) {
-	 *out++ = LE32_TO_CPU(*(int *)data);
-	 data += stride;
-      }
-}
-
-
-static void emit_ubyte_rgba( GLcontext *ctx,
-			     struct radeon_dma_region *rvb,
-			     char *data,
-			     int size,
-			     int stride,
-			     int count )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s %d/%d\n", __FUNCTION__, count, size);
-
-   assert (!rvb->buf);
-
    if (stride == 0) {
-      radeonAllocDmaRegion( rmesa, rvb, 4, 4 );
+      radeonAllocDmaRegion( rmesa, &aos->bo, &aos->offset, size * 4, 32 );
       count = 1;
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 0;
-      rvb->aos_size = 1;
+      aos->stride = 0;
    }
    else {
-      radeonAllocDmaRegion( rmesa, rvb, 4 * count, 4 );	/* alignment? */
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 1;
-      rvb->aos_size = 1;
-   }
-
-   /* Emit the data
-    */
-   switch (size) {
-   case 3:
-      emit_ubyte_rgba3( ctx, rvb, data, stride, count );
-      break;
-   case 4:
-      emit_ubyte_rgba4( ctx, rvb, data, stride, count );
-      break;
-   default:
-      assert(0);
-      exit(1);
-      break;
+      radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * count * 4, 32); /* room for all count entries */
+      aos->stride = size;
    }
-}
-#endif
-
-#if defined(USE_X86_ASM)
-#define COPY_DWORDS( dst, src, nr )					\
-do {									\
-	int __tmp;							\
-	__asm__ __volatile__( "rep ; movsl"				\
-			      : "=%c" (__tmp), "=D" (dst), "=S" (__tmp)	\
-			      : "0" (nr),				\
-			        "D" ((long)dst),			\
-			        "S" ((long)src) );			\
-} while (0)
-#else
-#define COPY_DWORDS( dst, src, nr )		\
-do {						\
-   int j;					\
-   for ( j = 0 ; j < nr ; j++ )			\
-      dst[j] = ((int *)src)[j];			\
-   dst += nr;					\
-} while (0)
-#endif
-
-static void emit_vecfog( GLcontext *ctx,
-			 struct radeon_dma_region *rvb,
-			 char *data,
-			 int stride,
-			 int count )
-{
-   int i;
-   GLfloat *out;
-
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
 
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
+   aos->components = size;
+   aos->count = count;
 
-   assert (!rvb->buf);
-
-   if (stride == 0) {
-      radeonAllocDmaRegion( rmesa, rvb, 4, 4 );
-      count = 1;
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 0;
-      rvb->aos_size = 1;
-   }
-   else {
-      radeonAllocDmaRegion( rmesa, rvb, count * 4, 4 );	/* alignment? */
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 1;
-      rvb->aos_size = 1;
-   }
 
    /* Emit the data
     */
-   out = (GLfloat *)(rvb->address + rvb->start);
+   out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
    for (i = 0; i < count; i++) {
       out[0] = radeonComputeFogBlendFactor( ctx, *(GLfloat *)data );
       out++;
@@ -210,170 +84,10 @@ static void emit_vecfog( GLcontext *ctx,
    }
 }
 
-static void emit_vec4( GLcontext *ctx,
-		       struct radeon_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   if (stride == 4)
-      COPY_DWORDS( out, data, count );
-   else
-      for (i = 0; i < count; i++) {
-	 out[0] = *(int *)data;
-	 out++;
-	 data += stride;
-      }
-}
-
-
-static void emit_vec8( GLcontext *ctx,
-		       struct radeon_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   if (stride == 8)
-      COPY_DWORDS( out, data, count*2 );
-   else
-      for (i = 0; i < count; i++) {
-	 out[0] = *(int *)data;
-	 out[1] = *(int *)(data+4);
-	 out += 2;
-	 data += stride;
-      }
-}
-
-static void emit_vec12( GLcontext *ctx,
-		       struct radeon_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-	      __FUNCTION__, count, stride, (void *)out, (void *)data);
-
-   if (stride == 12)
-      COPY_DWORDS( out, data, count*3 );
-   else
-      for (i = 0; i < count; i++) {
-	 out[0] = *(int *)data;
-	 out[1] = *(int *)(data+4);
-	 out[2] = *(int *)(data+8);
-	 out += 3;
-	 data += stride;
-      }
-}
-
-static void emit_vec16( GLcontext *ctx,
-			struct radeon_dma_region *rvb,
-			char *data,
-			int stride,
-			int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   if (stride == 16)
-      COPY_DWORDS( out, data, count*4 );
-   else
-      for (i = 0; i < count; i++) {
-	 out[0] = *(int *)data;
-	 out[1] = *(int *)(data+4);
-	 out[2] = *(int *)(data+8);
-	 out[3] = *(int *)(data+12);
-	 out += 4;
-	 data += stride;
-      }
-}
-
-
-static void emit_vector( GLcontext *ctx,
-			 struct radeon_dma_region *rvb,
-			 char *data,
-			 int size,
-			 int stride,
-			 int count )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d size %d stride %d\n",
-	      __FUNCTION__, count, size, stride);
-
-   assert (!rvb->buf);
-
-   if (stride == 0) {
-      radeonAllocDmaRegion( rmesa, rvb, size * 4, 4 );
-      count = 1;
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 0;
-      rvb->aos_size = size;
-   }
-   else {
-      radeonAllocDmaRegion( rmesa, rvb, size * count * 4, 4 );	/* alignment? */
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = size;
-      rvb->aos_size = size;
-   }
-
-   /* Emit the data
-    */
-   switch (size) {
-   case 1:
-      emit_vec4( ctx, rvb, data, stride, count );
-      break;
-   case 2:
-      emit_vec8( ctx, rvb, data, stride, count );
-      break;
-   case 3:
-      emit_vec12( ctx, rvb, data, stride, count );
-      break;
-   case 4:
-      emit_vec16( ctx, rvb, data, stride, count );
-      break;
-   default:
-      assert(0);
-      exit(1);
-      break;
-   }
-
-}
-
-
-
-static void emit_s0_vec( GLcontext *ctx,
-			 struct radeon_dma_region *rvb,
-			 char *data,
-			 int stride,
-			 int count )
+static void emit_s0_vec(uint32_t *out, GLvoid *data, int stride, int count)
 {
    int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (RADEON_DEBUG & DEBUG_VERTS)
+   if (RADEON_DEBUG & RADEON_VERTS)
       fprintf(stderr, "%s count %d stride %d\n",
 	      __FUNCTION__, count, stride);
 
@@ -385,16 +99,11 @@ static void emit_s0_vec( GLcontext *ctx,
    }
 }
 
-static void emit_stq_vec( GLcontext *ctx,
-			 struct radeon_dma_region *rvb,
-			 char *data,
-			 int stride,
-			 int count )
+static void emit_stq_vec(uint32_t *out, GLvoid *data, int stride, int count)
 {
    int i;
-   int *out = (int *)(rvb->address + rvb->start);
 
-   if (RADEON_DEBUG & DEBUG_VERTS)
+   if (RADEON_DEBUG & RADEON_VERTS)
       fprintf(stderr, "%s count %d stride %d\n",
 	      __FUNCTION__, count, stride);
 
@@ -410,21 +119,16 @@ static void emit_stq_vec( GLcontext *ctx,
 
 
 
-static void emit_tex_vector( GLcontext *ctx,
-			     struct radeon_dma_region *rvb,
-			     char *data,
-			     int size,
-			     int stride,
-			     int count )
+static void emit_tex_vector(GLcontext *ctx, struct radeon_aos *aos,
+			    GLvoid *data, int size, int stride, int count)
 {
    radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
    int emitsize;
+   uint32_t *out;
 
-   if (RADEON_DEBUG & DEBUG_VERTS)
+   if (RADEON_DEBUG & RADEON_VERTS)
       fprintf(stderr, "%s %d/%d\n", __FUNCTION__, count, size);
 
-   assert (!rvb->buf);
-
    switch (size) {
    case 4: emitsize = 3; break;
    case 3: emitsize = 3; break;
@@ -433,34 +137,33 @@ static void emit_tex_vector( GLcontext *ctx,
 
 
    if (stride == 0) {
-      radeonAllocDmaRegion( rmesa, rvb, 4 * emitsize, 4 );
+      radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, emitsize * 4, 32);
       count = 1;
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 0;
-      rvb->aos_size = emitsize;
+      aos->stride = 0;
    }
    else {
-      radeonAllocDmaRegion( rmesa, rvb, 4 * emitsize * count, 4 );
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = emitsize;
-      rvb->aos_size = emitsize;
+      radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, emitsize * count * 4, 32);
+      aos->stride = emitsize;
    }
 
+   aos->components = emitsize;
+   aos->count = count;
 
    /* Emit the data
     */
+   out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
    switch (size) {
    case 1:
-      emit_s0_vec( ctx, rvb, data, stride, count ); 
+      emit_s0_vec( out, data, stride, count );
       break;
    case 2:
-      emit_vec8( ctx, rvb, data, stride, count );
+      radeonEmitVec8( out, data, stride, count );
       break;
    case 3:
-      emit_vec12( ctx, rvb, data, stride, count );
+      radeonEmitVec12( out, data, stride, count );
       break;
    case 4:
-      emit_stq_vec( ctx, rvb, data, stride, count );
+      emit_stq_vec( out, data, stride, count );
       break;
    default:
       assert(0);
@@ -477,27 +180,26 @@ static void emit_tex_vector( GLcontext *ctx,
  */
 void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   r100ContextPtr rmesa = R100_CONTEXT( ctx );
    struct vertex_buffer *VB = &TNL_CONTEXT( ctx )->vb;
-   struct radeon_dma_region **component = rmesa->tcl.aos_components;
    GLuint nr = 0;
    GLuint vfmt = 0;
    GLuint count = VB->Count;
    GLuint vtx, unit;
    
 #if 0
-   if (RADEON_DEBUG & DEBUG_VERTS) 
+   if (RADEON_DEBUG & RADEON_VERTS)
       _tnl_print_vert_flags( __FUNCTION__, inputs );
 #endif
 
    if (1) {
       if (!rmesa->tcl.obj.buf) 
-	 emit_vector( ctx, 
-		      &rmesa->tcl.obj, 
-		      (char *)VB->ObjPtr->data,
-		      VB->ObjPtr->size,
-		      VB->ObjPtr->stride,
-		      count);
+	rcommon_emit_vector( ctx, 
+			     &(rmesa->tcl.aos[nr]),
+			     (char *)VB->ObjPtr->data,
+			     VB->ObjPtr->size,
+			     VB->ObjPtr->stride,
+			     count);
 
       switch( VB->ObjPtr->size ) {
       case 4: vfmt |= RADEON_CP_VC_FRMT_W0;
@@ -506,21 +208,21 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
       default:
          break;
       }
-      component[nr++] = &rmesa->tcl.obj;
+      nr++;
    }
    
 
    if (inputs & VERT_BIT_NORMAL) {
       if (!rmesa->tcl.norm.buf)
-	 emit_vector( ctx, 
-		      &(rmesa->tcl.norm), 
-		      (char *)VB->NormalPtr->data,
-		      3,
-		      VB->NormalPtr->stride,
-		      count);
+	 rcommon_emit_vector( ctx, 
+			      &(rmesa->tcl.aos[nr]),
+			      (char *)VB->NormalPtr->data,
+			      3,
+			      VB->NormalPtr->stride,
+			      count);
 
       vfmt |= RADEON_CP_VC_FRMT_N0;
-      component[nr++] = &rmesa->tcl.norm;
+      nr++;
    }
 
    if (inputs & VERT_BIT_COLOR0) {
@@ -538,31 +240,30 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
       }
 
       if (!rmesa->tcl.rgba.buf)
-	 emit_vector( ctx,
-		      &(rmesa->tcl.rgba),
-		      (char *)VB->ColorPtr[0]->data,
-		      emitsize,
-		      VB->ColorPtr[0]->stride,
-		      count);
-
-
-      component[nr++] = &rmesa->tcl.rgba;
+	rcommon_emit_vector( ctx,
+			     &(rmesa->tcl.aos[nr]),
+			     (char *)VB->ColorPtr[0]->data,
+			     emitsize,
+			     VB->ColorPtr[0]->stride,
+			     count);
+
+      nr++;
    }
 
 
    if (inputs & VERT_BIT_COLOR1) {
       if (!rmesa->tcl.spec.buf) {
 
-	 emit_vector( ctx,
-		      &rmesa->tcl.spec,
-		      (char *)VB->SecondaryColorPtr[0]->data,
-		      3,
-		      VB->SecondaryColorPtr[0]->stride,
-		      count);
+	rcommon_emit_vector( ctx,
+			     &(rmesa->tcl.aos[nr]),
+			     (char *)VB->SecondaryColorPtr[0]->data,
+			     3,
+			     VB->SecondaryColorPtr[0]->stride,
+			     count);
       }
 
       vfmt |= RADEON_CP_VC_FRMT_FPSPEC;
-      component[nr++] = &rmesa->tcl.spec;
+      nr++;
    }
 
 /* FIXME: not sure if this is correct. May need to stitch this together with
@@ -571,13 +272,13 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
    if (inputs & VERT_BIT_FOG) {
       if (!rmesa->tcl.fog.buf)
 	 emit_vecfog( ctx,
-		      &(rmesa->tcl.fog),
+		      &(rmesa->tcl.aos[nr]),
 		      (char *)VB->FogCoordPtr->data,
 		      VB->FogCoordPtr->stride,
 		      count);
 
       vfmt |= RADEON_CP_VC_FRMT_FPFOG;
-      component[nr++] = &rmesa->tcl.fog;
+      nr++;
    }
 
 
@@ -588,11 +289,12 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
       if (inputs & VERT_BIT_TEX(unit)) {
 	 if (!rmesa->tcl.tex[unit].buf)
 	    emit_tex_vector( ctx,
-			     &(rmesa->tcl.tex[unit]),
+			     &(rmesa->tcl.aos[nr]),
 			     (char *)VB->TexCoordPtr[unit]->data,
 			     VB->TexCoordPtr[unit]->size,
 			     VB->TexCoordPtr[unit]->stride,
 			     count );
+	 nr++;
 
 	 vfmt |= RADEON_ST_BIT(unit);
          /* assume we need the 3rd coord if texgen is active for r/q OR at least
@@ -610,7 +312,6 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 		 (swaptexmatcol != ((rmesa->TexMatColSwap >> unit) & 1)))
 	       radeonUploadTexMatrix( rmesa, unit, swaptexmatcol ) ;
 	 }
-	 component[nr++] = &rmesa->tcl.tex[unit];
       }
    }
 
@@ -623,34 +324,3 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
    rmesa->tcl.vertex_format = vfmt;
 }
 
-
-void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
-   GLuint unit;
-
-#if 0
-   if (RADEON_DEBUG & DEBUG_VERTS) 
-      _tnl_print_vert_flags( __FUNCTION__, newinputs );
-#endif
-
-   if (newinputs & VERT_BIT_POS) 
-     radeonReleaseDmaRegion( rmesa, &rmesa->tcl.obj, __FUNCTION__ );
-
-   if (newinputs & VERT_BIT_NORMAL) 
-      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.norm, __FUNCTION__ );
-
-   if (newinputs & VERT_BIT_COLOR0) 
-      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.rgba, __FUNCTION__ );
-
-   if (newinputs & VERT_BIT_COLOR1) 
-      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.spec, __FUNCTION__ );
-      
-   if (newinputs & VERT_BIT_FOG)
-      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.fog, __FUNCTION__ );
-
-   for (unit = 0 ; unit < ctx->Const.MaxTextureUnits; unit++) {
-      if (newinputs & VERT_BIT_TEX(unit))
-         radeonReleaseDmaRegion( rmesa, &rmesa->tcl.tex[unit], __FUNCTION__ );
-   }
-}
diff --git a/src/mesa/drivers/dri/radeon/radeon_maos_vbtmp.h b/src/mesa/drivers/dri/radeon/radeon_maos_vbtmp.h
index 034cda8a65..515783135d 100644
--- a/src/mesa/drivers/dri/radeon/radeon_maos_vbtmp.h
+++ b/src/mesa/drivers/dri/radeon/radeon_maos_vbtmp.h
@@ -54,8 +54,7 @@ static void TAG(emit)( GLcontext *ctx,
 
    union emit_union *v = (union emit_union *)dest;
 
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s\n", __FUNCTION__); 
+   radeon_print(RADEON_SWRENDER, RADEON_VERBOSE, "%s\n", __FUNCTION__);
 
    coord = (GLuint (*)[4])VB->ObjPtr->data;
    coord_stride = VB->ObjPtr->stride;
diff --git a/src/mesa/drivers/dri/radeon/radeon_maos_verts.c b/src/mesa/drivers/dri/radeon/radeon_maos_verts.c
index 126d0727c6..78ec119302 100644
--- a/src/mesa/drivers/dri/radeon/radeon_maos_verts.c
+++ b/src/mesa/drivers/dri/radeon/radeon_maos_verts.c
@@ -310,7 +310,7 @@ static void init_tcl_verts( void )
 
 void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
    GLuint req = 0;
    GLuint unit;
@@ -374,14 +374,15 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 	 break;
 
    if (rmesa->tcl.vertex_format == setup_tab[i].vertex_format &&
-       rmesa->tcl.indexed_verts.buf)
+       rmesa->radeon.tcl.aos[0].bo)
       return;
 
-   if (rmesa->tcl.indexed_verts.buf)
+   if (rmesa->radeon.tcl.aos[0].bo)
       radeonReleaseArrays( ctx, ~0 );
 
-   radeonAllocDmaRegion( rmesa,
-			 &rmesa->tcl.indexed_verts, 
+   radeonAllocDmaRegion( &rmesa->radeon,
+			 &rmesa->radeon.tcl.aos[0].bo,
+			 &rmesa->radeon.tcl.aos[0].offset,
 			 VB->Count * setup_tab[i].vertex_size * 4, 
 			 4);
 
@@ -421,29 +422,12 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 
 
    setup_tab[i].emit( ctx, 0, VB->Count, 
-		      rmesa->tcl.indexed_verts.address + 
-		      rmesa->tcl.indexed_verts.start );
+		      rmesa->radeon.tcl.aos[0].bo->ptr + rmesa->radeon.tcl.aos[0].offset);
 
+   //   rmesa->radeon.tcl.aos[0].size = setup_tab[i].vertex_size;
+   rmesa->radeon.tcl.aos[0].stride = setup_tab[i].vertex_size;
    rmesa->tcl.vertex_format = setup_tab[i].vertex_format;
-   rmesa->tcl.indexed_verts.aos_start = GET_START( &rmesa->tcl.indexed_verts );
-   rmesa->tcl.indexed_verts.aos_size = setup_tab[i].vertex_size;
-   rmesa->tcl.indexed_verts.aos_stride = setup_tab[i].vertex_size;
-
-   rmesa->tcl.aos_components[0] = &rmesa->tcl.indexed_verts;
-   rmesa->tcl.nr_aos_components = 1;
+   rmesa->radeon.tcl.aos_count = 1;
 }
 
 
-
-void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
-
-#if 0
-   if (RADEON_DEBUG & DEBUG_VERTS) 
-      _tnl_print_vert_flags( __FUNCTION__, newinputs );
-#endif
-
-   if (newinputs) 
-     radeonReleaseDmaRegion( rmesa, &rmesa->tcl.indexed_verts, __FUNCTION__ );
-}
diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
new file mode 100644
index 0000000000..38db305e2a
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_mipmap_tree.h"
+
+#include <errno.h>
+#include <unistd.h>
+
+#include "main/simple_list.h"
+#include "main/texcompress.h"
+#include "main/texformat.h"
+
+/* Byte size of a compressed image: Mesa's size, multiplied up when the
+ * image is narrow (see the width tests below).
+ */
+static GLuint radeon_compressed_texture_size(GLcontext *ctx,
+		GLsizei width, GLsizei height, GLsizei depth,
+		GLuint mesaFormat)
+{
+	const int is_dxt1 = (mesaFormat == MESA_FORMAT_RGB_DXT1 ||
+			     mesaFormat == MESA_FORMAT_RGBA_DXT1);
+	GLuint bytes = _mesa_compressed_texture_size(ctx, width, height,
+						     depth, mesaFormat);
+
+	/* DXT1: x4 when width + 3 < 8, x2 when width + 3 < 16.
+	 * Any other format: x2 when width + 3 < 8. */
+	if (width + 3 < 8)
+		bytes *= is_dxt1 ? 4 : 2;
+	else if (is_dxt1 && width + 3 < 16)
+		bytes *= 2;
+
+	return bytes;
+}
+
+
+/* Value stored as the miptree bpp for a compressed format:
+ * 2 for FXT1/DXT1, 4 for DXT3/DXT5, 0 for anything else. */
+static int radeon_compressed_num_bytes(GLuint mesaFormat)
+{
+   int bytes = 0;
+   switch (mesaFormat) {
+   case MESA_FORMAT_RGB_FXT1:
+   case MESA_FORMAT_RGBA_FXT1:
+   case MESA_FORMAT_RGB_DXT1:
+   case MESA_FORMAT_RGBA_DXT1:
+      bytes = 2;
+      break;
+   case MESA_FORMAT_RGBA_DXT3:
+   case MESA_FORMAT_RGBA_DXT5:
+      bytes = 4;
+      break;	/* explicit: do not rely on falling through into default */
+   default:
+      break;
+   }
+   return bytes;
+}
+
+/**
+ * Compute the row stride and byte size of one miptree level and store the
+ * offset of the image selected by \p face and \p level (an index into
+ * mt->levels[] whose width, height and depth are already filled in).
+ * \param curOffset running byte offset: rounded up to a multiple of 32 to
+ * place this image, then advanced past it by the image size.
+ */
+static void compute_tex_image_offset(radeonContextPtr rmesa, radeon_mipmap_tree *mt,
+	GLuint face, GLuint level, GLuint* curOffset)
+{
+	radeon_mipmap_level *lvl = &mt->levels[level];
+	uint32_t row_align; /* alignment minus one, used as a mask (presumably a power of two - confirm) */
+
+	/* Find row stride and image size in bytes; the first matching case wins */
+	if (mt->compressed) {
+		/* TODO: Is this correct? Need test cases for compressed textures! */
+		row_align = rmesa->texture_compressed_row_align - 1;
+		lvl->rowstride = (lvl->width * mt->bpp + row_align) & ~row_align;
+		lvl->size = radeon_compressed_texture_size(mt->radeon->glCtx,
+							   lvl->width, lvl->height, lvl->depth, mt->compressed);
+	} else if (mt->target == GL_TEXTURE_RECTANGLE_NV) {
+		row_align = rmesa->texture_rect_row_align - 1;
+		lvl->rowstride = (lvl->width * mt->bpp + row_align) & ~row_align;
+		lvl->size = lvl->rowstride * lvl->height; /* depth is not factored in here */
+	} else if (mt->tilebits & RADEON_TXO_MICRO_TILE) {
+		/* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
+		 * though the actual offset may be different (if texture is less than
+		 * 32 bytes width) to the untiled case */
+		lvl->rowstride = (lvl->width * mt->bpp * 2 + 31) & ~31;
+		lvl->size = lvl->rowstride * ((lvl->height + 1) / 2) * lvl->depth;
+	} else {
+		row_align = rmesa->texture_row_align - 1;
+		lvl->rowstride = (lvl->width * mt->bpp + row_align) & ~row_align;
+		lvl->size = lvl->rowstride * lvl->height * lvl->depth;
+	}
+	assert(lvl->size > 0);
+
+	/* All images are aligned to a 32-byte offset */
+	*curOffset = (*curOffset + 0x1f) & ~0x1f;
+	lvl->faces[face].offset = *curOffset;
+	*curOffset += lvl->size;
+
+	if (RADEON_DEBUG & RADEON_TEXTURE)
+	  fprintf(stderr,
+		  "level %d, face %d: rs:%d %dx%d at %d\n",
+		  level, face, lvl->rowstride, lvl->width, lvl->height, lvl->faces[face].offset);
+}
+
+static GLuint minify(GLuint size, GLuint levels)
+{
+	/* Halve `size` `levels` times, never going below one. */
+	const GLuint shrunk = size >> levels;
+
+	return shrunk ? shrunk : 1;
+}
+
+
+/* Lay out the miptree face-major: every level of face 0, then face 1, ... */
+static void calculate_miptree_layout_r100(radeonContextPtr rmesa, radeon_mipmap_tree *mt)
+{
+	const GLuint level_count = mt->lastLevel - mt->firstLevel + 1;
+	GLuint offset = 0;
+	GLuint face, lvl;
+
+	assert(level_count <= rmesa->glCtx->Const.MaxTextureLevels);
+
+	for (face = 0; face < mt->faces; face++) {
+		for (lvl = 0; lvl < level_count; lvl++) {
+			radeon_mipmap_level *dst = &mt->levels[lvl];
+
+			/* Dimensions are rewritten for each face with the same values. */
+			dst->width = minify(mt->width0, lvl);
+			dst->height = minify(mt->height0, lvl);
+			dst->depth = minify(mt->depth0, lvl);
+			compute_tex_image_offset(rmesa, mt, face, lvl, &offset);
+		}
+	}
+
+	/* Total size: end offset rounded up using RADEON_OFFSET_MASK. */
+	mt->totalsize = (offset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
+}
+
+/* Lay out the miptree level-major: all faces of level 0, then level 1, ... */
+static void calculate_miptree_layout_r300(radeonContextPtr rmesa, radeon_mipmap_tree *mt)
+{
+	const GLuint level_count = mt->lastLevel - mt->firstLevel + 1;
+	GLuint offset = 0;
+	GLuint lvl;
+
+	assert(level_count <= rmesa->glCtx->Const.MaxTextureLevels);
+
+	for (lvl = 0; lvl < level_count; lvl++) {
+		radeon_mipmap_level *dst = &mt->levels[lvl];
+		GLuint face;
+
+		dst->width = minify(mt->width0, lvl);
+		dst->height = minify(mt->height0, lvl);
+		dst->depth = minify(mt->depth0, lvl);
+
+		for (face = 0; face < mt->faces; face++)
+			compute_tex_image_offset(rmesa, mt, face, lvl, &offset);
+	}
+
+	/* Total size: end offset rounded up using RADEON_OFFSET_MASK. */
+	mt->totalsize = (offset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
+}
+
+/**
+ * Create a new mipmap tree, calculate its layout and allocate memory.
+ * Returns NULL if the tree structure itself cannot be allocated.
+ */
+radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa, radeonTexObj *t,
+		GLenum target, GLenum internal_format, GLuint firstLevel, GLuint lastLevel,
+		GLuint width0, GLuint height0, GLuint depth0,
+		GLuint bpp, GLuint tilebits, GLuint compressed)
+{
+	radeon_mipmap_tree *mt = CALLOC_STRUCT(_radeon_mipmap_tree);
+	if (!mt)
+		return NULL;
+
+	mt->radeon = rmesa;
+	mt->internal_format = internal_format;
+	mt->refcount = 1;
+	mt->t = t;
+	mt->target = target;
+	mt->faces = (target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
+	mt->firstLevel = firstLevel;
+	mt->lastLevel = lastLevel;
+	mt->width0 = width0;
+	mt->height0 = height0;
+	mt->depth0 = depth0;
+	mt->bpp = compressed ? radeon_compressed_num_bytes(compressed) : bpp;
+	mt->tilebits = tilebits;
+	mt->compressed = compressed;
+
+	if (rmesa->radeonScreen->chip_family >= CHIP_FAMILY_R300)
+		calculate_miptree_layout_r300(rmesa, mt);
+	else
+		calculate_miptree_layout_r100(rmesa, mt);
+
+	mt->bo = radeon_bo_open(rmesa->radeonScreen->bom, 0, mt->totalsize, 1024,
+				RADEON_GEM_DOMAIN_VRAM, 0);
+	return mt;
+}
+
+void radeon_miptree_reference(radeon_mipmap_tree *mt)
+{
+	mt->refcount++; /* one more holder of mt; mt must not be NULL here */
+	assert(mt->refcount > 0); /* refcount is unsigned, so this only trips on wrap-around to 0 */
+}
+
+/* Drop one reference (NULL is ignored); the last one releases bo and tree. */
+void radeon_miptree_unreference(radeon_mipmap_tree *mt)
+{
+	if (mt == NULL)
+		return;
+
+	assert(mt->refcount > 0);
+	if (--mt->refcount != 0)
+		return;
+	radeon_bo_unref(mt->bo);
+	free(mt);
+}
+
+
+/**
+ * Calculate first and last mip levels for the given texture object,
+ * where the dimensions are taken from the given texture image at
+ * the given level.
+ *
+ * Note: level is the OpenGL level number, which is not necessarily the same
+ * as the first level that is actually present.
+ *
+ * The base level image of the given texture face must be non-null,
+ * or this will fail.  Unhandled targets yield firstLevel = lastLevel = 0.
+ */
+static void calculate_first_last_level(struct gl_texture_object *tObj,
+				       GLuint *pfirstLevel, GLuint *plastLevel,
+				       GLuint face, GLuint level)
+{
+	const struct gl_texture_image * const baseImage =
+		tObj->Image[face][level];
+	/* These must be signed values.  MinLod and MaxLod can be negative numbers,
+	* and having firstLevel and lastLevel as signed prevents the need for
+	* extra sign checks.
+	*/
+	int   firstLevel = 0;
+	int   lastLevel = 0;
+
+	assert(baseImage);
+
+	/* Yes, this looks overly complicated, but it's all needed.
+	*/
+	switch (tObj->Target) {
+	case GL_TEXTURE_1D:
+	case GL_TEXTURE_2D:
+	case GL_TEXTURE_3D:
+	case GL_TEXTURE_CUBE_MAP:
+		if (tObj->MinFilter == GL_NEAREST || tObj->MinFilter == GL_LINEAR) {
+			/* GL_NEAREST and GL_LINEAR only care about GL_TEXTURE_BASE_LEVEL.
+			*/
+			firstLevel = lastLevel = tObj->BaseLevel;
+		} else {
+			firstLevel = tObj->BaseLevel + (GLint)(tObj->MinLod + 0.5);
+			firstLevel = MAX2(firstLevel, tObj->BaseLevel);
+			firstLevel = MIN2(firstLevel, level + baseImage->MaxLog2);
+			lastLevel = tObj->BaseLevel + (GLint)(tObj->MaxLod + 0.5);
+			lastLevel = MAX2(lastLevel, tObj->BaseLevel);
+			lastLevel = MIN2(lastLevel, level + baseImage->MaxLog2);
+			lastLevel = MIN2(lastLevel, tObj->MaxLevel);
+			lastLevel = MAX2(firstLevel, lastLevel); /* need at least one level */
+		}
+		break;
+	case GL_TEXTURE_RECTANGLE_NV:
+	case GL_TEXTURE_4D_SGIS:
+		firstLevel = lastLevel = 0;
+		break;
+	default:
+		/* Unknown target: keep the zero defaults so the outputs are defined. */
+		break;
+	}
+
+	/* save these values */
+	*pfirstLevel = firstLevel;
+	*plastLevel = lastLevel;
+}
+
+
+/**
+ * Checks whether the given miptree can hold the given texture image at the
+ * given face and level.
+ */
+GLboolean radeon_miptree_matches_image(radeon_mipmap_tree *mt,
+		struct gl_texture_image *texImage, GLuint face, GLuint level)
+{
+	radeon_mipmap_level *lvl;
+	/* Same encoding as mt->compressed: MESA_FORMAT_xxx, or 0 if uncompressed. */
+	GLuint compressed = texImage->IsCompressed ? texImage->TexFormat->MesaFormat : 0;
+
+	if (face >= mt->faces || level < mt->firstLevel || level > mt->lastLevel)
+		return GL_FALSE;
+
+	if (texImage->InternalFormat != mt->internal_format ||
+	    compressed != mt->compressed)
+		return GL_FALSE;
+
+	if (!compressed && texImage->TexFormat->TexelBytes != mt->bpp)
+		return GL_FALSE;
+
+	lvl = &mt->levels[level - mt->firstLevel];
+	if (lvl->width != texImage->Width ||
+	    lvl->height != texImage->Height ||
+	    lvl->depth != texImage->Depth)
+		return GL_FALSE;
+
+	return GL_TRUE;
+}
+
+
+/**
+ * Checks whether the given miptree has the right format to store the given texture object.
+ */
+GLboolean radeon_miptree_matches_texture(radeon_mipmap_tree *mt, struct gl_texture_object *texObj)
+{
+	struct gl_texture_image *firstImage;
+	GLuint compressed;
+	GLuint firstLevel = 0, lastLevel = 0; /* defined even if the callee writes nothing */
+
+	calculate_first_last_level(texObj, &firstLevel, &lastLevel, 0, texObj->BaseLevel);
+
+	/* No image at the first level means there is nothing to match against. */
+	firstImage = texObj->Image[0][firstLevel];
+	if (!firstImage)
+		return GL_FALSE;
+	compressed = firstImage->IsCompressed ? firstImage->TexFormat->MesaFormat : 0;
+
+	return (mt->firstLevel == firstLevel &&
+	        mt->lastLevel == lastLevel &&
+	        mt->width0 == firstImage->Width &&
+	        mt->height0 == firstImage->Height &&
+	        mt->depth0 == firstImage->Depth &&
+	        mt->compressed == compressed &&
+	        (!mt->compressed ? (mt->bpp == firstImage->TexFormat->TexelBytes) : 1));
+}
+
+
+/**
+ * Try to allocate a mipmap tree for texture \p t, sized from \p image.
+ * Nothing is allocated unless \p level is the first level computed for the
+ * texture and \p face is valid for its target.
+ */
+void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t,
+		radeon_texture_image *image, GLuint face, GLuint level)
+{
+	const GLuint face_count = (t->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
+	const GLuint compressed =
+		image->base.IsCompressed ? image->base.TexFormat->MesaFormat : 0;
+	GLuint firstLevel, lastLevel;
+
+	assert(!t->mt);
+
+	calculate_first_last_level(&t->base, &firstLevel, &lastLevel, face, level);
+
+	/* Only the image at the tree's first level triggers allocation. */
+	if (level == firstLevel && face < face_count) {
+		t->mt = radeon_miptree_create(rmesa, t, t->base.Target,
+			image->base.InternalFormat,
+			firstLevel, lastLevel,
+			image->base.Width, image->base.Height, image->base.Depth,
+			image->base.TexFormat->TexelBytes, t->tile_bits, compressed);
+	}
+}
+
+/* Per-face offsets are stored in levels[].faces[].offset, but Mesa doesn't
+ * know anything about this and expects each cube face to be treated as a
+ * separate image.
+ *
+ * These functions present that view to mesa:
+ */
+void
+radeon_miptree_depth_offsets(radeon_mipmap_tree *mt, GLuint level, GLuint *offsets)
+{
+     if (mt->target != GL_TEXTURE_3D || mt->faces == 1) /* NOTE(review): else needs a 3D target with faces != 1 - confirm it is reachable */
+        offsets[0] = 0;
+     else {
+	int i;
+	for (i = 0; i < 6; i++) /* writes six entries: offsets must have room for 6 */
+		offsets[i] = mt->levels[level].faces[i].offset;
+     }
+}
+
+/* Byte offset of an image in the tree; only cube maps honour `face`. */
+GLuint
+radeon_miptree_image_offset(radeon_mipmap_tree *mt, GLuint face, GLuint level)
+{
+   const GLuint slot = (mt->target == GL_TEXTURE_CUBE_MAP_ARB) ? face : 0;
+
+   /* Every other target reads its offset from face slot 0. */
+   return mt->levels[level].faces[slot].offset;
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
new file mode 100644
index 0000000000..db28252da3
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RADEON_MIPMAP_TREE_H_
+#define __RADEON_MIPMAP_TREE_H_
+
+#include "radeon_common.h"
+
+typedef struct _radeon_mipmap_tree radeon_mipmap_tree;
+typedef struct _radeon_mipmap_level radeon_mipmap_level;
+typedef struct _radeon_mipmap_image radeon_mipmap_image;
+
+struct _radeon_mipmap_image {
+	GLuint offset; /**< Offset of this image from the start of mipmap tree buffer, in bytes */
+};
+
+struct _radeon_mipmap_level {
+	GLuint width;
+	GLuint height;
+	GLuint depth;
+	GLuint size; /**< Size of each image, in bytes */
+	GLuint rowstride; /**< in bytes */
+	radeon_mipmap_image faces[6]; /**< one entry per face */
+};
+
+/* store the max possible in the miptree */
+#define RADEON_MIPTREE_MAX_TEXTURE_LEVELS 13
+
+/**
+ * A mipmap tree contains texture images in the layout that the hardware
+ * expects.
+ *
+ * The meta-data of mipmap trees is immutable, i.e. you cannot change the
+ * layout on-the-fly; however, the texture contents (i.e. texels) can be
+ * changed.
+ */
+struct _radeon_mipmap_tree {
+	radeonContextPtr radeon; /**< context given to radeon_miptree_create() */
+	radeonTexObj *t; /**< texture object given to radeon_miptree_create() */
+	struct radeon_bo *bo; /**< buffer object opened with totalsize bytes */
+	GLuint refcount; /**< starts at 1; the tree is freed when it drops to 0 */
+
+	GLuint totalsize; /**< total size of the miptree, in bytes */
+
+	GLenum target; /**< GL_TEXTURE_xxx */
+	GLenum internal_format;
+	GLuint faces; /**< # of faces: 6 for cubemaps, 1 otherwise */
+	GLuint firstLevel; /**< First mip level stored in this mipmap tree */
+	GLuint lastLevel; /**< Last mip level stored in this mipmap tree */
+
+	GLuint width0; /**< Width of firstLevel image */
+	GLuint height0; /**< Height of firstLevel image */
+	GLuint depth0; /**< Depth of firstLevel image */
+
+	GLuint bpp; /**< Bytes per texel */
+	GLuint tilebits; /**< RADEON_TXO_xxx_TILE */
+	GLuint compressed; /**< MESA_FORMAT_xxx indicating a compressed format, or 0 if uncompressed */
+
+	radeon_mipmap_level levels[RADEON_MIPTREE_MAX_TEXTURE_LEVELS]; /**< filled from index 0 by the layout code */
+};
+
+radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa, radeonTexObj *t,
+		GLenum target, GLenum internal_format, GLuint firstLevel, GLuint lastLevel,
+		GLuint width0, GLuint height0, GLuint depth0,
+		GLuint bpp, GLuint tilebits, GLuint compressed);
+void radeon_miptree_reference(radeon_mipmap_tree *mt);
+void radeon_miptree_unreference(radeon_mipmap_tree *mt);
+
+GLboolean radeon_miptree_matches_image(radeon_mipmap_tree *mt,
+		struct gl_texture_image *texImage, GLuint face, GLuint level);
+GLboolean radeon_miptree_matches_texture(radeon_mipmap_tree *mt, struct gl_texture_object *texObj);
+void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t,
+			      radeon_texture_image *texImage, GLuint face, GLuint level);
+GLuint radeon_miptree_image_offset(radeon_mipmap_tree *mt,
+				   GLuint face, GLuint level);
+void radeon_miptree_depth_offsets(radeon_mipmap_tree *mt, GLuint level, GLuint *offsets);
+#endif /* __RADEON_MIPMAP_TREE_H_ */
diff --git a/src/mesa/drivers/dri/radeon/radeon_queryobj.c b/src/mesa/drivers/dri/radeon/radeon_queryobj.c
new file mode 100644
index 0000000000..b79d864ba2
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_queryobj.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright © 2008-2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Maciej Cencora <m.cencora@gmail.com>
+ *
+ */
+#include "radeon_common.h"
+#include "radeon_queryobj.h"
+#include "radeon_debug.h"
+
+#include "main/imports.h"
+#include "main/simple_list.h"
+
+static int radeonQueryIsFlushed(GLcontext *ctx, struct gl_query_object *q)
+{
+	/* Yields 0 while q is still linked on the not_flushed list, 1 otherwise. */
+	struct radeon_query_object *const wanted = (struct radeon_query_object *)q;
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	struct radeon_query_object *iter;
+
+	foreach(iter, &rmesa->query.not_flushed_head)
+		if (iter == wanted)
+			return 0;
+
+	return 1;
+}
+
+/* Sum the uint32_t values in the first curr_offset bytes of query->bo into Base.Result. */
+static void radeonQueryGetResult(GLcontext *ctx, struct gl_query_object *q)
+{
+	struct radeon_query_object *query = (struct radeon_query_object *)q;
+	uint32_t *result;
+	int i;
+
+	radeon_print(RADEON_STATE, RADEON_VERBOSE,
+			"%s: query id %d, result %d\n",
+			__FUNCTION__, query->Base.Id, (int) query->Base.Result);
+	/* A failed map leaves nothing valid to read via bo->ptr: keep the previous result. */
+	if (radeon_bo_map(query->bo, GL_FALSE) != 0)
+		return;
+	result = query->bo->ptr;
+
+	query->Base.Result = 0;
+	for (i = 0; i < query->curr_offset/sizeof(uint32_t); ++i) {
+		query->Base.Result += result[i];
+		radeon_print(RADEON_STATE, RADEON_TRACE, "result[%d] = %d\n", i, result[i]);
+	}
+	radeon_bo_unmap(query->bo);
+}
+
+/* Create the driver query object for id; returns NULL when the allocation fails. */
+static struct gl_query_object * radeonNewQueryObject(GLcontext *ctx, GLuint id)
+{
+	struct radeon_query_object *query = _mesa_calloc(sizeof(struct radeon_query_object));
+
+	if (!query)
+		return NULL;	/* out of memory: never write through a failed allocation */
+
+	query->Base.Id = id;
+	query->Base.Result = 0;
+	query->Base.Active = GL_FALSE;
+	query->Base.Ready = GL_TRUE;
+	radeon_print(RADEON_STATE, RADEON_VERBOSE,"%s: query id %d\n", __FUNCTION__, query->Base.Id);
+	return &query->Base;
+}
+
+/* Drop the reference on the query's buffer object, if it has one, then free the query. */
+static void radeonDeleteQuery(GLcontext *ctx, struct gl_query_object *q)
+{
+	struct radeon_query_object *rq = (struct radeon_query_object *)q;
+
+	radeon_print(RADEON_STATE, RADEON_NORMAL, "%s: query id %d\n", __FUNCTION__, q->Id);
+
+	if (rq->bo != NULL) {
+		radeon_bo_unref(rq->bo);
+	}
+	_mesa_free(rq);
+}
+
+/* Flush on behalf of q when it is still unflushed, read its result and mark it ready. */
+static void radeonWaitQuery(GLcontext *ctx, struct gl_query_object *q)
+{
+	struct radeon_query_object *rq = (struct radeon_query_object *)q;
+	const int flushed = radeonQueryIsFlushed(ctx, q);
+
+	/* The cmdbuf holding this query's packets has not been flushed yet: do it now. */
+	if (flushed == 0)
+		ctx->Driver.Flush(ctx);
+	radeon_print(RADEON_STATE, RADEON_VERBOSE, "%s: query id %d, bo %p, offset %d\n",
+		     __FUNCTION__, q->Id, rq->bo, rq->curr_offset);
+	radeonQueryGetResult(ctx, q);
+	rq->Base.Ready = GL_TRUE;
+}
+
+
+/* Make q the current query: open its buffer object on first use and rewind its offset. */
+static void radeonBeginQuery(GLcontext *ctx, struct gl_query_object *q)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	struct radeon_query_object *rq = (struct radeon_query_object *)q;
+
+	radeon_print(RADEON_STATE, RADEON_NORMAL, "%s: query id %d\n", __FUNCTION__, q->Id);
+	assert(rmesa->query.current == NULL);
+
+	if (rmesa->dma.flush != NULL)
+		rmesa->dma.flush(rmesa->glCtx);
+
+	if (rq->bo == NULL) {
+		rq->bo = radeon_bo_open(rmesa->radeonScreen->bom, 0, RADEON_QUERY_PAGE_SIZE,
+					RADEON_QUERY_PAGE_SIZE, RADEON_GEM_DOMAIN_GTT, 0);
+	}
+	rq->curr_offset = 0;
+	rmesa->query.current = rq;
+
+	/* Flag the query atom and the hw state dirty, then link q on the not_flushed list. */
+	rmesa->query.queryobj.dirty = GL_TRUE;
+	rmesa->hw.is_dirty = GL_TRUE;
+	insert_at_tail(&rmesa->query.not_flushed_head, rq);
+}
+
+/* Call the emit_query_finish hook for the current query, if its begin was emitted. */
+void radeonEmitQueryEnd(GLcontext *ctx)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	struct radeon_query_object *cur = rmesa->query.current;
+
+	/* Nothing to do without a current query, or before its begin has been emitted. */
+	if (cur == NULL || cur->emitted_begin == GL_FALSE)
+		return;
+
+	radeon_print(RADEON_STATE, RADEON_NORMAL,
+		     "%s: query id %d, bo %p, offset %d\n",
+		     __FUNCTION__, cur->Base.Id, cur->bo, cur->curr_offset);
+
+	radeon_cs_space_check_with_bo(rmesa->cmdbuf.cs, cur->bo,
+				      0, RADEON_GEM_DOMAIN_GTT);
+
+	rmesa->vtbl.emit_query_finish(rmesa);
+}
+
+/* Emit the end of the current query and clear the context's current-query pointer. */
+static void radeonEndQuery(GLcontext *ctx, struct gl_query_object *q)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+	radeon_print(RADEON_STATE, RADEON_NORMAL,
+		     "%s: query id %d\n", __FUNCTION__, q->Id);
+	if (rmesa->dma.flush != NULL)
+		rmesa->dma.flush(rmesa->glCtx);
+	radeonEmitQueryEnd(ctx);
+	rmesa->query.current = NULL;
+}
+
+/* Poll q: on a kernel_mm screen read the result only if radeon_bo_is_busy() returns 0. */
+static void radeonCheckQuery(GLcontext *ctx, struct gl_query_object *q)
+{
+#ifdef DRM_RADEON_GEM_BUSY
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	struct radeon_query_object *rq = (struct radeon_query_object *)q;
+	uint32_t domain;
+#endif
+
+	radeon_print(RADEON_STATE, RADEON_TRACE, "%s: query id %d\n", __FUNCTION__, q->Id);
+
+#ifdef DRM_RADEON_GEM_BUSY
+	if (rmesa->radeonScreen->kernel_mm) {
+		/* Need to perform a flush, as per ARB_occlusion_query spec */
+		if (!radeonQueryIsFlushed(ctx, q))
+			ctx->Driver.Flush(ctx);
+
+		/* Ready stays untouched while the buffer object is reported busy. */
+		if (radeon_bo_is_busy(rq->bo, &domain) == 0) {
+			radeonQueryGetResult(ctx, q);
+			rq->Base.Ready = GL_TRUE;
+		}
+		return;
+	}
+#endif
+	radeonWaitQuery(ctx, q);
+}
+
+void radeonInitQueryObjFunctions(struct dd_function_table *functions)
+{
+	functions->WaitQuery = radeonWaitQuery;	/* result retrieval hooks */
+	functions->CheckQuery = radeonCheckQuery;
+	functions->EndQuery = radeonEndQuery;	/* query bracketing hooks */
+	functions->BeginQuery = radeonBeginQuery;
+	functions->DeleteQuery = radeonDeleteQuery;	/* object lifetime hooks */
+	functions->NewQueryObject = radeonNewQueryObject;
+}
+
+/* Atom check hook: cmd_size while the current query's begin is not yet emitted, else 0. */
+int radeon_check_query_active(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	struct radeon_query_object *cur = rmesa->query.current;
+	if (cur != NULL && !cur->emitted_begin)
+		return atom->cmd_size;
+	return 0;
+}
+
+void radeon_emit_queryobj(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	BATCH_LOCALS(radeon);
+	int dwords;
+	/* Size of the emission is whatever the atom's own check hook reports. */
+	dwords = (*atom->check) (ctx, atom);
+	/* Presumably writes `dwords` entries of atom->cmd into the batch - confirm against the macros. */
+	BEGIN_BATCH_NO_AUTOSTATE(dwords);
+	OUT_BATCH_TABLE(atom->cmd, dwords);
+	END_BATCH();
+	/* NOTE(review): query.current is dereferenced unconditionally; assumes a query is active here - confirm. */
+	radeon->query.current->emitted_begin = GL_TRUE;
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_queryobj.h b/src/mesa/drivers/dri/radeon/radeon_queryobj.h
new file mode 100644
index 0000000000..19374dc76b
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_queryobj.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright © 2008 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Maciej Cencora <m.cencora@gmail.com>
+ *
+ */
+
+#include "main/imports.h"
+#include "main/simple_list.h"
+#include "radeon_common_context.h"
+
+extern void radeonEmitQueryBegin(GLcontext *ctx);
+extern void radeonEmitQueryEnd(GLcontext *ctx);
+
+extern void radeonInitQueryObjFunctions(struct dd_function_table *functions);
+
+#define RADEON_QUERY_PAGE_SIZE 4096
+
+int radeon_check_query_active(GLcontext *ctx, struct radeon_state_atom *atom);
+void radeon_emit_queryobj(GLcontext *ctx, struct radeon_state_atom *atom);
+
+/* Fill in the "queryobj" state atom with SZ command dwords and append it to hw.atomlist. */
+static inline void radeon_init_query_stateobj(radeonContextPtr radeon, int SZ)
+{
+	struct radeon_state_atom *atom = &radeon->query.queryobj;
+	atom->name = "queryobj";
+	atom->idx = 0;
+	atom->dirty = GL_FALSE;
+	atom->cmd_size = (SZ);
+	atom->cmd = (uint32_t*)CALLOC((SZ) * sizeof(uint32_t));
+	atom->check = radeon_check_query_active;
+	atom->emit = radeon_emit_queryobj;
+	radeon->hw.max_state_size += (SZ);
+	insert_at_tail(&radeon->hw.atomlist, atom);
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_sanity.c b/src/mesa/drivers/dri/radeon/radeon_sanity.c
index 6613757fce..1ab570f507 100644
--- a/src/mesa/drivers/dri/radeon/radeon_sanity.c
+++ b/src/mesa/drivers/dri/radeon/radeon_sanity.c
@@ -44,11 +44,11 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define MORE_VERBOSE 1
 
 #if MORE_VERBOSE
-#define VERBOSE (RADEON_DEBUG & DEBUG_VERBOSE)
+#define VERBOSE (RADEON_DEBUG & RADEON_VERBOSE)
 #define NORMAL  (1)
 #else
 #define VERBOSE 0
-#define NORMAL  (RADEON_DEBUG & DEBUG_VERBOSE)
+#define NORMAL  (RADEON_DEBUG & RADEON_VERBOSE)
 #endif
 
 
@@ -973,7 +973,7 @@ static int radeon_emit_packet3_cliprect( drm_radeon_cmd_buffer_t *cmdbuf )
 }
 
 
-int radeonSanityCmdBuffer( radeonContextPtr rmesa,
+int radeonSanityCmdBuffer( r100ContextPtr rmesa,
 			   int nbox,
 			   drm_clip_rect_t *boxes )
 {
diff --git a/src/mesa/drivers/dri/radeon/radeon_sanity.h b/src/mesa/drivers/dri/radeon/radeon_sanity.h
index 1ec06bc586..f30eb1c4f1 100644
--- a/src/mesa/drivers/dri/radeon/radeon_sanity.h
+++ b/src/mesa/drivers/dri/radeon/radeon_sanity.h
@@ -1,7 +1,7 @@
 #ifndef RADEON_SANITY_H
 #define RADEON_SANITY_H
 
-extern int radeonSanityCmdBuffer( radeonContextPtr rmesa,
+extern int radeonSanityCmdBuffer( r100ContextPtr rmesa,
 				  int nbox,
 				  drm_clip_rect_t *boxes );
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index 5f32dd575e..5ffb55db5e 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -35,6 +35,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * \author  Gareth Hughes <gareth@valinux.com>
  */
 
+#include <errno.h>
 #include "main/glheader.h"
 #include "main/imports.h"
 #include "main/mtypes.h"
@@ -45,32 +46,42 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_chipset.h"
 #include "radeon_macros.h"
 #include "radeon_screen.h"
+#include "radeon_common.h"
+#include "radeon_span.h"
 #if !RADEON_COMMON
 #include "radeon_context.h"
-#include "radeon_span.h"
 #include "radeon_tex.h"
 #elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
 #include "r200_context.h"
 #include "r200_ioctl.h"
-#include "r200_span.h"
 #include "r200_tex.h"
 #elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
 #include "r300_context.h"
-#include "r300_fragprog.h"
 #include "r300_tex.h"
-#include "radeon_span.h"
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+#include "r600_context.h"
+#include "r700_driconf.h" /* +r6/r7 */
+#include "r600_tex.h"     /* +r6/r7 */
 #endif
 
 #include "utils.h"
 #include "vblank.h"
 #include "drirenderbuffer.h"
 
+#include "radeon_bocs_wrapper.h"
+
 #include "GL/internal/dri_interface.h"
 
 /* Radeon configuration
  */
 #include "xmlpool.h"
 
+#define DRI_CONF_COMMAND_BUFFER_SIZE(def,min,max) \
+DRI_CONF_OPT_BEGIN_V(command_buffer_size,int,def, # min ":" # max ) \
+        DRI_CONF_DESC(en,"Size of command buffer (in KB)") \
+        DRI_CONF_DESC(de,"Grösse des Befehlspuffers (in KB)") \
+DRI_CONF_OPT_END
+
 #if !RADEON_COMMON	/* R100 */
 PUBLIC const char __driConfigOptions[] =
 DRI_CONF_BEGIN
@@ -80,6 +91,7 @@ DRI_CONF_BEGIN
         DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_DEF_INTERVAL_0)
         DRI_CONF_MAX_TEXTURE_UNITS(3,2,3)
         DRI_CONF_HYPERZ(false)
+        DRI_CONF_COMMAND_BUFFER_SIZE(8, 8, 32)
     DRI_CONF_SECTION_END
     DRI_CONF_SECTION_QUALITY
         DRI_CONF_TEXTURE_DEPTH(DRI_CONF_TEXTURE_DEPTH_FB)
@@ -95,7 +107,7 @@ DRI_CONF_BEGIN
         DRI_CONF_NO_RAST(false)
     DRI_CONF_SECTION_END
 DRI_CONF_END;
-static const GLuint __driNConfigOptions = 14;
+static const GLuint __driNConfigOptions = 15;
 
 #elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
 
@@ -107,6 +119,7 @@ DRI_CONF_BEGIN
         DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_DEF_INTERVAL_0)
         DRI_CONF_MAX_TEXTURE_UNITS(6,2,6)
         DRI_CONF_HYPERZ(false)
+        DRI_CONF_COMMAND_BUFFER_SIZE(8, 8, 32)
     DRI_CONF_SECTION_END
     DRI_CONF_SECTION_QUALITY
         DRI_CONF_TEXTURE_DEPTH(DRI_CONF_TEXTURE_DEPTH_FB)
@@ -126,7 +139,7 @@ DRI_CONF_BEGIN
         DRI_CONF_NV_VERTEX_PROGRAM(false)
     DRI_CONF_SECTION_END
 DRI_CONF_END;
-static const GLuint __driNConfigOptions = 16;
+static const GLuint __driNConfigOptions = 17;
 
 extern const struct dri_extension blend_extensions[];
 extern const struct dri_extension ARB_vp_extension[];
@@ -134,7 +147,10 @@ extern const struct dri_extension NV_vp_extension[];
 extern const struct dri_extension ATI_fs_extension[];
 extern const struct dri_extension point_extensions[];
 
-#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+#elif RADEON_COMMON && (defined(RADEON_COMMON_FOR_R300) || defined(RADEON_COMMON_FOR_R600))
+
+#define DRI_CONF_FP_OPTIMIZATION_SPEED   0
+#define DRI_CONF_FP_OPTIMIZATION_QUALITY 1
 
 /* TODO: integrate these into xmlpool.h! */
 #define DRI_CONF_MAX_TEXTURE_IMAGE_UNITS(def,min,max) \
@@ -149,11 +165,7 @@ DRI_CONF_OPT_BEGIN_V(texture_coord_units,int,def, # min ":" # max ) \
         DRI_CONF_DESC(de,"Anzahl der Texturkoordinateneinheiten") \
 DRI_CONF_OPT_END
 
-#define DRI_CONF_COMMAND_BUFFER_SIZE(def,min,max) \
-DRI_CONF_OPT_BEGIN_V(command_buffer_size,int,def, # min ":" # max ) \
-        DRI_CONF_DESC(en,"Size of command buffer (in KB)") \
-        DRI_CONF_DESC(de,"Grösse des Befehlspuffers (in KB)") \
-DRI_CONF_OPT_END
+
 
 #define DRI_CONF_DISABLE_S3TC(def) \
 DRI_CONF_OPT_BEGIN(disable_s3tc,bool,def) \
@@ -206,47 +218,44 @@ DRI_CONF_BEGIN
 DRI_CONF_END;
 static const GLuint __driNConfigOptions = 17;
 
-#ifndef RADEON_DEBUG
-int RADEON_DEBUG = 0;
-
-static const struct dri_debug_control debug_control[] = {
-	{"fall", DEBUG_FALLBACKS},
-	{"tex", DEBUG_TEXTURE},
-	{"ioctl", DEBUG_IOCTL},
-	{"prim", DEBUG_PRIMS},
-	{"vert", DEBUG_VERTS},
-	{"state", DEBUG_STATE},
-	{"code", DEBUG_CODEGEN},
-	{"vfmt", DEBUG_VFMT},
-	{"vtxf", DEBUG_VFMT},
-	{"verb", DEBUG_VERBOSE},
-	{"dri", DEBUG_DRI},
-	{"dma", DEBUG_DMA},
-	{"san", DEBUG_SANITY},
-	{"sync", DEBUG_SYNC},
-	{"pix", DEBUG_PIXEL},
-	{"mem", DEBUG_MEMORY},
-	{"allmsg", ~DEBUG_SYNC}, /* avoid the term "sync" because the parser uses strstr */
-	{NULL, 0}
-};
-#endif /* RADEON_DEBUG */
+extern const struct dri_extension gl_20_extension[];
 
 #endif /* RADEON_COMMON && defined(RADEON_COMMON_FOR_R300) */
 
 extern const struct dri_extension card_extensions[];
+extern const struct dri_extension mm_extensions[];
 
 static int getSwapInfo( __DRIdrawablePrivate *dPriv, __DRIswapInfo * sInfo );
 
 static int
-radeonGetParam(int fd, int param, void *value)
+radeonGetParam(__DRIscreenPrivate *sPriv, int param, void *value)
 {
   int ret;
-  drm_radeon_getparam_t gp;
-
-  gp.param = param;
-  gp.value = value;
+  drm_radeon_getparam_t gp = { 0 };
+  struct drm_radeon_info info = { 0 };
+
+  if (sPriv->drm_version.major >= 2) {
+      info.value = (uint64_t)(uintptr_t)value;
+      switch (param) {
+      case RADEON_PARAM_DEVICE_ID:
+          info.request = RADEON_INFO_DEVICE_ID;
+          break;
+      case RADEON_PARAM_NUM_GB_PIPES:
+          info.request = RADEON_INFO_NUM_GB_PIPES;
+          break;
+      case RADEON_PARAM_NUM_Z_PIPES:
+          info.request = RADEON_INFO_NUM_Z_PIPES;
+          break;
+      default:
+          return -EINVAL;
+      }
+      ret = drmCommandWriteRead(sPriv->fd, DRM_RADEON_INFO, &info, sizeof(info));
+  } else {
+      gp.param = param;
+      gp.value = value;
 
-  ret = drmCommandWriteRead( fd, DRM_RADEON_GETPARAM, &gp, sizeof(gp));
+      ret = drmCommandWriteRead(sPriv->fd, DRM_RADEON_GETPARAM, &gp, sizeof(gp));
+  }
   return ret;
 }
 
@@ -259,8 +268,6 @@ radeonFillInModes( __DRIscreenPrivate *psp,
     __GLcontextModes *m;
     unsigned depth_buffer_factor;
     unsigned back_buffer_factor;
-    GLenum fb_format;
-    GLenum fb_type;
     int i;
 
     /* Right now GLX_SWAP_COPY_OML isn't supported, but it would be easy
@@ -274,7 +281,7 @@ radeonFillInModes( __DRIscreenPrivate *psp,
 
     uint8_t depth_bits_array[2];
     uint8_t stencil_bits_array[2];
-
+    uint8_t msaa_samples_array[1];
 
     depth_bits_array[0] = depth_bits;
     depth_bits_array[1] = depth_bits;
@@ -283,25 +290,35 @@ radeonFillInModes( __DRIscreenPrivate *psp,
      * with a stencil buffer.  It will be a sw fallback, but some apps won't
      * care about that.
      */
-    stencil_bits_array[0] = 0;
+    stencil_bits_array[0] = stencil_bits;
     stencil_bits_array[1] = (stencil_bits == 0) ? 8 : stencil_bits;
 
-    depth_buffer_factor = ((depth_bits != 0) || (stencil_bits != 0)) ? 2 : 1;
+    msaa_samples_array[0] = 0;
+
+    depth_buffer_factor = (stencil_bits == 0) ? 2 : 1;
     back_buffer_factor  = (have_back_buffer) ? 2 : 1;
 
-    if ( pixel_bits == 16 ) {
-        fb_format = GL_RGB;
-        fb_type = GL_UNSIGNED_SHORT_5_6_5;
-    }
-    else {
-        fb_format = GL_BGRA;
-        fb_type = GL_UNSIGNED_INT_8_8_8_8_REV;
-    }
+    if (pixel_bits == 16) {
+	__DRIconfig **configs_a8r8g8b8;
+	__DRIconfig **configs_r5g6b5;
+
+	configs_r5g6b5 = driCreateConfigs(GL_RGB, GL_UNSIGNED_SHORT_5_6_5,
+					  depth_bits_array, stencil_bits_array,
+					  depth_buffer_factor, back_buffer_modes,
+					  back_buffer_factor, msaa_samples_array,
+					  1);
+	configs_a8r8g8b8 = driCreateConfigs(GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV,
+					    depth_bits_array, stencil_bits_array,
+					    1, back_buffer_modes, 1,
+					    msaa_samples_array, 1);
+	configs = driConcatConfigs(configs_r5g6b5, configs_a8r8g8b8);
+   } else
+	configs = driCreateConfigs(GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV,
+				   depth_bits_array, stencil_bits_array,
+				   depth_buffer_factor,
+				   back_buffer_modes, back_buffer_factor,
+				   msaa_samples_array, 1);
 
-    configs = driCreateConfigs(fb_format, fb_type,
-			       depth_bits_array, stencil_bits_array,
-			       depth_buffer_factor,
-			       back_buffer_modes, back_buffer_factor);
     if (configs == NULL) {
 	fprintf( stderr, "[%s:%u] Error creating FBConfig!\n",
 		 __func__, __LINE__ );
@@ -325,6 +342,12 @@ static const __DRItexOffsetExtension radeonTexOffsetExtension = {
     { __DRI_TEX_OFFSET, __DRI_TEX_OFFSET_VERSION },
     radeonSetTexOffset,
 };
+
+static const __DRItexBufferExtension radeonTexBufferExtension = {
+    { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
+   radeonSetTexBuffer,
+   radeonSetTexBuffer2,
+};
 #endif
 
 #if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
@@ -339,6 +362,12 @@ static const __DRItexOffsetExtension r200texOffsetExtension = {
     { __DRI_TEX_OFFSET, __DRI_TEX_OFFSET_VERSION },
    r200SetTexOffset,
 };
+
+static const __DRItexBufferExtension r200TexBufferExtension = {
+    { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
+   r200SetTexBuffer,
+   r200SetTexBuffer2,
+};
 #endif
 
 #if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
@@ -346,137 +375,32 @@ static const __DRItexOffsetExtension r300texOffsetExtension = {
     { __DRI_TEX_OFFSET, __DRI_TEX_OFFSET_VERSION },
    r300SetTexOffset,
 };
-#endif
 
-/* Create the device specific screen private data struct.
- */
-static radeonScreenPtr
-radeonCreateScreen( __DRIscreenPrivate *sPriv )
-{
-   radeonScreenPtr screen;
-   RADEONDRIPtr dri_priv = (RADEONDRIPtr)sPriv->pDevPriv;
-   unsigned char *RADEONMMIO;
-   int i;
-   int ret;
-   uint32_t temp;
-
-   if (sPriv->devPrivSize != sizeof(RADEONDRIRec)) {
-      fprintf(stderr,"\nERROR!  sizeof(RADEONDRIRec) does not match passed size from device driver\n");
-      return GL_FALSE;
-   }
-
-   /* Allocate the private area */
-   screen = (radeonScreenPtr) CALLOC( sizeof(*screen) );
-   if ( !screen ) {
-      __driUtilMessage("%s: Could not allocate memory for screen structure",
-		       __FUNCTION__);
-      return NULL;
-   }
-
-#if DO_DEBUG && RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
-	RADEON_DEBUG = driParseDebugString(getenv("RADEON_DEBUG"), debug_control);
+static const __DRItexBufferExtension r300TexBufferExtension = {
+    { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
+   r300SetTexBuffer,
+   r300SetTexBuffer2,
+};
 #endif
 
-   /* parse information in __driConfigOptions */
-   driParseOptionInfo (&screen->optionCache,
-		       __driConfigOptions, __driNConfigOptions);
-
-   /* This is first since which regions we map depends on whether or
-    * not we are using a PCI card.
-    */
-   screen->card_type = (dri_priv->IsPCI ? RADEON_CARD_PCI : RADEON_CARD_AGP);
-   {
-      int ret;
-      ret = radeonGetParam( sPriv->fd, RADEON_PARAM_GART_BUFFER_OFFSET,
-			    &screen->gart_buffer_offset);
-
-      if (ret) {
-	 FREE( screen );
-	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_GART_BUFFER_OFFSET): %d\n", ret);
-	 return NULL;
-      }
-
-      ret = radeonGetParam( sPriv->fd, RADEON_PARAM_GART_BASE,
-			    &screen->gart_base);
-      if (ret) {
-	 FREE( screen );
-	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_GART_BASE): %d\n", ret);
-	 return NULL;
-      }
-
-      ret = radeonGetParam( sPriv->fd, RADEON_PARAM_IRQ_NR,
-			    &screen->irq);
-      if (ret) {
-	 FREE( screen );
-	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_IRQ_NR): %d\n", ret);
-	 return NULL;
-      }
-      screen->drmSupportsCubeMapsR200 = (sPriv->drm_version.minor >= 7);
-      screen->drmSupportsBlendColor = (sPriv->drm_version.minor >= 11);
-      screen->drmSupportsTriPerf = (sPriv->drm_version.minor >= 16);
-      screen->drmSupportsFragShader = (sPriv->drm_version.minor >= 18);
-      screen->drmSupportsPointSprites = (sPriv->drm_version.minor >= 13);
-      screen->drmSupportsCubeMapsR100 = (sPriv->drm_version.minor >= 15);
-      screen->drmSupportsVertexProgram = (sPriv->drm_version.minor >= 25);
-   }
-
-   screen->mmio.handle = dri_priv->registerHandle;
-   screen->mmio.size   = dri_priv->registerSize;
-   if ( drmMap( sPriv->fd,
-		screen->mmio.handle,
-		screen->mmio.size,
-		&screen->mmio.map ) ) {
-      FREE( screen );
-      __driUtilMessage("%s: drmMap failed\n", __FUNCTION__ );
-      return NULL;
-   }
-
-   RADEONMMIO = screen->mmio.map;
-
-   screen->status.handle = dri_priv->statusHandle;
-   screen->status.size   = dri_priv->statusSize;
-   if ( drmMap( sPriv->fd,
-		screen->status.handle,
-		screen->status.size,
-		&screen->status.map ) ) {
-      drmUnmap( screen->mmio.map, screen->mmio.size );
-      FREE( screen );
-      __driUtilMessage("%s: drmMap (2) failed\n", __FUNCTION__ );
-      return NULL;
-   }
-   screen->scratch = (__volatile__ uint32_t *)
-      ((GLubyte *)screen->status.map + RADEON_SCRATCH_REG_OFFSET);
-
-   screen->buffers = drmMapBufs( sPriv->fd );
-   if ( !screen->buffers ) {
-      drmUnmap( screen->status.map, screen->status.size );
-      drmUnmap( screen->mmio.map, screen->mmio.size );
-      FREE( screen );
-      __driUtilMessage("%s: drmMapBufs failed\n", __FUNCTION__ );
-      return NULL;
-   }
-
-   if ( dri_priv->gartTexHandle && dri_priv->gartTexMapSize ) {
-      screen->gartTextures.handle = dri_priv->gartTexHandle;
-      screen->gartTextures.size   = dri_priv->gartTexMapSize;
-      if ( drmMap( sPriv->fd,
-		   screen->gartTextures.handle,
-		   screen->gartTextures.size,
-		   (drmAddressPtr)&screen->gartTextures.map ) ) {
-	 drmUnmapBufs( screen->buffers );
-	 drmUnmap( screen->status.map, screen->status.size );
-	 drmUnmap( screen->mmio.map, screen->mmio.size );
-	 FREE( screen );
-	 __driUtilMessage("%s: drmMap failed for GART texture area\n", __FUNCTION__);
-	 return NULL;
-      }
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+static const __DRItexOffsetExtension r600texOffsetExtension = {
+    { __DRI_TEX_OFFSET, __DRI_TEX_OFFSET_VERSION },
+   r600SetTexOffset, /* +r6/r7 */
+};
 
-      screen->gart_texture_offset = dri_priv->gartTexOffset + screen->gart_base;
-   }
+static const __DRItexBufferExtension r600TexBufferExtension = {
+    { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
+   r600SetTexBuffer,  /* +r6/r7 */
+   r600SetTexBuffer2, /* +r6/r7 */
+};
+#endif
 
+static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
+{
+   screen->device_id = device_id;
    screen->chip_flags = 0;
-   /* XXX: add more chipsets */
-   switch ( dri_priv->deviceID ) {
+   switch ( device_id ) {
    case PCI_CHIP_RADEON_LY:
    case PCI_CHIP_RADEON_LZ:
    case PCI_CHIP_RADEON_QY:
@@ -551,11 +475,7 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
       screen->chip_family = CHIP_FAMILY_RS300;
       break;
 
-      /* 9500 with 1 pipe verified by: Reid Linnemann <lreid@cs.okstate.edu> */
    case PCI_CHIP_R300_AD:
-      screen->chip_family = CHIP_FAMILY_RV350;
-      screen->chip_flags = RADEON_CHIPSET_TCL;
-      break;
    case PCI_CHIP_R300_AE:
    case PCI_CHIP_R300_AF:
    case PCI_CHIP_R300_AG:
@@ -680,6 +600,12 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
       screen->chip_family = CHIP_FAMILY_RS400;
       break;
 
+   case PCI_CHIP_RS600_793F:
+   case PCI_CHIP_RS600_7941:
+   case PCI_CHIP_RS600_7942:
+      screen->chip_family = CHIP_FAMILY_RS600;
+      break;
+
    case PCI_CHIP_RS690_791E:
    case PCI_CHIP_RS690_791F:
       screen->chip_family = CHIP_FAMILY_RS690;
@@ -806,11 +732,325 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
       screen->chip_flags = RADEON_CHIPSET_TCL;
       break;
 
+   case PCI_CHIP_R600_9400:
+   case PCI_CHIP_R600_9401:
+   case PCI_CHIP_R600_9402:
+   case PCI_CHIP_R600_9403:
+   case PCI_CHIP_R600_9405:
+   case PCI_CHIP_R600_940A:
+   case PCI_CHIP_R600_940B:
+   case PCI_CHIP_R600_940F:
+      screen->chip_family = CHIP_FAMILY_R600;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV610_94C0:
+   case PCI_CHIP_RV610_94C1:
+   case PCI_CHIP_RV610_94C3:
+   case PCI_CHIP_RV610_94C4:
+   case PCI_CHIP_RV610_94C5:
+   case PCI_CHIP_RV610_94C6:
+   case PCI_CHIP_RV610_94C7:
+   case PCI_CHIP_RV610_94C8:
+   case PCI_CHIP_RV610_94C9:
+   case PCI_CHIP_RV610_94CB:
+   case PCI_CHIP_RV610_94CC:
+   case PCI_CHIP_RV610_94CD:
+      screen->chip_family = CHIP_FAMILY_RV610;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV630_9580:
+   case PCI_CHIP_RV630_9581:
+   case PCI_CHIP_RV630_9583:
+   case PCI_CHIP_RV630_9586:
+   case PCI_CHIP_RV630_9587:
+   case PCI_CHIP_RV630_9588:
+   case PCI_CHIP_RV630_9589:
+   case PCI_CHIP_RV630_958A:
+   case PCI_CHIP_RV630_958B:
+   case PCI_CHIP_RV630_958C:
+   case PCI_CHIP_RV630_958D:
+   case PCI_CHIP_RV630_958E:
+   case PCI_CHIP_RV630_958F:
+      screen->chip_family = CHIP_FAMILY_RV630;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV670_9500:
+   case PCI_CHIP_RV670_9501:
+   case PCI_CHIP_RV670_9504:
+   case PCI_CHIP_RV670_9505:
+   case PCI_CHIP_RV670_9506:
+   case PCI_CHIP_RV670_9507:
+   case PCI_CHIP_RV670_9508:
+   case PCI_CHIP_RV670_9509:
+   case PCI_CHIP_RV670_950F:
+   case PCI_CHIP_RV670_9511:
+   case PCI_CHIP_RV670_9515:
+   case PCI_CHIP_RV670_9517:
+   case PCI_CHIP_RV670_9519:
+      screen->chip_family = CHIP_FAMILY_RV670;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV620_95C0:
+   case PCI_CHIP_RV620_95C2:
+   case PCI_CHIP_RV620_95C4:
+   case PCI_CHIP_RV620_95C5:
+   case PCI_CHIP_RV620_95C6:
+   case PCI_CHIP_RV620_95C7:
+   case PCI_CHIP_RV620_95C9:
+   case PCI_CHIP_RV620_95CC:
+   case PCI_CHIP_RV620_95CD:
+   case PCI_CHIP_RV620_95CE:
+   case PCI_CHIP_RV620_95CF:
+      screen->chip_family = CHIP_FAMILY_RV620;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV635_9590:
+   case PCI_CHIP_RV635_9591:
+   case PCI_CHIP_RV635_9593:
+   case PCI_CHIP_RV635_9595:
+   case PCI_CHIP_RV635_9596:
+   case PCI_CHIP_RV635_9597:
+   case PCI_CHIP_RV635_9598:
+   case PCI_CHIP_RV635_9599:
+   case PCI_CHIP_RV635_959B:
+      screen->chip_family = CHIP_FAMILY_RV635;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RS780_9610:
+   case PCI_CHIP_RS780_9611:
+   case PCI_CHIP_RS780_9612:
+   case PCI_CHIP_RS780_9613:
+   case PCI_CHIP_RS780_9614:
+   case PCI_CHIP_RS780_9615:
+   case PCI_CHIP_RS780_9616:
+      screen->chip_family = CHIP_FAMILY_RS780;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+   case PCI_CHIP_RS880_9710:
+   case PCI_CHIP_RS880_9711:
+   case PCI_CHIP_RS880_9712:
+   case PCI_CHIP_RS880_9713:
+   case PCI_CHIP_RS880_9714:
+      screen->chip_family = CHIP_FAMILY_RS880;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV770_9440:
+   case PCI_CHIP_RV770_9441:
+   case PCI_CHIP_RV770_9442:
+   case PCI_CHIP_RV770_9443:
+   case PCI_CHIP_RV770_9444:
+   case PCI_CHIP_RV770_9446:
+   case PCI_CHIP_RV770_944A:
+   case PCI_CHIP_RV770_944B:
+   case PCI_CHIP_RV770_944C:
+   case PCI_CHIP_RV770_944E:
+   case PCI_CHIP_RV770_9450:
+   case PCI_CHIP_RV770_9452:
+   case PCI_CHIP_RV770_9456:
+   case PCI_CHIP_RV770_945A:
+   case PCI_CHIP_RV770_945B:
+   case PCI_CHIP_RV790_9460:
+   case PCI_CHIP_RV790_9462:
+   case PCI_CHIP_RV770_946A:
+   case PCI_CHIP_RV770_946B:
+   case PCI_CHIP_RV770_947A:
+   case PCI_CHIP_RV770_947B:
+      screen->chip_family = CHIP_FAMILY_RV770;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV730_9480:
+   case PCI_CHIP_RV730_9487:
+   case PCI_CHIP_RV730_9488:
+   case PCI_CHIP_RV730_9489:
+   case PCI_CHIP_RV730_948F:
+   case PCI_CHIP_RV730_9490:
+   case PCI_CHIP_RV730_9491:
+   case PCI_CHIP_RV730_9495:
+   case PCI_CHIP_RV730_9498:
+   case PCI_CHIP_RV730_949C:
+   case PCI_CHIP_RV730_949E:
+   case PCI_CHIP_RV730_949F:
+      screen->chip_family = CHIP_FAMILY_RV730;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV710_9540:
+   case PCI_CHIP_RV710_9541:
+   case PCI_CHIP_RV710_9542:
+   case PCI_CHIP_RV710_954E:
+   case PCI_CHIP_RV710_954F:
+   case PCI_CHIP_RV710_9552:
+   case PCI_CHIP_RV710_9553:
+   case PCI_CHIP_RV710_9555:
+   case PCI_CHIP_RV710_9557:
+      screen->chip_family = CHIP_FAMILY_RV710;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV740_94A0:
+   case PCI_CHIP_RV740_94A1:
+   case PCI_CHIP_RV740_94A3:
+   case PCI_CHIP_RV740_94B1:
+   case PCI_CHIP_RV740_94B3:
+   case PCI_CHIP_RV740_94B4:
+   case PCI_CHIP_RV740_94B5:
+   case PCI_CHIP_RV740_94B9:
+      screen->chip_family = CHIP_FAMILY_RV740;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
    default:
       fprintf(stderr, "unknown chip id 0x%x, can't guess.\n",
-	      dri_priv->deviceID);
+	      device_id);
+      return -1;
+   }
+
+   return 0;
+}
+
+
+/* Create the device specific screen private data struct.
+ */
+static radeonScreenPtr
+radeonCreateScreen( __DRIscreenPrivate *sPriv )
+{
+   radeonScreenPtr screen;
+   RADEONDRIPtr dri_priv = (RADEONDRIPtr)sPriv->pDevPriv;
+   unsigned char *RADEONMMIO = NULL;
+   int i;
+   int ret;
+   uint32_t temp = 0;
+
+   if (sPriv->devPrivSize != sizeof(RADEONDRIRec)) {
+      fprintf(stderr,"\nERROR!  sizeof(RADEONDRIRec) does not match passed size from device driver\n");
+      return GL_FALSE;
+   }
+
+   /* Allocate the private area */
+   screen = (radeonScreenPtr) CALLOC( sizeof(*screen) );
+   if ( !screen ) {
+      __driUtilMessage("%s: Could not allocate memory for screen structure",
+		       __FUNCTION__);
       return NULL;
    }
+
+   radeon_init_debug();
+
+   /* parse information in __driConfigOptions */
+   driParseOptionInfo (&screen->optionCache,
+		       __driConfigOptions, __driNConfigOptions);
+
+   /* This is first since which regions we map depends on whether or
+    * not we are using a PCI card.
+    */
+   screen->card_type = (dri_priv->IsPCI ? RADEON_CARD_PCI : RADEON_CARD_AGP);
+   {
+      int ret;
+
+      ret = radeonGetParam(sPriv, RADEON_PARAM_GART_BUFFER_OFFSET,
+			    &screen->gart_buffer_offset);
+
+      if (ret) {
+	 FREE( screen );
+	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_GART_BUFFER_OFFSET): %d\n", ret);
+	 return NULL;
+      }
+
+      ret = radeonGetParam(sPriv, RADEON_PARAM_GART_BASE,
+			    &screen->gart_base);
+      if (ret) {
+	 FREE( screen );
+	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_GART_BASE): %d\n", ret);
+	 return NULL;
+      }
+
+      ret = radeonGetParam(sPriv, RADEON_PARAM_IRQ_NR,
+			    &screen->irq);
+      if (ret) {
+	 FREE( screen );
+	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_IRQ_NR): %d\n", ret);
+	 return NULL;
+      }
+      screen->drmSupportsCubeMapsR200 = (sPriv->drm_version.minor >= 7);
+      screen->drmSupportsBlendColor = (sPriv->drm_version.minor >= 11);
+      screen->drmSupportsTriPerf = (sPriv->drm_version.minor >= 16);
+      screen->drmSupportsFragShader = (sPriv->drm_version.minor >= 18);
+      screen->drmSupportsPointSprites = (sPriv->drm_version.minor >= 13);
+      screen->drmSupportsCubeMapsR100 = (sPriv->drm_version.minor >= 15);
+      screen->drmSupportsVertexProgram = (sPriv->drm_version.minor >= 25);
+      screen->drmSupportsOcclusionQueries = (sPriv->drm_version.minor >= 30);
+   }
+
+   ret = radeon_set_screen_flags(screen, dri_priv->deviceID);
+   if (ret == -1)
+     return NULL;
+
+   screen->mmio.handle = dri_priv->registerHandle;
+   screen->mmio.size   = dri_priv->registerSize;
+   if ( drmMap( sPriv->fd,
+		screen->mmio.handle,
+		screen->mmio.size,
+		&screen->mmio.map ) ) {
+     FREE( screen );
+     __driUtilMessage("%s: drmMap failed\n", __FUNCTION__ );
+     return NULL;
+   }
+
+   RADEONMMIO = screen->mmio.map;
+
+   screen->status.handle = dri_priv->statusHandle;
+   screen->status.size   = dri_priv->statusSize;
+   if ( drmMap( sPriv->fd,
+		screen->status.handle,
+		screen->status.size,
+		&screen->status.map ) ) {
+     drmUnmap( screen->mmio.map, screen->mmio.size );
+     FREE( screen );
+     __driUtilMessage("%s: drmMap (2) failed\n", __FUNCTION__ );
+     return NULL;
+   }
+   if (screen->chip_family < CHIP_FAMILY_R600)
+	   screen->scratch = (__volatile__ uint32_t *)
+		   ((GLubyte *)screen->status.map + RADEON_SCRATCH_REG_OFFSET);
+   else
+	   screen->scratch = (__volatile__ uint32_t *)
+		   ((GLubyte *)screen->status.map + R600_SCRATCH_REG_OFFSET);
+
+   screen->buffers = drmMapBufs( sPriv->fd );
+   if ( !screen->buffers ) {
+     drmUnmap( screen->status.map, screen->status.size );
+     drmUnmap( screen->mmio.map, screen->mmio.size );
+     FREE( screen );
+     __driUtilMessage("%s: drmMapBufs failed\n", __FUNCTION__ );
+     return NULL;
+   }
+
+   if ( dri_priv->gartTexHandle && dri_priv->gartTexMapSize ) {
+     screen->gartTextures.handle = dri_priv->gartTexHandle;
+     screen->gartTextures.size   = dri_priv->gartTexMapSize;
+     if ( drmMap( sPriv->fd,
+		  screen->gartTextures.handle,
+		  screen->gartTextures.size,
+		  (drmAddressPtr)&screen->gartTextures.map ) ) {
+       drmUnmapBufs( screen->buffers );
+       drmUnmap( screen->status.map, screen->status.size );
+       drmUnmap( screen->mmio.map, screen->mmio.size );
+       FREE( screen );
+       __driUtilMessage("%s: drmMap failed for GART texture area\n", __FUNCTION__);
+       return NULL;
+    }
+
+     screen->gart_texture_offset = dri_priv->gartTexOffset + screen->gart_base;
+   }
+
    if ((screen->chip_family == CHIP_FAMILY_R350 || screen->chip_family == CHIP_FAMILY_R300) &&
        sPriv->ddx_version.minor < 2) {
       fprintf(stderr, "xf86-video-ati-6.6.2 or newer needed for Radeon 9500/9700/9800 cards.\n");
@@ -823,35 +1063,57 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
    }
 
    if (getenv("R300_NO_TCL"))
-     screen->chip_flags &= ~RADEON_CHIPSET_TCL;
+	   screen->chip_flags &= ~RADEON_CHIPSET_TCL;
 
    if (screen->chip_family <= CHIP_FAMILY_RS200)
-      screen->chip_flags |= RADEON_CLASS_R100;
+	   screen->chip_flags |= RADEON_CLASS_R100;
    else if (screen->chip_family <= CHIP_FAMILY_RV280)
-      screen->chip_flags |= RADEON_CLASS_R200;
+	   screen->chip_flags |= RADEON_CLASS_R200;
+   else if (screen->chip_family <= CHIP_FAMILY_RV570)
+	   screen->chip_flags |= RADEON_CLASS_R300;
    else
-      screen->chip_flags |= RADEON_CLASS_R300;
+	   screen->chip_flags |= RADEON_CLASS_R600;
 
    screen->cpp = dri_priv->bpp / 8;
    screen->AGPMode = dri_priv->AGPMode;
 
-   ret = radeonGetParam( sPriv->fd, RADEON_PARAM_FB_LOCATION,
-                         &temp);
-   if (ret) {
-       if (screen->chip_family < CHIP_FAMILY_RS690)
-	   screen->fbLocation      = ( INREG( RADEON_MC_FB_LOCATION ) & 0xffff) << 16;
-       else {
-           FREE( screen );
-           fprintf(stderr, "Unable to get fb location need newer drm\n");
-           return NULL;
+   ret = radeonGetParam(sPriv, RADEON_PARAM_FB_LOCATION, &temp);
+
+   /* +r6/r7 */
+   if(screen->chip_family >= CHIP_FAMILY_R600)
+   {
+       if (ret)
+       {
+            FREE( screen );
+            fprintf(stderr, "Unable to get fb location need newer drm\n");
+            return NULL;
        }
-   } else {
-       screen->fbLocation = (temp & 0xffff) << 16;
+       else
+       {
+            screen->fbLocation = (temp & 0xffff) << 24;
+       }
+   }
+   else
+   {
+        if (ret)
+        {
+            if (screen->chip_family < CHIP_FAMILY_RS600 && !screen->kernel_mm)
+	            screen->fbLocation      = ( INREG( RADEON_MC_FB_LOCATION ) & 0xffff) << 16;
+            else
+            {
+                FREE( screen );
+                fprintf(stderr, "Unable to get fb location need newer drm\n");
+                return NULL;
+            }
+        }
+        else
+        {
+            screen->fbLocation = (temp & 0xffff) << 16;
+        }
    }
 
-   if (screen->chip_family >= CHIP_FAMILY_RV515) {
-       ret = radeonGetParam( sPriv->fd, RADEON_PARAM_NUM_GB_PIPES,
-			     &temp);
+   if (IS_R300_CLASS(screen)) {
+       ret = radeonGetParam(sPriv, RADEON_PARAM_NUM_GB_PIPES, &temp);
        if (ret) {
 	   fprintf(stderr, "Unable to get num_pipes, need newer drm\n");
 	   switch (screen->chip_family) {
@@ -877,6 +1139,26 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
        } else {
 	   screen->num_gb_pipes = temp;
        }
+
+       /* pipe overrides */
+       switch (dri_priv->deviceID) {
+       case PCI_CHIP_R300_AD: /* 9500 with 1 quadpipe verified by: Reid Linnemann <lreid@cs.okstate.edu> */
+       case PCI_CHIP_RV410_5E4C: /* RV410 SE only have 1 quadpipe */
+       case PCI_CHIP_RV410_5E4F: /* RV410 SE only have 1 quadpipe */
+	   screen->num_gb_pipes = 1;
+	   break;
+       default:
+	   break;
+       }
+
+       if ( sPriv->drm_version.minor >= 31 ) {
+	       ret = radeonGetParam(sPriv, RADEON_PARAM_NUM_Z_PIPES, &temp);
+	       if (ret)
+		       screen->num_z_pipes = 2;
+	       else
+		       screen->num_z_pipes = temp;
+       } else
+	       screen->num_z_pipes = 2;
    }
 
    if ( sPriv->drm_version.minor >= 10 ) {
@@ -946,7 +1228,7 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
 
 #if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
    if (IS_R200_CLASS(screen))
-       screen->extensions[i++] = &r200AllocateExtension.base;
+      screen->extensions[i++] = &r200AllocateExtension.base;
 
    screen->extensions[i++] = &r200texOffsetExtension.base;
 #endif
@@ -955,11 +1237,184 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
    screen->extensions[i++] = &r300texOffsetExtension.base;
 #endif
 
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+   screen->extensions[i++] = &r600texOffsetExtension.base;
+#endif
+
    screen->extensions[i++] = NULL;
    sPriv->extensions = screen->extensions;
 
    screen->driScreen = sPriv;
    screen->sarea_priv_offset = dri_priv->sarea_priv_offset;
+   screen->sarea = (drm_radeon_sarea_t *) ((GLubyte *) sPriv->pSAREA +
+					       screen->sarea_priv_offset);
+
+   screen->bom = radeon_bo_manager_legacy_ctor(screen);
+   if (screen->bom == NULL) {
+     free(screen);
+     return NULL;
+   }
+
+   return screen;
+}
+
+/**
+ * Create the device specific screen private data struct for the DRI2 path.
+ *
+ * Marks the screen as kernel_mm, queries the PCI device id (and, for R300
+ * class chips, the pipe configuration) through radeonGetParam(), fills in
+ * the extension list and creates the GEM buffer object manager.
+ *
+ * \return the new screen, or NULL on failure (the screen is freed first).
+ */
+static radeonScreenPtr
+radeonCreateScreen2(__DRIscreenPrivate *sPriv)
+{
+   radeonScreenPtr screen;
+   int i;
+   int ret;
+   uint32_t device_id = 0;
+   uint32_t temp = 0;
+
+   /* Allocate the private area */
+   screen = (radeonScreenPtr) CALLOC( sizeof(*screen) );
+   if ( !screen ) {
+      __driUtilMessage("%s: Could not allocate memory for screen structure",
+		       __FUNCTION__);
+      fprintf(stderr, "leaving here\n");
+      return NULL;
+   }
+
+   radeon_init_debug();
+
+   /* parse information in __driConfigOptions */
+   driParseOptionInfo (&screen->optionCache,
+		       __driConfigOptions, __driNConfigOptions);
+
+   screen->kernel_mm = 1;
+   screen->chip_flags = 0;
+
+   /* if we have kms we can support all of these */
+   screen->drmSupportsCubeMapsR200 = 1;
+   screen->drmSupportsBlendColor = 1;
+   screen->drmSupportsTriPerf = 1;
+   screen->drmSupportsFragShader = 1;
+   screen->drmSupportsPointSprites = 1;
+   screen->drmSupportsCubeMapsR100 = 1;
+   screen->drmSupportsVertexProgram = 1;
+   screen->drmSupportsOcclusionQueries = 1;
+   screen->irq = 1;
+
+   ret = radeonGetParam(sPriv, RADEON_PARAM_DEVICE_ID, &device_id);
+   if (ret) {
+     FREE( screen );
+     fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_DEVICE_ID): %d\n", ret);
+     return NULL;
+   }
+
+   ret = radeon_set_screen_flags(screen, device_id);
+   if (ret == -1) {
+     FREE( screen );
+     return NULL;
+   }
+
+   if (getenv("R300_NO_TCL"))
+	   screen->chip_flags &= ~RADEON_CHIPSET_TCL;
+
+   if (screen->chip_family <= CHIP_FAMILY_RS200)
+	   screen->chip_flags |= RADEON_CLASS_R100;
+   else if (screen->chip_family <= CHIP_FAMILY_RV280)
+	   screen->chip_flags |= RADEON_CLASS_R200;
+   else if (screen->chip_family <= CHIP_FAMILY_RV570)
+	   screen->chip_flags |= RADEON_CLASS_R300;
+   else
+	   screen->chip_flags |= RADEON_CLASS_R600;
+
+   if (IS_R300_CLASS(screen)) {
+       ret = radeonGetParam(sPriv, RADEON_PARAM_NUM_GB_PIPES, &temp);
+       if (ret) {
+	   fprintf(stderr, "Unable to get num_pipes, need newer drm\n");
+	   switch (screen->chip_family) {
+	   case CHIP_FAMILY_R300:
+	   case CHIP_FAMILY_R350:
+	       screen->num_gb_pipes = 2;
+	       break;
+	   case CHIP_FAMILY_R420:
+	   case CHIP_FAMILY_R520:
+	   case CHIP_FAMILY_R580:
+	   case CHIP_FAMILY_RV560:
+	   case CHIP_FAMILY_RV570:
+	       screen->num_gb_pipes = 4;
+	       break;
+	   case CHIP_FAMILY_RV350:
+	   case CHIP_FAMILY_RV515:
+	   case CHIP_FAMILY_RV530:
+	   case CHIP_FAMILY_RV410:
+	   default:
+	       screen->num_gb_pipes = 1;
+	       break;
+	   }
+       } else {
+	   screen->num_gb_pipes = temp;
+       }
+
+       /* pipe overrides */
+       switch (device_id) {
+       case PCI_CHIP_R300_AD: /* 9500 with 1 quadpipe verified by: Reid Linnemann <lreid@cs.okstate.edu> */
+       case PCI_CHIP_RV410_5E4C: /* RV410 SE only have 1 quadpipe */
+       case PCI_CHIP_RV410_5E4F: /* RV410 SE only have 1 quadpipe */
+	   screen->num_gb_pipes = 1;
+	   break;
+       default:
+	   break;
+       }
+
+       ret = radeonGetParam(sPriv, RADEON_PARAM_NUM_Z_PIPES, &temp);
+       if (ret)
+	       screen->num_z_pipes = 2;
+       else
+	       screen->num_z_pipes = temp;
+
+   }
+
+   i = 0;
+   screen->extensions[i++] = &driCopySubBufferExtension.base;
+   screen->extensions[i++] = &driFrameTrackingExtension.base;
+   screen->extensions[i++] = &driReadDrawableExtension;
+
+   if ( screen->irq != 0 ) {
+       screen->extensions[i++] = &driSwapControlExtension.base;
+       screen->extensions[i++] = &driMediaStreamCounterExtension.base;
+   }
+
+#if !RADEON_COMMON
+   screen->extensions[i++] = &radeonTexBufferExtension.base;
+#endif
+
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+   if (IS_R200_CLASS(screen))
+       screen->extensions[i++] = &r200AllocateExtension.base;
+
+   screen->extensions[i++] = &r200TexBufferExtension.base;
+#endif
+
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+   screen->extensions[i++] = &r300TexBufferExtension.base;
+#endif
+
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+   screen->extensions[i++] = &r600TexBufferExtension.base;
+#endif
+
+   screen->extensions[i++] = NULL;
+   sPriv->extensions = screen->extensions;
+
+   screen->driScreen = sPriv;
+   screen->bom = radeon_bo_manager_gem_ctor(sPriv->fd);
+   if (screen->bom == NULL) {
+       free(screen);
+       return NULL;
+   }
    return screen;
 }
 
@@ -968,23 +1412,32 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
 static void
 radeonDestroyScreen( __DRIscreenPrivate *sPriv )
 {
-   radeonScreenPtr screen = (radeonScreenPtr)sPriv->private;
+    radeonScreenPtr screen = (radeonScreenPtr)sPriv->private;
 
-   if (!screen)
-      return;
+    if (!screen)
+        return;
 
-   if ( screen->gartTextures.map ) {
-      drmUnmap( screen->gartTextures.map, screen->gartTextures.size );
-   }
-   drmUnmapBufs( screen->buffers );
-   drmUnmap( screen->status.map, screen->status.size );
-   drmUnmap( screen->mmio.map, screen->mmio.size );
+    if (screen->kernel_mm) {
+#ifdef RADEON_BO_TRACK
+        radeon_tracker_print(&screen->bom->tracker, stderr);
+#endif
+        radeon_bo_manager_gem_dtor(screen->bom);
+    } else {
+        radeon_bo_manager_legacy_dtor(screen->bom);
+
+        if ( screen->gartTextures.map ) {
+            drmUnmap( screen->gartTextures.map, screen->gartTextures.size );
+        }
+        drmUnmapBufs( screen->buffers );
+        drmUnmap( screen->status.map, screen->status.size );
+        drmUnmap( screen->mmio.map, screen->mmio.size );
+    }
 
-   /* free all option information */
-   driDestroyOptionInfo (&screen->optionCache);
+    /* free all option information */
+    driDestroyOptionInfo (&screen->optionCache);
 
-   FREE( screen );
-   sPriv->private = NULL;
+    FREE( screen );
+    sPriv->private = NULL;
 }
 
 
@@ -993,16 +1446,21 @@ radeonDestroyScreen( __DRIscreenPrivate *sPriv )
 static GLboolean
 radeonInitDriver( __DRIscreenPrivate *sPriv )
 {
-   sPriv->private = (void *) radeonCreateScreen( sPriv );
-   if ( !sPriv->private ) {
-      radeonDestroyScreen( sPriv );
-      return GL_FALSE;
-   }
+    if (sPriv->dri2.enabled) {
+        sPriv->private = (void *) radeonCreateScreen2( sPriv );
+    } else {
+        sPriv->private = (void *) radeonCreateScreen( sPriv );
+    }
+    if ( !sPriv->private ) {
+        radeonDestroyScreen( sPriv );
+        return GL_FALSE;
+    }
 
-   return GL_TRUE;
+    return GL_TRUE;
 }
 
 
+
 /**
  * Create the Mesa framebuffer and renderbuffers for a given window/drawable.
  *
@@ -1015,131 +1473,112 @@ radeonCreateBuffer( __DRIscreenPrivate *driScrnPriv,
                     const __GLcontextModes *mesaVis,
                     GLboolean isPixmap )
 {
-   radeonScreenPtr screen = (radeonScreenPtr) driScrnPriv->private;
+    radeonScreenPtr screen = (radeonScreenPtr) driScrnPriv->private;
 
-   if (isPixmap) {
+    const GLboolean swDepth = GL_FALSE;
+    const GLboolean swAlpha = GL_FALSE;
+    const GLboolean swAccum = mesaVis->accumRedBits > 0;
+    const GLboolean swStencil = mesaVis->stencilBits > 0 &&
+	mesaVis->depthBits != 24;
+    GLenum rgbFormat;
+    struct radeon_framebuffer *rfb;
+
+    if (isPixmap)
       return GL_FALSE; /* not implemented */
-   }
-   else {
-      const GLboolean swDepth = GL_FALSE;
-      const GLboolean swAlpha = GL_FALSE;
-      const GLboolean swAccum = mesaVis->accumRedBits > 0;
-      const GLboolean swStencil = mesaVis->stencilBits > 0 &&
-         mesaVis->depthBits != 24;
-      struct gl_framebuffer *fb = _mesa_create_framebuffer(mesaVis);
-
-      /* front color renderbuffer */
-      {
-         driRenderbuffer *frontRb
-            = driNewRenderbuffer(GL_RGBA,
-                                 driScrnPriv->pFB + screen->frontOffset,
-                                 screen->cpp,
-                                 screen->frontOffset, screen->frontPitch,
-                                 driDrawPriv);
-         radeonSetSpanFunctions(frontRb, mesaVis);
-         _mesa_add_renderbuffer(fb, BUFFER_FRONT_LEFT, &frontRb->Base);
-      }
 
-      /* back color renderbuffer */
-      if (mesaVis->doubleBufferMode) {
-         driRenderbuffer *backRb
-            = driNewRenderbuffer(GL_RGBA,
-                                 driScrnPriv->pFB + screen->backOffset,
-                                 screen->cpp,
-                                 screen->backOffset, screen->backPitch,
-                                 driDrawPriv);
-         radeonSetSpanFunctions(backRb, mesaVis);
-         _mesa_add_renderbuffer(fb, BUFFER_BACK_LEFT, &backRb->Base);
-      }
+    rfb = CALLOC_STRUCT(radeon_framebuffer);
+    if (!rfb)
+      return GL_FALSE;
 
-      /* depth renderbuffer */
-      if (mesaVis->depthBits == 16) {
-         driRenderbuffer *depthRb
-            = driNewRenderbuffer(GL_DEPTH_COMPONENT16,
-                                 driScrnPriv->pFB + screen->depthOffset,
-                                 screen->cpp,
-                                 screen->depthOffset, screen->depthPitch,
-                                 driDrawPriv);
-         radeonSetSpanFunctions(depthRb, mesaVis);
-         _mesa_add_renderbuffer(fb, BUFFER_DEPTH, &depthRb->Base);
-	 depthRb->depthHasSurface = screen->depthHasSurface;
-      }
-      else if (mesaVis->depthBits == 24) {
-         driRenderbuffer *depthRb
-            = driNewRenderbuffer(GL_DEPTH_COMPONENT24,
-                                 driScrnPriv->pFB + screen->depthOffset,
-                                 screen->cpp,
-                                 screen->depthOffset, screen->depthPitch,
-                                 driDrawPriv);
-         radeonSetSpanFunctions(depthRb, mesaVis);
-         _mesa_add_renderbuffer(fb, BUFFER_DEPTH, &depthRb->Base);
-	 depthRb->depthHasSurface = screen->depthHasSurface;
-      }
+    _mesa_initialize_framebuffer(&rfb->base, mesaVis);
+
+    if (mesaVis->redBits == 5)
+        rgbFormat = GL_RGB5;
+    else if (mesaVis->alphaBits == 0)
+        rgbFormat = GL_RGB8;
+    else
+        rgbFormat = GL_RGBA8;
+
+    /* front color renderbuffer */
+    rfb->color_rb[0] = radeon_create_renderbuffer(rgbFormat, driDrawPriv);
+    _mesa_add_renderbuffer(&rfb->base, BUFFER_FRONT_LEFT, &rfb->color_rb[0]->base);
+    rfb->color_rb[0]->has_surface = 1;
+
+    /* back color renderbuffer */
+    if (mesaVis->doubleBufferMode) {
+      rfb->color_rb[1] = radeon_create_renderbuffer(rgbFormat, driDrawPriv);
+	_mesa_add_renderbuffer(&rfb->base, BUFFER_BACK_LEFT, &rfb->color_rb[1]->base);
+	rfb->color_rb[1]->has_surface = 1;
+    }
 
-      /* stencil renderbuffer */
-      if (mesaVis->stencilBits > 0 && !swStencil) {
-         driRenderbuffer *stencilRb
-            = driNewRenderbuffer(GL_STENCIL_INDEX8_EXT,
-                                 driScrnPriv->pFB + screen->depthOffset,
-                                 screen->cpp,
-                                 screen->depthOffset, screen->depthPitch,
-                                 driDrawPriv);
-         radeonSetSpanFunctions(stencilRb, mesaVis);
-         _mesa_add_renderbuffer(fb, BUFFER_STENCIL, &stencilRb->Base);
-	 stencilRb->depthHasSurface = screen->depthHasSurface;
+    if (mesaVis->depthBits == 24) {
+      if (mesaVis->stencilBits == 8) {
+	struct radeon_renderbuffer *depthStencilRb = radeon_create_renderbuffer(GL_DEPTH24_STENCIL8_EXT, driDrawPriv);
+	_mesa_add_renderbuffer(&rfb->base, BUFFER_DEPTH, &depthStencilRb->base);
+	_mesa_add_renderbuffer(&rfb->base, BUFFER_STENCIL, &depthStencilRb->base);
+	depthStencilRb->has_surface = screen->depthHasSurface;
+      } else {
+	/* depth renderbuffer */
+	struct radeon_renderbuffer *depth = radeon_create_renderbuffer(GL_DEPTH_COMPONENT24, driDrawPriv);
+	_mesa_add_renderbuffer(&rfb->base, BUFFER_DEPTH, &depth->base);
+	depth->has_surface = screen->depthHasSurface;
       }
+    } else if (mesaVis->depthBits == 16) {
+      /* just 16-bit depth buffer, no hw stencil */
+	struct radeon_renderbuffer *depth = radeon_create_renderbuffer(GL_DEPTH_COMPONENT16, driDrawPriv);
+	_mesa_add_renderbuffer(&rfb->base, BUFFER_DEPTH, &depth->base);
+	depth->has_surface = screen->depthHasSurface;
+    }
 
-      _mesa_add_soft_renderbuffers(fb,
-                                   GL_FALSE, /* color */
-                                   swDepth,
-                                   swStencil,
-                                   swAccum,
-                                   swAlpha,
-                                   GL_FALSE /* aux */);
-      driDrawPriv->driverPrivate = (void *) fb;
+    _mesa_add_soft_renderbuffers(&rfb->base,
+	    GL_FALSE, /* color */
+	    swDepth,
+	    swStencil,
+	    swAccum,
+	    swAlpha,
+	    GL_FALSE /* aux */);
+    driDrawPriv->driverPrivate = (void *) rfb;
 
-      return (driDrawPriv->driverPrivate != NULL);
-   }
+    return (driDrawPriv->driverPrivate != NULL);
 }
 
 
-static void
-radeonDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
-{
-   _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
-}
-
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
-/**
- * Choose the appropriate CreateContext function based on the chipset.
- * Eventually, all drivers will go through this process.
- */
-static GLboolean radeonCreateContext(const __GLcontextModes * glVisual,
-				     __DRIcontextPrivate * driContextPriv,
-				     void *sharedContextPriv)
+static void radeon_cleanup_renderbuffers(struct radeon_framebuffer *rfb)
 {
-	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
-	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
+	struct radeon_renderbuffer *rb;
 
-	if (IS_R300_CLASS(screen))
-		return r300CreateContext(glVisual, driContextPriv, sharedContextPriv);
-        return GL_FALSE;
+	rb = rfb->color_rb[0];
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+		rb->bo = NULL;
+	}
+	rb = rfb->color_rb[1];
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+		rb->bo = NULL;
+	}
+	rb = radeon_get_renderbuffer(&rfb->base, BUFFER_DEPTH);
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+		rb->bo = NULL;
+	}
 }
 
-/**
- * Choose the appropriate DestroyContext function based on the chipset.
- */
-static void radeonDestroyContext(__DRIcontextPrivate * driContextPriv)
+void
+radeonDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 {
-	radeonContextPtr radeon = (radeonContextPtr) driContextPriv->driverPrivate;
-
-	if (IS_R300_CLASS(radeon->radeonScreen))
-		return r300DestroyContext(driContextPriv);
+    struct radeon_framebuffer *rfb;
+    if (!driDrawPriv)
+	return;
+
+    rfb = (void*)driDrawPriv->driverPrivate;
+    if (!rfb)
+	return;
+    radeon_cleanup_renderbuffers(rfb);
+    _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 
-#endif
-
 /**
  * This is the driver specific part of the createNewScreen entry point.
  *
@@ -1165,6 +1604,11 @@ radeonInitScreen(__DRIscreenPrivate *psp)
    static const __DRIutilversion2 ddx_expected = { 4, 5, 0, 0 };
    static const __DRIversion dri_expected = { 4, 0, 0 };
    static const __DRIversion drm_expected = { 1, 24, 0 };
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+   static const char *driver_name = "R600";
+   static const __DRIutilversion2 ddx_expected = { 4, 5, 0, 0 };
+   static const __DRIversion dri_expected = { 4, 0, 0 };
+   static const __DRIversion drm_expected = { 1, 24, 0 };
 #endif
    RADEONDRIPtr dri_priv = (RADEONDRIPtr) psp->pDevPriv;
 
@@ -1192,18 +1636,112 @@ radeonInitScreen(__DRIscreenPrivate *psp)
    driInitSingleExtension( NULL, NV_vp_extension );
    driInitSingleExtension( NULL, ATI_fs_extension );
    driInitExtensions( NULL, point_extensions, GL_FALSE );
+#elif (defined(RADEON_COMMON_FOR_R300) || defined(RADEON_COMMON_FOR_R600))
+   driInitSingleExtension( NULL, gl_20_extension );
 #endif
 
    if (!radeonInitDriver(psp))
        return NULL;
 
+   /* for now fill in all modes */
    return radeonFillInModes( psp,
 			     dri_priv->bpp,
 			     (dri_priv->bpp == 16) ? 16 : 24,
-			     (dri_priv->bpp == 16) ? 0  : 8,
-			     (dri_priv->backOffset != dri_priv->depthOffset) );
+			     (dri_priv->bpp == 16) ? 0  : 8, 1);
 }
+#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
 
+/**
+ * This is the driver specific part of the createNewScreen entry point.
+ * Called when using DRI2.
+ *
+ * \return the __GLcontextModes supported by this driver
+ */
+static const
+__DRIconfig **radeonInitScreen2(__DRIscreenPrivate *psp)
+{
+   GLenum fb_format[3];
+   GLenum fb_type[3];
+   /* GLX_SWAP_COPY_OML is only supported because the Intel driver doesn't
+    * support pageflipping at all.
+    */
+   static const GLenum back_buffer_modes[] = {
+     GLX_NONE, GLX_SWAP_UNDEFINED_OML, /*, GLX_SWAP_COPY_OML*/
+   };
+   uint8_t depth_bits[4], stencil_bits[4], msaa_samples_array[1];
+   int color;
+   __DRIconfig **configs = NULL;
+
+   /* Calling driInitExtensions here, with a NULL context pointer,
+    * does not actually enable the extensions.  It just makes sure
+    * that all the dispatch offsets for all the extensions that
+    * *might* be enabled are known.  This is needed because the
+    * dispatch offsets need to be known when _mesa_context_create
+    * is called, but we can't enable the extensions until we have a
+    * context pointer.
+    *
+    * Hello chicken.  Hello egg.  How are you two today?
+    */
+   driInitExtensions( NULL, card_extensions, GL_FALSE );
+   driInitExtensions( NULL, mm_extensions, GL_FALSE );
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+   driInitExtensions( NULL, blend_extensions, GL_FALSE );
+   driInitSingleExtension( NULL, ARB_vp_extension );
+   driInitSingleExtension( NULL, NV_vp_extension );
+   driInitSingleExtension( NULL, ATI_fs_extension );
+   driInitExtensions( NULL, point_extensions, GL_FALSE );
+#elif (defined(RADEON_COMMON_FOR_R300) || defined(RADEON_COMMON_FOR_R600))
+   driInitSingleExtension( NULL, gl_20_extension );
+#endif
+
+   if (!radeonInitDriver(psp)) {
+       return NULL;
+    }
+   depth_bits[0] = 0;
+   stencil_bits[0] = 0;
+   depth_bits[1] = 16;
+   stencil_bits[1] = 0;
+   depth_bits[2] = 24;
+   stencil_bits[2] = 0;
+   depth_bits[3] = 24;
+   stencil_bits[3] = 8;
+
+   msaa_samples_array[0] = 0;
+
+   fb_format[0] = GL_RGB;
+   fb_type[0] = GL_UNSIGNED_SHORT_5_6_5;
+
+   fb_format[1] = GL_BGR;
+   fb_type[1] = GL_UNSIGNED_INT_8_8_8_8_REV;
+
+   fb_format[2] = GL_BGRA;
+   fb_type[2] = GL_UNSIGNED_INT_8_8_8_8_REV;
+
+   for (color = 0; color < ARRAY_SIZE(fb_format); color++) {
+      __DRIconfig **new_configs;
+
+      new_configs = driCreateConfigs(fb_format[color], fb_type[color],
+				     depth_bits,
+				     stencil_bits,
+				     ARRAY_SIZE(depth_bits),
+				     back_buffer_modes,
+				     ARRAY_SIZE(back_buffer_modes),
+				     msaa_samples_array,
+				     ARRAY_SIZE(msaa_samples_array));
+      if (configs == NULL)
+	 configs = new_configs;
+      else
+	 configs = driConcatConfigs(configs, new_configs);
+   }
+
+   if (configs == NULL) {
+      fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
+              __LINE__);
+      return NULL;
+   }
+
+   return (const __DRIconfig **)configs;
+}
 
 /**
  * Get information about previous buffer swaps.
@@ -1211,36 +1749,42 @@ radeonInitScreen(__DRIscreenPrivate *psp)
 static int
 getSwapInfo( __DRIdrawablePrivate *dPriv, __DRIswapInfo * sInfo )
 {
-#if !RADEON_COMMON || (RADEON_COMMON && defined(RADEON_COMMON_FOR_R300))
-   radeonContextPtr  rmesa;
-#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
-   r200ContextPtr  rmesa;
-#endif
+    struct radeon_framebuffer *rfb;
 
-   if ( (dPriv == NULL) || (dPriv->driContextPriv == NULL)
-	|| (dPriv->driContextPriv->driverPrivate == NULL)
-	|| (sInfo == NULL) ) {
-      return -1;
+    if ( (dPriv == NULL) || (sInfo == NULL) || (dPriv->driverPrivate == NULL)
+	 || (dPriv->driContextPriv == NULL)
+	 || (dPriv->driContextPriv->driverPrivate == NULL) ) {
+	return -1;
    }
 
-   rmesa = dPriv->driContextPriv->driverPrivate;
-   sInfo->swap_count = rmesa->swap_count;
-   sInfo->swap_ust = rmesa->swap_ust;
-   sInfo->swap_missed_count = rmesa->swap_missed_count;
+    rfb = dPriv->driverPrivate; /* swap statistics are read from the drawable's framebuffer */
+    sInfo->swap_count = rfb->swap_count;
+    sInfo->swap_ust = rfb->swap_ust;
+    sInfo->swap_missed_count = rfb->swap_missed_count;
 
    sInfo->swap_missed_usage = (sInfo->swap_missed_count != 0)
-       ? driCalculateSwapUsage( dPriv, 0, rmesa->swap_missed_ust )
+       ? driCalculateSwapUsage( dPriv, 0, rfb->swap_missed_ust )
        : 0.0;
 
    return 0;
 }
 
-#if !RADEON_COMMON || (RADEON_COMMON && defined(RADEON_COMMON_FOR_R300))
 const struct __DriverAPIRec driDriverAPI = {
    .InitScreen      = radeonInitScreen,
    .DestroyScreen   = radeonDestroyScreen,
-   .CreateContext   = radeonCreateContext,
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+   .CreateContext   = r200CreateContext,
+   .DestroyContext  = r200DestroyContext,
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+   .CreateContext   = r600CreateContext,
+   .DestroyContext  = radeonDestroyContext,
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+   .CreateContext   = r300CreateContext,
    .DestroyContext  = radeonDestroyContext,
+#else
+   .CreateContext   = r100CreateContext,
+   .DestroyContext  = radeonDestroyContext,
+#endif
    .CreateBuffer    = radeonCreateBuffer,
    .DestroyBuffer   = radeonDestroyBuffer,
    .SwapBuffers     = radeonSwapBuffers,
@@ -1252,23 +1796,7 @@ const struct __DriverAPIRec driDriverAPI = {
    .WaitForSBC      = NULL,
    .SwapBuffersMSC  = NULL,
    .CopySubBuffer   = radeonCopySubBuffer,
+    /* DRI2 */
+   .InitScreen2     = radeonInitScreen2,
 };
-#else
-const struct __DriverAPIRec driDriverAPI = {
-   .InitScreen      = radeonInitScreen,
-   .DestroyScreen   = radeonDestroyScreen,
-   .CreateContext   = r200CreateContext,
-   .DestroyContext  = r200DestroyContext,
-   .CreateBuffer    = radeonCreateBuffer,
-   .DestroyBuffer   = radeonDestroyBuffer,
-   .SwapBuffers     = r200SwapBuffers,
-   .MakeCurrent     = r200MakeCurrent,
-   .UnbindContext   = r200UnbindContext,
-   .GetSwapInfo     = getSwapInfo,
-   .GetDrawableMSC  = driDrawableGetMSC32,
-   .WaitForMSC      = driWaitForMSC32,
-   .WaitForSBC      = NULL,
-   .SwapBuffersMSC  = NULL,
-   .CopySubBuffer   = r200CopySubBuffer,
-};
-#endif
+
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.h b/src/mesa/drivers/dri/radeon/radeon_screen.h
index b84c70bfae..15744e8828 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.h
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.h
@@ -54,11 +54,12 @@ typedef struct {
    drmAddress map;			/* Mapping of the DRM region */
 } radeonRegionRec, *radeonRegionPtr;
 
-typedef struct {
+typedef struct radeon_screen {
    int chip_family;
    int chip_flags;
    int cpp;
    int card_type;
+   int device_id; /* PCI ID */
    int AGPMode;
    unsigned int irq;			/* IRQ number (0 means none) */
 
@@ -98,14 +99,19 @@ typedef struct {
    GLboolean drmSupportsPointSprites;   /* need radeon kernel module >= 1.13 */
    GLboolean drmSupportsCubeMapsR100;   /* need radeon kernel module >= 1.15 */
    GLboolean drmSupportsVertexProgram;  /* need radeon kernel module >= 1.25 */
+   GLboolean drmSupportsOcclusionQueries; /* need radeon kernel module >= 1.30 */
    GLboolean depthHasSurface;
 
    /* Configuration cache with default values for all contexts */
    driOptionCache optionCache;
 
-   const __DRIextension *extensions[8];
+   const __DRIextension *extensions[16];
 
    int num_gb_pipes;
+   int num_z_pipes;
+   int kernel_mm;
+   drm_radeon_sarea_t *sarea;	/* Private SAREA data */
+   struct radeon_bo_manager *bom;
 } radeonScreenRec, *radeonScreenPtr;
 
 #define IS_R100_CLASS(screen) \
@@ -114,5 +120,8 @@ typedef struct {
 	((screen->chip_flags & RADEON_CLASS_MASK) == RADEON_CLASS_R200)
 #define IS_R300_CLASS(screen) \
 	((screen->chip_flags & RADEON_CLASS_MASK) == RADEON_CLASS_R300)
+#define IS_R600_CLASS(screen) \
+	((screen->chip_flags & RADEON_CLASS_MASK) == RADEON_CLASS_R600)
 
+extern void radeonDestroyBuffer(__DRIdrawablePrivate *driDrawPriv);
 #endif /* __RADEON_SCREEN_H__ */
diff --git a/src/mesa/drivers/dri/radeon/radeon_span.c b/src/mesa/drivers/dri/radeon/radeon_span.c
index 12051ff1c8..4e100d854e 100644
--- a/src/mesa/drivers/dri/radeon/radeon_span.c
+++ b/src/mesa/drivers/dri/radeon/radeon_span.c
@@ -43,46 +43,222 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/glheader.h"
 #include "swrast/swrast.h"
 
-#include "radeon_context.h"
-#include "radeon_ioctl.h"
-#include "radeon_state.h"
+#include "radeon_common.h"
+#include "radeon_lock.h"
 #include "radeon_span.h"
-#include "radeon_tex.h"
-
-#include "drirenderbuffer.h"
 
 #define DBG 0
 
+static void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb);
+
+
+/* The r200 depth buffer is always tiled.  The address formulas below
+   follow the hardware docs, barring any transcription typos.
+*/
+#if defined(RADEON_COMMON_FOR_R200)
+static GLubyte *r200_depth_2byte(const struct radeon_renderbuffer * rrb,
+				 GLint x, GLint y)
+{
+    GLubyte *ptr = rrb->bo->ptr;
+    GLint offset;
+    if (rrb->has_surface) {
+	offset = x * rrb->cpp + y * rrb->pitch;
+    } else {
+	GLuint b;
+	offset = 0;
+	b = (((y  >> 4) * (rrb->pitch >> 8) + (x >> 6)));
+	offset += (b >> 1) << 12;
+	offset += (((rrb->pitch >> 8) & 0x1) ? (b & 0x1) : ((b & 0x1) ^ ((y >> 4) & 0x1))) << 11;
+	offset += ((y >> 2) & 0x3) << 9;
+	offset += ((x >> 3) & 0x1) << 8;
+	offset += ((x >> 4) & 0x3) << 6;
+	offset += ((x >> 2) & 0x1) << 5;
+	offset += ((y >> 1) & 0x1) << 4;
+	offset += ((x >> 1) & 0x1) << 3;
+	offset += (y & 0x1) << 2;
+	offset += (x & 0x1) << 1;
+    }
+    return &ptr[offset];
+}
+
+static GLubyte *r200_depth_4byte(const struct radeon_renderbuffer * rrb,
+				 GLint x, GLint y)
+{
+    GLubyte *ptr = rrb->bo->ptr;
+    GLint offset;
+    if (rrb->has_surface) {
+	offset = x * rrb->cpp + y * rrb->pitch;
+    } else {
+	GLuint b;
+	offset = 0;
+	b = (((y & 0x7ff) >> 4) * (rrb->pitch >> 7) + (x >> 5));
+	offset += (b >> 1) << 12;
+	offset += (((rrb->pitch >> 7) & 0x1) ? (b & 0x1) : ((b & 0x1) ^ ((y >> 4) & 0x1))) << 11;
+	offset += ((y >> 2) & 0x3) << 9;
+	offset += ((x >> 2) & 0x1) << 8;
+	offset += ((x >> 3) & 0x3) << 6;
+	offset += ((y >> 1) & 0x1) << 5;
+	offset += ((x >> 1) & 0x1) << 4;
+	offset += (y & 0x1) << 3;
+	offset += (x & 0x1) << 2;
+    }
+    return &ptr[offset];
+}
+#endif
+
+/* Radeon tiling on r300-r500 has 4 states:
+     macro-linear/micro-linear
+     macro-linear/micro-tiled
+     macro-tiled /micro-linear
+     macro-tiled /micro-tiled
+   Surface element sizes:
+     1 byte
+     2 byte - two microtile types; we only provide 8x2 microtiling
+     4 byte
+     8/16 byte (unused) */
+static GLubyte *radeon_ptr_4byte(const struct radeon_renderbuffer * rrb,
+			     GLint x, GLint y)
+{
+    GLubyte *ptr = rrb->bo->ptr;
+    uint32_t mask = RADEON_BO_FLAGS_MACRO_TILE | RADEON_BO_FLAGS_MICRO_TILE;
+    GLint offset;
+
+    if (rrb->has_surface || !(rrb->bo->flags & mask)) {
+        offset = x * rrb->cpp + y * rrb->pitch;
+    } else {
+        offset = 0;
+        if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
+	    if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE) {
+		offset = ((y >> 4) * (rrb->pitch >> 7) + (x >> 5)) << 11;
+		offset += (((y >> 3) ^ (x >> 5)) & 0x1) << 10;
+		offset += (((y >> 4) ^ (x >> 4)) & 0x1) << 9;
+		offset += (((y >> 2) ^ (x >> 4)) & 0x1) << 8;
+		offset += (((y >> 3) ^ (x >> 3)) & 0x1) << 7;
+		offset += ((y >> 1) & 0x1) << 6;
+		offset += ((x >> 2) & 0x1) << 5;
+		offset += (y & 1) << 4;
+		offset += (x & 3) << 2;
+            } else {
+		offset = ((y >> 3) * (rrb->pitch >> 8) + (x >> 6)) << 11;
+		offset += (((y >> 2) ^ (x >> 6)) & 0x1) << 10;
+		offset += (((y >> 3) ^ (x >> 5)) & 0x1) << 9;
+		offset += (((y >> 1) ^ (x >> 5)) & 0x1) << 8;
+		offset += (((y >> 2) ^ (x >> 4)) & 0x1) << 7;
+		offset += (y & 1) << 6;
+		offset += (x & 15) << 2;
+            }
+        } else {
+	    offset = ((y >> 1) * (rrb->pitch >> 4) + (x >> 2)) << 5;
+	    offset += (y & 1) << 4;
+	    offset += (x & 3) << 2;
+        }
+    }
+    return &ptr[offset];
+}
+
+static GLubyte *radeon_ptr_2byte_8x2(const struct radeon_renderbuffer * rrb,
+				     GLint x, GLint y)
+{
+    GLubyte *ptr = rrb->bo->ptr;
+    uint32_t mask = RADEON_BO_FLAGS_MACRO_TILE | RADEON_BO_FLAGS_MICRO_TILE;
+    GLint offset;
+
+    if (rrb->has_surface || !(rrb->bo->flags & mask)) {
+        offset = x * rrb->cpp + y * rrb->pitch;
+    } else {
+        offset = 0;
+        if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
+            if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE) {
+		offset = ((y >> 4) * (rrb->pitch >> 7) + (x >> 6)) << 11;
+		offset += (((y >> 3) ^ (x >> 6)) & 0x1) << 10;
+		offset += (((y >> 4) ^ (x >> 5)) & 0x1) << 9;
+		offset += (((y >> 2) ^ (x >> 5)) & 0x1) << 8;
+		offset += (((y >> 3) ^ (x >> 4)) & 0x1) << 7;
+		offset += ((y >> 1) & 0x1) << 6;
+		offset += ((x >> 3) & 0x1) << 5;
+		offset += (y & 1) << 4;
+		offset += (x & 3) << 2;
+            } else {
+		offset = ((y >> 3) * (rrb->pitch >> 8) + (x >> 7)) << 11;
+		offset += (((y >> 2) ^ (x >> 7)) & 0x1) << 10;
+		offset += (((y >> 3) ^ (x >> 6)) & 0x1) << 9;
+		offset += (((y >> 1) ^ (x >> 6)) & 0x1) << 8;
+		offset += (((y >> 2) ^ (x >> 5)) & 0x1) << 7;
+		offset += (y & 1) << 6;
+		offset += ((x >> 4) & 0x1) << 5;
+                offset += (x & 15) << 2;
+            }
+        } else {
+	    offset = ((y >> 1) * (rrb->pitch >> 4) + (x >> 3)) << 5;
+	    offset += (y & 0x1) << 4;
+	    offset += (x & 0x7) << 1;
+        }
+    }
+    return &ptr[offset];
+}
+
+#ifndef COMPILE_R300
+static uint32_t
+z24s8_to_s8z24(uint32_t val) /* 32-bit rotate right by 8 */
+{
+   return (val << 24) | (val >> 8); /* low byte -> bits 31:24, upper 24 bits -> bits 23:0 */
+}
+
+static uint32_t
+s8z24_to_z24s8(uint32_t val) /* 32-bit rotate left by 8; inverse of z24s8_to_s8z24 */
+{
+   return (val >> 24) | (val << 8); /* top byte -> bits 7:0, lower 24 bits -> bits 31:8 */
+}
+#endif
+
 /*
  * Note that all information needed to access pixels in a renderbuffer
  * should be obtained through the gl_renderbuffer parameter, not per-context
  * information.
  */
 #define LOCAL_VARS						\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;		\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;		\
-   const GLuint bottom = dPriv->h - 1;				\
-   GLubyte *buf = (GLubyte *) drb->flippedData			\
-      + (dPriv->y * drb->flippedPitch + dPriv->x) * drb->cpp;	\
-   GLuint p;							\
-   (void) p;
+   struct radeon_context *radeon = RADEON_CONTEXT(ctx);			\
+   struct radeon_renderbuffer *rrb = (void *) rb;		\
+   const GLint yScale = ctx->DrawBuffer->Name ? 1 : -1;			\
+   const GLint yBias = ctx->DrawBuffer->Name ? 0 : rrb->base.Height - 1;\
+   unsigned int num_cliprects;						\
+   struct drm_clip_rect *cliprects;					\
+   int x_off, y_off;							\
+   GLuint p;						\
+   (void)p;						\
+   radeon_get_cliprects(radeon, &cliprects, &num_cliprects, &x_off, &y_off);
 
 #define LOCAL_DEPTH_VARS				\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;	\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
-   const GLuint bottom = dPriv->h - 1;			\
-   GLuint xo = dPriv->x;				\
-   GLuint yo = dPriv->y;				\
-   GLubyte *buf = (GLubyte *) drb->Base.Data;
+   struct radeon_context *radeon = RADEON_CONTEXT(ctx);			\
+   struct radeon_renderbuffer *rrb = (void *) rb;	\
+   const GLint yScale = ctx->DrawBuffer->Name ? 1 : -1;			\
+   const GLint yBias = ctx->DrawBuffer->Name ? 0 : rrb->base.Height - 1;\
+   unsigned int num_cliprects;						\
+   struct drm_clip_rect *cliprects;					\
+   int x_off, y_off;							\
+  radeon_get_cliprects(radeon, &cliprects, &num_cliprects, &x_off, &y_off);
 
 #define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
 
-#define Y_FLIP(Y) (bottom - (Y))
+#define Y_FLIP(_y) ((_y) * yScale + yBias)
 
 #define HW_LOCK()
 
 #define HW_UNLOCK()
 
+/* XXX FBO: this is identical to the macro in spantmp2.h except we get
+ * the cliprect info from the context, not the driDrawable.
+ * Move this into spantmp2.h someday.
+ */
+#define HW_CLIPLOOP()							\
+   do {									\
+      int _nc = num_cliprects;						\
+      while ( _nc-- ) {							\
+	 int minx = cliprects[_nc].x1 - x_off;				\
+	 int miny = cliprects[_nc].y1 - y_off;				\
+	 int maxx = cliprects[_nc].x2 - x_off;				\
+	 int maxy = cliprects[_nc].y2 - y_off;
+
 /* ================================================================
  * Color buffer
  */
@@ -94,7 +270,41 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define TAG(x)    radeon##x##_RGB565
 #define TAG2(x,y) radeon##x##_RGB565##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 2)
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#include "spantmp2.h"
+
+/* 16 bit, ARGB1555 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_1_5_5_5_REV
+
+#define TAG(x)    radeon##x##_ARGB1555
+#define TAG2(x,y) radeon##x##_ARGB1555##y
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#include "spantmp2.h"
+
+/* 16 bit, ARGB4444 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_4_4_4_4_REV
+
+#define TAG(x)    radeon##x##_ARGB4444
+#define TAG2(x,y) radeon##x##_ARGB4444##y
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#include "spantmp2.h"
+
+/* 32 bit, xRGB8888 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
+
+#define TAG(x)    radeon##x##_xRGB8888
+#define TAG2(x,y) radeon##x##_xRGB8888##y
+#define GET_VALUE(_x, _y) ((*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)) | 0xff000000))
+#define PUT_VALUE(_x, _y, d) do { /* store d at pixel (_x,_y); single-statement safe */ \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, (_x) + x_off, (_y) + y_off );	\
+   *_ptr = (d);								\
+} while (0)
 #include "spantmp2.h"
 
 /* 32 bit, ARGB8888 color spanline and pixel functions
@@ -104,7 +314,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define TAG(x)    radeon##x##_ARGB8888
 #define TAG2(x,y) radeon##x##_ARGB8888##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 4)
+#define GET_VALUE(_x, _y) (*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)))
+#define PUT_VALUE(_x, _y, d) do { /* store d at pixel (_x,_y); single-statement safe */ \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, (_x) + x_off, (_y) + y_off );	\
+   *_ptr = (d);								\
+} while (0)
 #include "spantmp2.h"
 
 /* ================================================================
@@ -121,106 +335,127 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * too...
  */
 
-static GLuint radeon_mba_z32(const driRenderbuffer * drb, GLint x, GLint y)
-{
-	GLuint pitch = drb->pitch;
-	if (drb->depthHasSurface) {
-		return 4 * (x + y * pitch);
-	} else {
-		GLuint ba, address = 0;	/* a[0..1] = 0           */
-
-#ifdef COMPILE_R300
-		ba = (y / 8) * (pitch / 8) + (x / 8);
-#else
-		ba = (y / 16) * (pitch / 16) + (x / 16);
-#endif
-
-		address |= (x & 0x7) << 2;	/* a[2..4] = x[0..2]     */
-		address |= (y & 0x3) << 5;	/* a[5..6] = y[0..1]     */
-		address |= (((x & 0x10) >> 2) ^ (y & 0x4)) << 5;	/* a[7]    = x[4] ^ y[2] */
-		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
-
-		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
-		address |= (((x & 0x8) << 1) ^ (y & 0x10)) << 7;	/* a[11]   = x[3] ^ y[4] */
-		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
-
-		return address;
-	}
-}
-
-static INLINE GLuint
-radeon_mba_z16(const driRenderbuffer * drb, GLint x, GLint y)
-{
-	GLuint pitch = drb->pitch;
-	if (drb->depthHasSurface) {
-		return 2 * (x + y * pitch);
-	} else {
-		GLuint ba, address = 0;	/* a[0]    = 0           */
-
-		ba = (y / 16) * (pitch / 32) + (x / 32);
-
-		address |= (x & 0x7) << 1;	/* a[1..3] = x[0..2]     */
-		address |= (y & 0x7) << 4;	/* a[4..6] = y[0..2]     */
-		address |= (x & 0x8) << 4;	/* a[7]    = x[3]        */
-		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
-		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
-		address |= ((x & 0x10) ^ (y & 0x10)) << 7;	/* a[11]   = x[4] ^ y[4] */
-		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
-
-		return address;
-	}
-}
-
 /* 16-bit depth buffer functions
  */
 #define VALUE_TYPE GLushort
 
+#if defined(RADEON_COMMON_FOR_R200)
+#define WRITE_DEPTH( _x, _y, d )					\
+   *(GLushort *)r200_depth_2byte(rrb, _x + x_off, _y + y_off) = d
+#else
 #define WRITE_DEPTH( _x, _y, d )					\
-   *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo )) = d;
+   *(GLushort *)radeon_ptr_2byte_8x2(rrb, _x + x_off, _y + y_off) = d
+#endif
 
+#if defined(RADEON_COMMON_FOR_R200)
 #define READ_DEPTH( d, _x, _y )						\
-   d = *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo ));
+   d = *(GLushort *)r200_depth_2byte(rrb, _x + x_off, _y + y_off)
+#else
+#define READ_DEPTH( d, _x, _y )						\
+   d = *(GLushort *)radeon_ptr_2byte_8x2(rrb, _x + x_off, _y + y_off)
+#endif
 
 #define TAG(x) radeon##x##_z16
 #include "depthtmp.h"
 
-/* 24 bit depth, 8 bit stencil depthbuffer functions
+/* 24 bit depth
  *
  * Careful: It looks like the R300 uses ZZZS byte order while the R200
  * uses SZZZ for 24 bit depth, 8 bit stencil mode.
  */
 #define VALUE_TYPE GLuint
 
-#ifdef COMPILE_R300
+#if defined(COMPILE_R300)
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = *_ptr;				\
    tmp &= 0x000000ff;							\
    tmp |= ((d << 8) & 0xffffff00);					\
-   *(GLuint *)(buf + offset) = tmp;					\
+   *_ptr = tmp;					\
+} while (0)
+#elif defined(RADEON_COMMON_FOR_R200)
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = *_ptr;				\
+   tmp &= 0xff000000;							\
+   tmp |= ((d) & 0x00ffffff);						\
+   *_ptr = tmp;					\
 } while (0)
 #else
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );	\
+   GLuint tmp = *_ptr;							\
    tmp &= 0xff000000;							\
    tmp |= ((d) & 0x00ffffff);						\
-   *(GLuint *)(buf + offset) = tmp;					\
+   *_ptr = tmp;					\
 } while (0)
 #endif
 
-#ifdef COMPILE_R300
+#if defined(COMPILE_R300)
 #define READ_DEPTH( d, _x, _y )						\
-  do { \
-    d = (*(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,		\
-					 _y + yo )) & 0xffffff00) >> 8; \
+  do {									\
+    d = (*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)) & 0xffffff00) >> 8; \
+  }while(0)
+#elif defined(RADEON_COMMON_FOR_R200)
+#define READ_DEPTH( d, _x, _y )						\
+  do {									\
+    d = *(GLuint*)(r200_depth_4byte(rrb, _x + x_off, _y + y_off)) & 0x00ffffff; \
   }while(0)
 #else
+#define READ_DEPTH( d, _x, _y )	do { /* keep the low 24 bits of the 32-bit word */ \
+    d = *(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)) & 0x00ffffff; } while (0)
+#endif
+
+#define TAG(x) radeon##x##_z24
+#include "depthtmp.h"
+
+/* 24 bit depth, 8 bit stencil depthbuffer functions
+ * EXT_depth_stencil
+ *
+ * Careful: It looks like the R300 uses ZZZS byte order while the R200
+ * uses SZZZ for 24 bit depth, 8 bit stencil mode.
+ */
+#define VALUE_TYPE GLuint
+
+#if defined(COMPILE_R300)
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
+   *_ptr = d;								\
+} while (0)
+#elif defined(RADEON_COMMON_FOR_R200)
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = z24s8_to_s8z24(d);					\
+   *_ptr = tmp;								\
+} while (0)
+#else
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );	\
+   GLuint tmp = z24s8_to_s8z24(d);					\
+   *_ptr = tmp;					\
+} while (0)
+#endif
+
+#if defined(COMPILE_R300)
+#define READ_DEPTH( d, _x, _y )						\
+  do { \
+    d = (*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)));	\
+  }while(0)
+#elif defined(RADEON_COMMON_FOR_R200)
 #define READ_DEPTH( d, _x, _y )						\
-   d = *(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,			\
-					 _y + yo )) & 0x00ffffff;
+  do { \
+    d = s8z24_to_z24s8(*(GLuint*)(r200_depth_4byte(rrb, _x + x_off, _y + y_off)));	\
+  }while(0)
+#else
+#define READ_DEPTH( d, _x, _y )	do {					\
+    d = s8z24_to_z24s8(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off,	_y + y_off ))); \
+  } while (0)
 #endif
 
 #define TAG(x) radeon##x##_z24_s8
@@ -235,35 +470,51 @@ do {									\
 #ifdef COMPILE_R300
 #define WRITE_STENCIL( _x, _y, d )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte(rrb, _x + x_off, _y + y_off);		\
+   GLuint tmp = *_ptr;				\
    tmp &= 0xffffff00;							\
    tmp |= (d) & 0xff;							\
-   *(GLuint *)(buf + offset) = tmp;					\
+   *_ptr = tmp;					\
+} while (0)
+#elif defined(RADEON_COMMON_FOR_R200)
+#define WRITE_STENCIL( _x, _y, d )					\
+do {									\
+   GLuint *_ptr = (GLuint*)r200_depth_4byte(rrb, _x + x_off, _y + y_off);		\
+   GLuint tmp = *_ptr;				\
+   tmp &= 0x00ffffff;							\
+   tmp |= (((d) & 0xff) << 24);						\
+   *_ptr = tmp;					\
 } while (0)
 #else
 #define WRITE_STENCIL( _x, _y, d )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte(rrb, _x + x_off, _y + y_off);		\
+   GLuint tmp = *_ptr;				\
    tmp &= 0x00ffffff;							\
    tmp |= (((d) & 0xff) << 24);						\
-   *(GLuint *)(buf + offset) = tmp;					\
+   *_ptr = tmp;					\
 } while (0)
 #endif
 
 #ifdef COMPILE_R300
 #define READ_STENCIL( d, _x, _y )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = *_ptr;				\
    d = tmp & 0x000000ff;						\
 } while (0)
+#elif defined(RADEON_COMMON_FOR_R200)
+#define READ_STENCIL( d, _x, _y )					\
+do {									\
+   GLuint *_ptr = (GLuint*)r200_depth_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = *_ptr;				\
+   d = (tmp & 0xff000000) >> 24;					\
+} while (0)
 #else
 #define READ_STENCIL( d, _x, _y )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = *_ptr;				\
    d = (tmp & 0xff000000) >> 24;					\
 } while (0)
 #endif
@@ -271,29 +522,110 @@ do {									\
 #define TAG(x) radeon##x##_z24_s8
 #include "stenciltmp.h"
 
-/* Move locking out to get reasonable span performance (10x better
- * than doing this in HW_LOCK above).  WaitForIdle() is the main
- * culprit.
- */
+
+static void map_unmap_rb(struct gl_renderbuffer *rb, int flag) /* flag != 0: map BO + install span funcs; 0: unmap + clear row funcs */
+{
+	struct radeon_renderbuffer *rrb = radeon_renderbuffer(rb);
+	int r;
+
+	if (rrb == NULL || !rrb->bo) /* nothing to do without a radeon renderbuffer and its BO */
+		return;
+
+	if (flag) {
+		if (rrb->bo->bom->funcs->bo_wait) /* only wait when the BO manager provides a wait hook */
+			radeon_bo_wait(rrb->bo);
+		r = radeon_bo_map(rrb->bo, 1);
+		if (r) { /* NOTE(review): failure is only logged; span functions are still installed below */
+			fprintf(stderr, "(%s) error(%d) mapping buffer.\n",
+				__FUNCTION__, r);
+		}
+
+		radeonSetSpanFunctions(rrb);
+	} else {
+		radeon_bo_unmap(rrb->bo);
+		rb->GetRow = NULL; /* clear the row accessors once the BO is unmapped */
+		rb->PutRow = NULL;
+	}
+}
+
+static void
+radeon_map_unmap_buffers(GLcontext *ctx, GLboolean map) /* map (true) or unmap (false) the draw/read buffers and attached texture images */
+{
+	GLuint i, j;
+
+	/* color draw buffers */
+	for (j = 0; j < ctx->DrawBuffer->_NumColorDrawBuffers; j++)
+		map_unmap_rb(ctx->DrawBuffer->_ColorDrawBuffers[j], map);
+
+	/* check for render to textures */
+	for (i = 0; i < BUFFER_COUNT; i++) {
+		struct gl_renderbuffer_attachment *att =
+			ctx->DrawBuffer->Attachment + i;
+		struct gl_texture_object *tex = att->Texture;
+		if (tex) {
+			/* Render to texture. Note that a mipmapped texture need not
+			 * be complete for render to texture, so we must restrict to
+			 * mapping only the attached image.
+			 */
+			radeon_texture_image *image = get_radeon_texture_image(tex->Image[att->CubeMapFace][att->TextureLevel]);
+			ASSERT(att->Renderbuffer);
+
+			if (map)
+				radeon_teximage_map(image, GL_TRUE);
+			else
+				radeon_teximage_unmap(image);
+		}
+	}
+
+	map_unmap_rb(ctx->ReadBuffer->_ColorReadBuffer, map); /* color read buffer of the read framebuffer */
+
+	/* depth buffer (Note wrapper!) */
+	if (ctx->DrawBuffer->_DepthBuffer)
+		map_unmap_rb(ctx->DrawBuffer->_DepthBuffer->Wrapped, map);
+
+	if (ctx->DrawBuffer->_StencilBuffer) /* stencil buffer, likewise through its Wrapped renderbuffer */
+		map_unmap_rb(ctx->DrawBuffer->_StencilBuffer->Wrapped, map);
+}
 
 static void radeonSpanRenderStart(GLcontext * ctx)
 {
 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-#ifdef COMPILE_R300
-	r300ContextPtr r300 = (r300ContextPtr) rmesa;
-	R300_FIREVERTICES(r300);
-#else
-	RADEON_FIREVERTICES(rmesa);
-#endif
-	LOCK_HARDWARE(rmesa);
-	radeonWaitForIdleLocked(rmesa);
+	int i;
+
+	radeon_firevertices(rmesa);
+
+	/* The locking and wait for idle should really only be needed in classic mode.
+	 * In a future memory manager based implementation, this should become
+	 * unnecessary due to the fact that mapping our buffers, textures, etc.
+	 * should implicitly wait for any previous rendering commands that must
+	 * be waited on. */
+	if (!rmesa->radeonScreen->driScreen->dri2.enabled) {
+		LOCK_HARDWARE(rmesa);
+		radeonWaitForIdleLocked(rmesa);
+	}
+
+	for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {
+		if (ctx->Texture.Unit[i]._ReallyEnabled)
+			ctx->Driver.MapTexture(ctx, ctx->Texture.Unit[i]._Current);
+	}
+
+	radeon_map_unmap_buffers(ctx, 1);
 }
 
 static void radeonSpanRenderFinish(GLcontext * ctx)
 {
 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	int i;
 	_swrast_flush(ctx);
-	UNLOCK_HARDWARE(rmesa);
+	if (!rmesa->radeonScreen->driScreen->dri2.enabled) {
+		UNLOCK_HARDWARE(rmesa);
+	}
+	for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {
+		if (ctx->Texture.Unit[i]._ReallyEnabled)
+			ctx->Driver.UnmapTexture(ctx, ctx->Texture.Unit[i]._Current);
+	}
+
+	radeon_map_unmap_buffers(ctx, 0);
 }
 
 void radeonInitSpanFuncs(GLcontext * ctx)
@@ -307,20 +639,27 @@ void radeonInitSpanFuncs(GLcontext * ctx)
 /**
  * Plug in the Get/Put routines for the given driRenderbuffer.
  */
-void radeonSetSpanFunctions(driRenderbuffer * drb, const GLvisual * vis)
+static void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb)
 {
-	if (drb->Base.InternalFormat == GL_RGBA) {
-		if (vis->redBits == 5 && vis->greenBits == 6
-		    && vis->blueBits == 5) {
-			radeonInitPointers_RGB565(&drb->Base);
-		} else {
-			radeonInitPointers_ARGB8888(&drb->Base);
-		}
-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT16) {
-		radeonInitDepthPointers_z16(&drb->Base);
-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT24) {
-		radeonInitDepthPointers_z24_s8(&drb->Base);
-	} else if (drb->Base.InternalFormat == GL_STENCIL_INDEX8_EXT) {
-		radeonInitStencilPointers_z24_s8(&drb->Base);
+	if (rrb->base._ActualFormat == GL_RGB5) {
+		radeonInitPointers_RGB565(&rrb->base);
+	} else if (rrb->base._ActualFormat == GL_RGB8) {
+		radeonInitPointers_xRGB8888(&rrb->base);
+	} else if (rrb->base._ActualFormat == GL_RGBA8) {
+		radeonInitPointers_ARGB8888(&rrb->base);
+	} else if (rrb->base._ActualFormat == GL_RGBA4) {
+		radeonInitPointers_ARGB4444(&rrb->base);
+	} else if (rrb->base._ActualFormat == GL_RGB5_A1) {
+		radeonInitPointers_ARGB1555(&rrb->base);
+	} else if (rrb->base._ActualFormat == GL_DEPTH_COMPONENT16) {
+		radeonInitDepthPointers_z16(&rrb->base);
+	} else if (rrb->base._ActualFormat == GL_DEPTH_COMPONENT24) {
+		radeonInitDepthPointers_z24(&rrb->base);
+	} else if (rrb->base._ActualFormat == GL_DEPTH24_STENCIL8_EXT) {
+		radeonInitDepthPointers_z24_s8(&rrb->base);
+	} else if (rrb->base._ActualFormat == GL_STENCIL_INDEX8_EXT) {
+		radeonInitStencilPointers_z24_s8(&rrb->base);
+	} else {
+		fprintf(stderr, "radeonSetSpanFunctions: bad actual format: 0x%04X\n", rrb->base._ActualFormat);
 	}
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_span.h b/src/mesa/drivers/dri/radeon/radeon_span.h
index 9abe0864b1..ea6a2e7fb4 100644
--- a/src/mesa/drivers/dri/radeon/radeon_span.h
+++ b/src/mesa/drivers/dri/radeon/radeon_span.h
@@ -42,9 +42,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #ifndef __RADEON_SPAN_H__
 #define __RADEON_SPAN_H__
 
-#include "drirenderbuffer.h"
-
 extern void radeonInitSpanFuncs(GLcontext * ctx);
-extern void radeonSetSpanFunctions(driRenderbuffer * rb, const GLvisual * vis);
 
 #endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_state.c b/src/mesa/drivers/dri/radeon/radeon_state.c
index 32bcff3360..4d0d35ee0c 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state.c
@@ -40,6 +40,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/state.h"
 #include "main/context.h"
 #include "main/framebuffer.h"
+#include "main/simple_list.h"
 
 #include "vbo/vbo.h"
 #include "tnl/tnl.h"
@@ -47,6 +48,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "swrast_setup/swrast_setup.h"
 
 #include "radeon_context.h"
+#include "radeon_mipmap_tree.h"
 #include "radeon_ioctl.h"
 #include "radeon_state.h"
 #include "radeon_tcl.h"
@@ -62,7 +64,7 @@ static void radeonUpdateSpecular( GLcontext *ctx );
 
 static void radeonAlphaFunc( GLcontext *ctx, GLenum func, GLfloat ref )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    int pp_misc = rmesa->hw.ctx.cmd[CTX_PP_MISC];
    GLubyte refByte;
 
@@ -106,7 +108,7 @@ static void radeonAlphaFunc( GLcontext *ctx, GLenum func, GLfloat ref )
 static void radeonBlendEquationSeparate( GLcontext *ctx,
 					 GLenum modeRGB, GLenum modeA )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint b = rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] & ~RADEON_COMB_FCN_MASK;
    GLboolean fallback = GL_FALSE;
 
@@ -147,8 +149,8 @@ static void radeonBlendFuncSeparate( GLcontext *ctx,
 				     GLenum sfactorRGB, GLenum dfactorRGB,
 				     GLenum sfactorA, GLenum dfactorA )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   GLuint b = rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] & 
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   GLuint b = rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] &
       ~(RADEON_SRC_BLEND_MASK | RADEON_DST_BLEND_MASK);
    GLboolean fallback = GL_FALSE;
 
@@ -257,7 +259,7 @@ static void radeonBlendFuncSeparate( GLcontext *ctx,
 
 static void radeonDepthFunc( GLcontext *ctx, GLenum func )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    RADEON_STATECHANGE( rmesa, ctx );
    rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_Z_TEST_MASK;
@@ -293,7 +295,7 @@ static void radeonDepthFunc( GLcontext *ctx, GLenum func )
 
 static void radeonDepthMask( GLcontext *ctx, GLboolean flag )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    RADEON_STATECHANGE( rmesa, ctx );
 
    if ( ctx->Depth.Mask ) {
@@ -305,16 +307,16 @@ static void radeonDepthMask( GLcontext *ctx, GLboolean flag )
 
 static void radeonClearDepth( GLcontext *ctx, GLclampd d )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint format = (rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] &
 		    RADEON_DEPTH_FORMAT_MASK);
 
    switch ( format ) {
    case RADEON_DEPTH_FORMAT_16BIT_INT_Z:
-      rmesa->state.depth.clear = d * 0x0000ffff;
+      rmesa->radeon.state.depth.clear = d * 0x0000ffff;
       break;
    case RADEON_DEPTH_FORMAT_24BIT_INT_Z:
-      rmesa->state.depth.clear = d * 0x00ffffff;
+      rmesa->radeon.state.depth.clear = d * 0x00ffffff;
       break;
    }
 }
@@ -327,7 +329,7 @@ static void radeonClearDepth( GLcontext *ctx, GLclampd d )
 
 static void radeonFogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    union { int i; float f; } c, d;
    GLchan col[4];
 
@@ -391,7 +393,7 @@ static void radeonFogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
 	 rmesa->hw.fog.cmd[FOG_D] = d.i;
       }
       break;
-   case GL_FOG_COLOR: 
+   case GL_FOG_COLOR:
       RADEON_STATECHANGE( rmesa, ctx );
       UNCLAMPED_FLOAT_TO_RGB_CHAN( col, ctx->Fog.Color );
       rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] &= ~RADEON_FOG_COLOR_MASK;
@@ -406,109 +408,13 @@ static void radeonFogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
    }
 }
 
-
-/* =============================================================
- * Scissoring
- */
-
-
-static GLboolean intersect_rect( drm_clip_rect_t *out,
-				 drm_clip_rect_t *a,
-				 drm_clip_rect_t *b )
-{
-   *out = *a;
-   if ( b->x1 > out->x1 ) out->x1 = b->x1;
-   if ( b->y1 > out->y1 ) out->y1 = b->y1;
-   if ( b->x2 < out->x2 ) out->x2 = b->x2;
-   if ( b->y2 < out->y2 ) out->y2 = b->y2;
-   if ( out->x1 >= out->x2 ) return GL_FALSE;
-   if ( out->y1 >= out->y2 ) return GL_FALSE;
-   return GL_TRUE;
-}
-
-
-void radeonRecalcScissorRects( radeonContextPtr rmesa )
-{
-   drm_clip_rect_t *out;
-   int i;
-
-   /* Grow cliprect store?
-    */
-   if (rmesa->state.scissor.numAllocedClipRects < rmesa->numClipRects) {
-      while (rmesa->state.scissor.numAllocedClipRects < rmesa->numClipRects) {
-	 rmesa->state.scissor.numAllocedClipRects += 1;	/* zero case */
-	 rmesa->state.scissor.numAllocedClipRects *= 2;
-      }
-
-      if (rmesa->state.scissor.pClipRects)
-	 FREE(rmesa->state.scissor.pClipRects);
-
-      rmesa->state.scissor.pClipRects = 
-	 MALLOC( rmesa->state.scissor.numAllocedClipRects * 
-		 sizeof(drm_clip_rect_t) );
-
-      if ( rmesa->state.scissor.pClipRects == NULL ) {
-	 rmesa->state.scissor.numAllocedClipRects = 0;
-	 return;
-      }
-   }
-   
-   out = rmesa->state.scissor.pClipRects;
-   rmesa->state.scissor.numClipRects = 0;
-
-   for ( i = 0 ; i < rmesa->numClipRects ;  i++ ) {
-      if ( intersect_rect( out, 
-			   &rmesa->pClipRects[i], 
-			   &rmesa->state.scissor.rect ) ) {
-	 rmesa->state.scissor.numClipRects++;
-	 out++;
-      }
-   }
-}
-
-
-static void radeonUpdateScissor( GLcontext *ctx )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   if ( rmesa->dri.drawable ) {
-      __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
-
-      int x = ctx->Scissor.X;
-      int y = dPriv->h - ctx->Scissor.Y - ctx->Scissor.Height;
-      int w = ctx->Scissor.X + ctx->Scissor.Width - 1;
-      int h = dPriv->h - ctx->Scissor.Y - 1;
-
-      rmesa->state.scissor.rect.x1 = x + dPriv->x;
-      rmesa->state.scissor.rect.y1 = y + dPriv->y;
-      rmesa->state.scissor.rect.x2 = w + dPriv->x + 1;
-      rmesa->state.scissor.rect.y2 = h + dPriv->y + 1;
-
-      radeonRecalcScissorRects( rmesa );
-   }
-}
-
-
-static void radeonScissor( GLcontext *ctx,
-			   GLint x, GLint y, GLsizei w, GLsizei h )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   if ( ctx->Scissor.Enabled ) {
-      RADEON_FIREVERTICES( rmesa );	/* don't pipeline cliprect changes */
-      radeonUpdateScissor( ctx );
-   }
-
-}
-
-
 /* =============================================================
  * Culling
  */
 
 static void radeonCullFace( GLcontext *ctx, GLenum unused )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint s = rmesa->hw.set.cmd[SET_SE_CNTL];
    GLuint t = rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL];
 
@@ -545,7 +451,7 @@ static void radeonCullFace( GLcontext *ctx, GLenum unused )
 
 static void radeonFrontFace( GLcontext *ctx, GLenum mode )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    RADEON_STATECHANGE( rmesa, set );
    rmesa->hw.set.cmd[SET_SE_CNTL] &= ~RADEON_FFACE_CULL_DIR_MASK;
@@ -553,6 +459,10 @@ static void radeonFrontFace( GLcontext *ctx, GLenum mode )
    RADEON_STATECHANGE( rmesa, tcl );
    rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~RADEON_CULL_FRONT_IS_CCW;
 
+   /* Winding is inverted when rendering to FBO */
+   if (ctx->DrawBuffer && ctx->DrawBuffer->Name)
+      mode = (mode == GL_CW) ? GL_CCW : GL_CW;
+
    switch ( mode ) {
    case GL_CW:
       rmesa->hw.set.cmd[SET_SE_CNTL] |= RADEON_FFACE_CULL_CW;
@@ -570,7 +480,7 @@ static void radeonFrontFace( GLcontext *ctx, GLenum mode )
  */
 static void radeonLineWidth( GLcontext *ctx, GLfloat widthf )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    RADEON_STATECHANGE( rmesa, lin );
    RADEON_STATECHANGE( rmesa, set );
@@ -587,10 +497,10 @@ static void radeonLineWidth( GLcontext *ctx, GLfloat widthf )
 
 static void radeonLineStipple( GLcontext *ctx, GLint factor, GLushort pattern )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    RADEON_STATECHANGE( rmesa, lin );
-   rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] = 
+   rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] =
       ((((GLuint)factor & 0xff) << 16) | ((GLuint)pattern));
 }
 
@@ -602,12 +512,19 @@ static void radeonColorMask( GLcontext *ctx,
 			     GLboolean r, GLboolean g,
 			     GLboolean b, GLboolean a )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   GLuint mask = radeonPackColor( rmesa->radeonScreen->cpp,
-				  ctx->Color.ColorMask[RCOMP],
-				  ctx->Color.ColorMask[GCOMP],
-				  ctx->Color.ColorMask[BCOMP],
-				  ctx->Color.ColorMask[ACOMP] );
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   struct radeon_renderbuffer *rrb;
+   GLuint mask;
+
+   rrb = radeon_get_colorbuffer(&rmesa->radeon);
+   if (!rrb)
+     return;
+
+   mask = radeonPackColor( rrb->cpp,
+			   ctx->Color.ColorMask[RCOMP],
+			   ctx->Color.ColorMask[GCOMP],
+			   ctx->Color.ColorMask[BCOMP],
+			   ctx->Color.ColorMask[ACOMP] );
 
    if ( rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK] != mask ) {
       RADEON_STATECHANGE( rmesa, msk );
@@ -623,8 +540,9 @@ static void radeonColorMask( GLcontext *ctx,
 static void radeonPolygonOffset( GLcontext *ctx,
 				 GLfloat factor, GLfloat units )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   float_ui32_type constant =  { units * rmesa->state.depth.scale };
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+   float_ui32_type constant =  { units * depthScale };
    float_ui32_type factoru = { factor };
 
    RADEON_STATECHANGE( rmesa, zbs );
@@ -632,41 +550,16 @@ static void radeonPolygonOffset( GLcontext *ctx,
    rmesa->hw.zbs.cmd[ZBS_SE_ZBIAS_CONSTANT] = constant.ui32;
 }
 
-static void radeonPolygonStipple( GLcontext *ctx, const GLubyte *mask )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   GLuint i;
-   drm_radeon_stipple_t stipple;
-
-   /* Must flip pattern upside down.
-    */
-   for ( i = 0 ; i < 32 ; i++ ) {
-      rmesa->state.stipple.mask[31 - i] = ((GLuint *) mask)[i];
-   }
-
-   /* TODO: push this into cmd mechanism
-    */
-   RADEON_FIREVERTICES( rmesa );
-   LOCK_HARDWARE( rmesa );
-
-   /* FIXME: Use window x,y offsets into stipple RAM.
-    */
-   stipple.mask = rmesa->state.stipple.mask;
-   drmCommandWrite( rmesa->dri.fd, DRM_RADEON_STIPPLE, 
-                    &stipple, sizeof(drm_radeon_stipple_t) );
-   UNLOCK_HARDWARE( rmesa );
-}
-
 static void radeonPolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLboolean flag = (ctx->_TriangleCaps & DD_TRI_UNFILLED) != 0;
 
    /* Can't generally do unfilled via tcl, but some good special
-    * cases work. 
+    * cases work.
     */
    TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_UNFILLED, flag);
-   if (rmesa->TclFallback) {
+   if (rmesa->radeon.TclFallback) {
       radeonChooseRenderState( ctx );
       radeonChooseVertexState( ctx );
    }
@@ -686,7 +579,7 @@ static void radeonPolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
  */
 static void radeonUpdateSpecular( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    uint32_t p = rmesa->hw.ctx.cmd[CTX_PP_CNTL];
    GLuint flag = 0;
 
@@ -711,7 +604,7 @@ static void radeonUpdateSpecular( GLcontext *ctx )
       rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_DIFFUSE;
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |= RADEON_LIGHTING_ENABLE;
       p |=  RADEON_SPECULAR_ENABLE;
-      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= 
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &=
 	 ~RADEON_DIFFUSE_SPECULAR_COMBINE;
    }
    else if (ctx->Light.Enabled) {
@@ -741,7 +634,7 @@ static void radeonUpdateSpecular( GLcontext *ctx )
 	    RADEON_TCL_COMPUTE_SPECULAR) != 0;
       }
    }
- 
+
    TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_FOGCOORDSPEC, flag);
 
    if (NEED_SECONDARY_COLOR(ctx)) {
@@ -757,7 +650,7 @@ static void radeonUpdateSpecular( GLcontext *ctx )
 
    /* Update vertex/render formats
     */
-   if (rmesa->TclFallback) { 
+   if (rmesa->radeon.TclFallback) {
       radeonChooseRenderState( ctx );
       radeonChooseVertexState( ctx );
    }
@@ -769,12 +662,12 @@ static void radeonUpdateSpecular( GLcontext *ctx )
  */
 
 
-/* Update on colormaterial, material emmissive/ambient, 
+/* Update on colormaterial, material emissive/ambient,
  * lightmodel.globalambient
  */
 static void update_global_ambient( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    float *fcmd = (float *)RADEON_DB_STATE( glt );
 
    /* Need to do more if both emmissive & ambient are PREMULT:
@@ -782,23 +675,23 @@ static void update_global_ambient( GLcontext *ctx )
     */
    if ((rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &
        ((3 << RADEON_EMISSIVE_SOURCE_SHIFT) |
-	(3 << RADEON_AMBIENT_SOURCE_SHIFT))) == 0) 
+	(3 << RADEON_AMBIENT_SOURCE_SHIFT))) == 0)
    {
-      COPY_3V( &fcmd[GLT_RED], 
+      COPY_3V( &fcmd[GLT_RED],
 	       ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_EMISSION]);
       ACC_SCALE_3V( &fcmd[GLT_RED],
 		   ctx->Light.Model.Ambient,
 		   ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_AMBIENT]);
-   } 
+   }
    else
    {
       COPY_3V( &fcmd[GLT_RED], ctx->Light.Model.Ambient );
    }
-   
+
    RADEON_DB_STATECHANGE(rmesa, &rmesa->hw.glt);
 }
 
-/* Update on change to 
+/* Update on change to
  *    - light[p].colors
  *    - light[p].enabled
  */
@@ -809,13 +702,13 @@ static void update_light_colors( GLcontext *ctx, GLuint p )
 /*     fprintf(stderr, "%s\n", __FUNCTION__); */
 
    if (l->Enabled) {
-      radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+      r100ContextPtr rmesa = R100_CONTEXT(ctx);
       float *fcmd = (float *)RADEON_DB_STATE( lit[p] );
 
-      COPY_4V( &fcmd[LIT_AMBIENT_RED], l->Ambient );	 
+      COPY_4V( &fcmd[LIT_AMBIENT_RED], l->Ambient );
       COPY_4V( &fcmd[LIT_DIFFUSE_RED], l->Diffuse );
       COPY_4V( &fcmd[LIT_SPECULAR_RED], l->Specular );
-      
+
       RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.lit[p] );
    }
 }
@@ -829,7 +722,7 @@ static void check_twoside_fallback( GLcontext *ctx )
 
    if (ctx->Light.Enabled && ctx->Light.Model.TwoSide) {
       if (ctx->Light.ColorMaterialEnabled &&
-	  (ctx->Light.ColorMaterialBitmask & BACK_MATERIAL_BITS) != 
+	  (ctx->Light.ColorMaterialBitmask & BACK_MATERIAL_BITS) !=
 	  ((ctx->Light.ColorMaterialBitmask & FRONT_MATERIAL_BITS)<<1))
 	 fallback = GL_TRUE;
       else {
@@ -837,7 +730,7 @@ static void check_twoside_fallback( GLcontext *ctx )
 	    if (memcmp( ctx->Light.Material.Attrib[i],
 			ctx->Light.Material.Attrib[i+1],
 			sizeof(GLfloat)*4) != 0) {
-	       fallback = GL_TRUE;  
+	       fallback = GL_TRUE;
 	       break;
 	    }
       }
@@ -849,14 +742,14 @@ static void check_twoside_fallback( GLcontext *ctx )
 
 static void radeonColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
 {
-      radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+      r100ContextPtr rmesa = R100_CONTEXT(ctx);
       GLuint light_model_ctl1 = rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL];
 
       light_model_ctl1 &= ~((3 << RADEON_EMISSIVE_SOURCE_SHIFT) |
 			   (3 << RADEON_AMBIENT_SOURCE_SHIFT) |
 			   (3 << RADEON_DIFFUSE_SOURCE_SHIFT) |
-			   (3 << RADEON_SPECULAR_SOURCE_SHIFT)); 
-   
+			   (3 << RADEON_SPECULAR_SOURCE_SHIFT));
+
    if (ctx->Light.ColorMaterialEnabled) {
       GLuint mask = ctx->Light.ColorMaterialBitmask;
 
@@ -877,7 +770,7 @@ static void radeonColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
 	 light_model_ctl1 |= (RADEON_LM_SOURCE_STATE_MULT <<
 			     RADEON_AMBIENT_SOURCE_SHIFT);
       }
-	 
+
       if (mask & MAT_BIT_FRONT_DIFFUSE) {
 	 light_model_ctl1 |= (RADEON_LM_SOURCE_VERTEX_DIFFUSE <<
 			     RADEON_DIFFUSE_SOURCE_SHIFT);
@@ -886,7 +779,7 @@ static void radeonColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
 	 light_model_ctl1 |= (RADEON_LM_SOURCE_STATE_MULT <<
 			     RADEON_DIFFUSE_SOURCE_SHIFT);
       }
-   
+
       if (mask & MAT_BIT_FRONT_SPECULAR) {
 	 light_model_ctl1 |= (RADEON_LM_SOURCE_VERTEX_DIFFUSE <<
 			     RADEON_SPECULAR_SOURCE_SHIFT);
@@ -904,27 +797,27 @@ static void radeonColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
 		   (RADEON_LM_SOURCE_STATE_MULT << RADEON_DIFFUSE_SOURCE_SHIFT) |
 		   (RADEON_LM_SOURCE_STATE_MULT << RADEON_SPECULAR_SOURCE_SHIFT);
    }
-   
+
       if (light_model_ctl1 != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL]) {
 	 RADEON_STATECHANGE( rmesa, tcl );
-	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] = light_model_ctl1;      
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] = light_model_ctl1;
    }
 }
 
 void radeonUpdateMaterial( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLfloat (*mat)[4] = ctx->Light.Material.Attrib;
    GLfloat *fcmd = (GLfloat *)RADEON_DB_STATE( mtl );
    GLuint mask = ~0;
-   
+
    if (ctx->Light.ColorMaterialEnabled)
       mask &= ~ctx->Light.ColorMaterialBitmask;
 
-   if (RADEON_DEBUG & DEBUG_STATE)
+   if (RADEON_DEBUG & RADEON_STATE)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
-      
+
    if (mask & MAT_BIT_FRONT_EMISSION) {
       fcmd[MTL_EMMISSIVE_RED]   = mat[MAT_ATTRIB_FRONT_EMISSION][0];
       fcmd[MTL_EMMISSIVE_GREEN] = mat[MAT_ATTRIB_FRONT_EMISSION][1];
@@ -967,18 +860,18 @@ void radeonUpdateMaterial( GLcontext *ctx )
  *       _VP_inf_norm
  *       _h_inf_norm
  *       _Position
- *       _NormDirection
+ *       _NormSpotDirection
  *       _ModelViewInvScale
  *       _NeedEyeCoords
  *       _EyeZDir
  *
  * which are calculated in light.c and are correct for the current
  * lighting space (model or eye), hence dependencies on _NEW_MODELVIEW
- * and _MESA_NEW_NEED_EYE_COORDS.  
+ * and _MESA_NEW_NEED_EYE_COORDS.
  */
 static void update_light( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    /* Have to check these, or have an automatic shortcircuit mechanism
     * to remove noop statechanges. (Or just do a better job on the
@@ -991,12 +884,12 @@ static void update_light( GLcontext *ctx )
 	 tmp &= ~RADEON_LIGHT_IN_MODELSPACE;
       else
 	 tmp |= RADEON_LIGHT_IN_MODELSPACE;
-      
+
 
       /* Leave this test disabled: (unexplained q3 lockup) (even with
          new packets)
       */
-      if (tmp != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL]) 
+      if (tmp != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL])
       {
 	 RADEON_STATECHANGE( rmesa, tcl );
 	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] = tmp;
@@ -1020,17 +913,17 @@ static void update_light( GLcontext *ctx )
 	 if (ctx->Light.Light[p].Enabled) {
 	    struct gl_light *l = &ctx->Light.Light[p];
 	    GLfloat *fcmd = (GLfloat *)RADEON_DB_STATE( lit[p] );
-	    
+
 	    if (l->EyePosition[3] == 0.0) {
-	       COPY_3FV( &fcmd[LIT_POSITION_X], l->_VP_inf_norm ); 
-	       COPY_3FV( &fcmd[LIT_DIRECTION_X], l->_h_inf_norm ); 
+	       COPY_3FV( &fcmd[LIT_POSITION_X], l->_VP_inf_norm );
+	       COPY_3FV( &fcmd[LIT_DIRECTION_X], l->_h_inf_norm );
 	       fcmd[LIT_POSITION_W] = 0;
 	       fcmd[LIT_DIRECTION_W] = 0;
 	    } else {
 	       COPY_4V( &fcmd[LIT_POSITION_X], l->_Position );
-	       fcmd[LIT_DIRECTION_X] = -l->_NormDirection[0];
-	       fcmd[LIT_DIRECTION_Y] = -l->_NormDirection[1];
-	       fcmd[LIT_DIRECTION_Z] = -l->_NormDirection[2];
+	       fcmd[LIT_DIRECTION_X] = -l->_NormSpotDirection[0];
+	       fcmd[LIT_DIRECTION_Y] = -l->_NormSpotDirection[1];
+	       fcmd[LIT_DIRECTION_Z] = -l->_NormSpotDirection[2];
 	       fcmd[LIT_DIRECTION_W] = 0;
 	    }
 
@@ -1043,30 +936,30 @@ static void update_light( GLcontext *ctx )
 static void radeonLightfv( GLcontext *ctx, GLenum light,
 			   GLenum pname, const GLfloat *params )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLint p = light - GL_LIGHT0;
    struct gl_light *l = &ctx->Light.Light[p];
    GLfloat *fcmd = (GLfloat *)rmesa->hw.lit[p].cmd;
-   
+
 
    switch (pname) {
-   case GL_AMBIENT:		
+   case GL_AMBIENT:
    case GL_DIFFUSE:
    case GL_SPECULAR:
       update_light_colors( ctx, p );
       break;
 
-   case GL_SPOT_DIRECTION: 
-      /* picked up in update_light */	
+   case GL_SPOT_DIRECTION:
+      /* picked up in update_light */
       break;
 
    case GL_POSITION: {
-      /* positions picked up in update_light, but can do flag here */	
+      /* positions picked up in update_light, but can do flag here */
       GLuint flag;
       GLuint idx = TCL_PER_LIGHT_CTL_0 + p/2;
 
       /* FIXME: Set RANGE_ATTEN only when needed */
-      if (p&1) 
+      if (p&1)
 	 flag = RADEON_LIGHT_1_IS_LOCAL;
       else
 	 flag = RADEON_LIGHT_0_IS_LOCAL;
@@ -1158,16 +1051,16 @@ static void radeonLightfv( GLcontext *ctx, GLenum light,
    }
 }
 
-		  
+
 
 
 static void radeonLightModelfv( GLcontext *ctx, GLenum pname,
 				const GLfloat *param )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    switch (pname) {
-      case GL_LIGHT_MODEL_AMBIENT: 
+      case GL_LIGHT_MODEL_AMBIENT:
 	 update_global_ambient( ctx );
 	 break;
 
@@ -1188,7 +1081,7 @@ static void radeonLightModelfv( GLcontext *ctx, GLenum pname,
 
 	 check_twoside_fallback( ctx );
 
-	 if (rmesa->TclFallback) {
+	 if (rmesa->radeon.TclFallback) {
 	    radeonChooseRenderState( ctx );
 	    radeonChooseVertexState( ctx );
 	 }
@@ -1205,7 +1098,7 @@ static void radeonLightModelfv( GLcontext *ctx, GLenum pname,
 
 static void radeonShadeModel( GLcontext *ctx, GLenum mode )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint s = rmesa->hw.set.cmd[SET_SE_CNTL];
 
    s &= ~(RADEON_DIFFUSE_SHADE_MASK |
@@ -1244,7 +1137,7 @@ static void radeonShadeModel( GLcontext *ctx, GLenum mode )
 static void radeonClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
 {
    GLint p = (GLint) plane - (GLint) GL_CLIP_PLANE0;
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLint *ip = (GLint *)ctx->Transform._ClipUserPlane[p];
 
    RADEON_STATECHANGE( rmesa, ucp[p] );
@@ -1256,7 +1149,7 @@ static void radeonClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
 
 static void radeonUpdateClipPlanes( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint p;
 
    for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
@@ -1281,7 +1174,7 @@ static void
 radeonStencilFuncSeparate( GLcontext *ctx, GLenum face, GLenum func,
                            GLint ref, GLuint mask )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint refmask = (((ctx->Stencil.Ref[0] & 0xff) << RADEON_STENCIL_REF_SHIFT) |
 		     ((ctx->Stencil.ValueMask[0] & 0xff) << RADEON_STENCIL_MASK_SHIFT));
 
@@ -1325,7 +1218,7 @@ radeonStencilFuncSeparate( GLcontext *ctx, GLenum face, GLenum func,
 static void
 radeonStencilMaskSeparate( GLcontext *ctx, GLenum face, GLuint mask )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    RADEON_STATECHANGE( rmesa, msk );
    rmesa->hw.msk.cmd[MSK_RB3D_STENCILREFMASK] &= ~RADEON_STENCIL_WRITE_MASK;
@@ -1336,20 +1229,20 @@ radeonStencilMaskSeparate( GLcontext *ctx, GLenum face, GLuint mask )
 static void radeonStencilOpSeparate( GLcontext *ctx, GLenum face, GLenum fail,
                                      GLenum zfail, GLenum zpass )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    /* radeon 7200 have stencil bug, DEC and INC_WRAP will actually both do DEC_WRAP,
       and DEC_WRAP (and INVERT) will do INVERT. No way to get correct INC_WRAP and DEC,
       but DEC_WRAP can be fixed by using DEC and INC_WRAP at least use INC. */
-   
+
    GLuint tempRADEON_STENCIL_FAIL_DEC_WRAP;
    GLuint tempRADEON_STENCIL_FAIL_INC_WRAP;
    GLuint tempRADEON_STENCIL_ZFAIL_DEC_WRAP;
    GLuint tempRADEON_STENCIL_ZFAIL_INC_WRAP;
    GLuint tempRADEON_STENCIL_ZPASS_DEC_WRAP;
    GLuint tempRADEON_STENCIL_ZPASS_INC_WRAP;
-   
-   if (rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_BROKEN_STENCIL) {
+
+   if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_BROKEN_STENCIL) {
       tempRADEON_STENCIL_FAIL_DEC_WRAP = RADEON_STENCIL_FAIL_DEC;
       tempRADEON_STENCIL_FAIL_INC_WRAP = RADEON_STENCIL_FAIL_INC;
       tempRADEON_STENCIL_ZFAIL_DEC_WRAP = RADEON_STENCIL_ZFAIL_DEC;
@@ -1365,7 +1258,7 @@ static void radeonStencilOpSeparate( GLcontext *ctx, GLenum face, GLenum fail,
       tempRADEON_STENCIL_ZPASS_DEC_WRAP = RADEON_STENCIL_ZPASS_DEC_WRAP;
       tempRADEON_STENCIL_ZPASS_INC_WRAP = RADEON_STENCIL_ZPASS_INC_WRAP;
    }
-   
+
    RADEON_STATECHANGE( rmesa, ctx );
    rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] &= ~(RADEON_STENCIL_FAIL_MASK |
 					       RADEON_STENCIL_ZFAIL_MASK |
@@ -1455,9 +1348,9 @@ static void radeonStencilOpSeparate( GLcontext *ctx, GLenum face, GLenum fail,
 
 static void radeonClearStencil( GLcontext *ctx, GLint s )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
-   rmesa->state.stencil.clear = 
+   rmesa->radeon.state.stencil.clear =
       ((GLuint) (ctx->Stencil.Clear & 0xff) |
        (0xff << RADEON_STENCIL_MASK_SHIFT) |
        ((ctx->Stencil.WriteMask[0] & 0xff) << RADEON_STENCIL_WRITEMASK_SHIFT));
@@ -1481,20 +1374,30 @@ static void radeonClearStencil( GLcontext *ctx, GLint s )
  */
 void radeonUpdateWindow( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
-   GLfloat xoffset = (GLfloat)dPriv->x;
-   GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
+   GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
+   const GLboolean render_to_fbo = (ctx->DrawBuffer ? (ctx->DrawBuffer->Name != 0) : 0);
+   const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+   GLfloat y_scale, y_bias;
+
+   if (render_to_fbo) {
+      y_scale = 1.0;
+      y_bias = 0;
+   } else {
+      y_scale = -1.0;
+      y_bias = yoffset;
+   }
 
    float_ui32_type sx = { v[MAT_SX] };
    float_ui32_type tx = { v[MAT_TX] + xoffset + SUBPIXEL_X };
-   float_ui32_type sy = { - v[MAT_SY] };
-   float_ui32_type ty = { (- v[MAT_TY]) + yoffset + SUBPIXEL_Y };
-   float_ui32_type sz = { v[MAT_SZ] * rmesa->state.depth.scale };
-   float_ui32_type tz = { v[MAT_TZ] * rmesa->state.depth.scale };
+   float_ui32_type sy = { v[MAT_SY] * y_scale };
+   float_ui32_type ty = { (v[MAT_TY] * y_scale) + y_bias + SUBPIXEL_Y };
+   float_ui32_type sz = { v[MAT_SZ] * depthScale };
+   float_ui32_type tz = { v[MAT_TZ] * depthScale };
 
-   RADEON_FIREVERTICES( rmesa );
    RADEON_STATECHANGE( rmesa, vpt );
 
    rmesa->hw.vpt.cmd[VPT_SE_VPORT_XSCALE]  = sx.ui32;
@@ -1514,6 +1417,8 @@ static void radeonViewport( GLcontext *ctx, GLint x, GLint y,
     * values, or keep the originals hanging around.
     */
    radeonUpdateWindow( ctx );
+
+   radeon_viewport(ctx, x, y, width, height);
 }
 
 static void radeonDepthRange( GLcontext *ctx, GLclampd nearval,
@@ -1524,8 +1429,8 @@ static void radeonDepthRange( GLcontext *ctx, GLclampd nearval,
 
 void radeonUpdateViewportOffset( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLfloat xoffset = (GLfloat)dPriv->x;
    GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -1555,8 +1460,8 @@ void radeonUpdateViewportOffset( GLcontext *ctx )
                 RADEON_STIPPLE_Y_OFFSET_MASK);
 
          /* add magic offsets, then invert */
-         stx = 31 - ((rmesa->dri.drawable->x - 1) & RADEON_STIPPLE_COORD_MASK);
-         sty = 31 - ((rmesa->dri.drawable->y + rmesa->dri.drawable->h - 1)
+         stx = 31 - ((dPriv->x - 1) & RADEON_STIPPLE_COORD_MASK);
+         sty = 31 - ((dPriv->y + dPriv->h - 1)
                      & RADEON_STIPPLE_COORD_MASK);
 
          m |= ((stx << RADEON_STIPPLE_X_OFFSET_SHIFT) |
@@ -1580,20 +1485,26 @@ void radeonUpdateViewportOffset( GLcontext *ctx )
 
 static void radeonClearColor( GLcontext *ctx, const GLfloat color[4] )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLubyte c[4];
+   struct radeon_renderbuffer *rrb;
+
+   rrb = radeon_get_colorbuffer(&rmesa->radeon);
+   if (!rrb)
+     return;
+     
    CLAMPED_FLOAT_TO_UBYTE(c[0], color[0]);
    CLAMPED_FLOAT_TO_UBYTE(c[1], color[1]);
    CLAMPED_FLOAT_TO_UBYTE(c[2], color[2]);
    CLAMPED_FLOAT_TO_UBYTE(c[3], color[3]);
-   rmesa->state.color.clear = radeonPackColor( rmesa->radeonScreen->cpp,
+   rmesa->radeon.state.color.clear = radeonPackColor( rrb->cpp,
 					       c[0], c[1], c[2], c[3] );
 }
 
 
 static void radeonRenderMode( GLcontext *ctx, GLenum mode )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    FALLBACK( rmesa, RADEON_FALLBACK_RENDER_MODE, (mode != GL_RENDER) );
 }
 
@@ -1619,7 +1530,7 @@ static GLuint radeon_rop_tab[] = {
 
 static void radeonLogicOpCode( GLcontext *ctx, GLenum opcode )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint rop = (GLuint)opcode - GL_CLEAR;
 
    ASSERT( rop < 16 );
@@ -1628,108 +1539,16 @@ static void radeonLogicOpCode( GLcontext *ctx, GLenum opcode )
    rmesa->hw.msk.cmd[MSK_RB3D_ROPCNTL] = radeon_rop_tab[rop];
 }
 
-
-/**
- * Set up the cliprects for either front or back-buffer drawing.
- */
-void radeonSetCliprects( radeonContextPtr rmesa )
-{
-   __DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
-   __DRIdrawablePrivate *const readable = rmesa->dri.readable;
-   GLframebuffer *const draw_fb = (GLframebuffer*) drawable->driverPrivate;
-   GLframebuffer *const read_fb = (GLframebuffer*) readable->driverPrivate;
-
-   if (draw_fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
-      /* Can't ignore 2d windows if we are page flipping.
-       */
-      if ( drawable->numBackClipRects == 0 || rmesa->doPageFlip ) {
-	 rmesa->numClipRects = drawable->numClipRects;
-	 rmesa->pClipRects = drawable->pClipRects;
-      }
-      else {
-	 rmesa->numClipRects = drawable->numBackClipRects;
-	 rmesa->pClipRects = drawable->pBackClipRects;
-      }
-   }
-   else {
-      /* front buffer (or none, or multiple buffers */
-      rmesa->numClipRects = drawable->numClipRects;
-      rmesa->pClipRects = drawable->pClipRects;
-   }
-
-   if ((draw_fb->Width != drawable->w) || (draw_fb->Height != drawable->h)) {
-      _mesa_resize_framebuffer(rmesa->glCtx, draw_fb,
-			       drawable->w, drawable->h);
-      draw_fb->Initialized = GL_TRUE;
-   }
-
-   if (drawable != readable) {
-      if ((read_fb->Width != readable->w) || (read_fb->Height != readable->h)) {
-	 _mesa_resize_framebuffer(rmesa->glCtx, read_fb,
-				  readable->w, readable->h);
-	 read_fb->Initialized = GL_TRUE;
-      }
-   }
-
-   if (rmesa->state.scissor.enabled)
-      radeonRecalcScissorRects( rmesa );
-
-   rmesa->lastStamp = drawable->lastStamp;
-}
-
-
-/**
- * Called via glDrawBuffer.
- */
-static void radeonDrawBuffer( GLcontext *ctx, GLenum mode )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   if (RADEON_DEBUG & DEBUG_DRI)
-      fprintf(stderr, "%s %s\n", __FUNCTION__,
-	      _mesa_lookup_enum_by_nr( mode ));
-
-   RADEON_FIREVERTICES(rmesa);	/* don't pipeline cliprect changes */
-
-   if (ctx->DrawBuffer->_NumColorDrawBuffers != 1) {
-      /* 0 (GL_NONE) buffers or multiple color drawing buffers */
-      FALLBACK( rmesa, RADEON_FALLBACK_DRAW_BUFFER, GL_TRUE );
-      return;
-   }
-
-   switch ( ctx->DrawBuffer->_ColorDrawBufferIndexes[0] ) {
-   case BUFFER_FRONT_LEFT:
-   case BUFFER_BACK_LEFT:
-      FALLBACK( rmesa, RADEON_FALLBACK_DRAW_BUFFER, GL_FALSE );
-      break;
-   default:
-      FALLBACK( rmesa, RADEON_FALLBACK_DRAW_BUFFER, GL_TRUE );
-      return;
-   }
-
-   radeonSetCliprects( rmesa );
-
-   /* We'll set the drawing engine's offset/pitch parameters later
-    * when we update other state.
-    */
-}
-
-static void radeonReadBuffer( GLcontext *ctx, GLenum mode )
-{
-   /* nothing, until we implement h/w glRead/CopyPixels or CopyTexImage */
-}
-
-
 /* =============================================================
  * State enable/disable
  */
 
 static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint p, flag;
 
-   if ( RADEON_DEBUG & DEBUG_STATE )
+   if ( RADEON_DEBUG & RADEON_STATE )
       fprintf( stderr, "%s( %s = %s )\n", __FUNCTION__,
 	       _mesa_lookup_enum_by_nr( cap ),
 	       state ? "GL_TRUE" : "GL_FALSE" );
@@ -1787,7 +1606,7 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_CLIP_PLANE2:
    case GL_CLIP_PLANE3:
    case GL_CLIP_PLANE4:
-   case GL_CLIP_PLANE5: 
+   case GL_CLIP_PLANE5:
       p = cap-GL_CLIP_PLANE0;
       RADEON_STATECHANGE( rmesa, tcl );
       if (state) {
@@ -1821,10 +1640,10 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
       RADEON_STATECHANGE(rmesa, ctx );
       if ( state ) {
 	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  RADEON_DITHER_ENABLE;
-	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~rmesa->state.color.roundEnable;
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~rmesa->radeon.state.color.roundEnable;
       } else {
 	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_DITHER_ENABLE;
-	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  rmesa->state.color.roundEnable;
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  rmesa->radeon.state.color.roundEnable;
       }
       break;
 
@@ -1852,13 +1671,13 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_LIGHT7:
       RADEON_STATECHANGE(rmesa, tcl);
       p = cap - GL_LIGHT0;
-      if (p&1) 
+      if (p&1)
 	 flag = (RADEON_LIGHT_1_ENABLE |
-		 RADEON_LIGHT_1_ENABLE_AMBIENT | 
+		 RADEON_LIGHT_1_ENABLE_AMBIENT |
 		 RADEON_LIGHT_1_ENABLE_SPECULAR);
       else
 	 flag = (RADEON_LIGHT_0_ENABLE |
-		 RADEON_LIGHT_0_ENABLE_AMBIENT | 
+		 RADEON_LIGHT_0_ENABLE_AMBIENT |
 		 RADEON_LIGHT_0_ENABLE_SPECULAR);
 
       if (state)
@@ -1866,7 +1685,7 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
       else
 	 rmesa->hw.tcl.cmd[p/2 + TCL_PER_LIGHT_CTL_0] &= ~flag;
 
-      /* 
+      /*
        */
       update_light_colors( ctx, p );
       break;
@@ -1904,7 +1723,7 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
 	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_ROP_ENABLE;
       }
       break;
-      
+
    case GL_NORMALIZE:
       RADEON_STATECHANGE( rmesa, tcl );
       if ( state ) {
@@ -1971,21 +1790,30 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
    }
 
    case GL_SCISSOR_TEST:
-      RADEON_FIREVERTICES( rmesa );
-      rmesa->state.scissor.enabled = state;
+      radeon_firevertices(&rmesa->radeon);
+      rmesa->radeon.state.scissor.enabled = state;
       radeonUpdateScissor( ctx );
       break;
 
    case GL_STENCIL_TEST:
-      if ( rmesa->state.stencil.hwBuffer ) {
-	 RADEON_STATECHANGE( rmesa, ctx );
-	 if ( state ) {
-	    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  RADEON_STENCIL_ENABLE;
+      {
+	 GLboolean hw_stencil = GL_FALSE;
+	 if (ctx->DrawBuffer) {
+	    struct radeon_renderbuffer *rrbStencil
+	       = radeon_get_renderbuffer(ctx->DrawBuffer, BUFFER_STENCIL);
+	    hw_stencil = (rrbStencil && rrbStencil->bo);
+	 }
+
+	 if (hw_stencil) {
+	    RADEON_STATECHANGE( rmesa, ctx );
+	    if ( state ) {
+	       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  RADEON_STENCIL_ENABLE;
+	    } else {
+	       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_STENCIL_ENABLE;
+	    }
 	 } else {
-	    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_STENCIL_ENABLE;
+	    FALLBACK( rmesa, RADEON_FALLBACK_STENCIL, state );
 	 }
-      } else {
-	 FALLBACK( rmesa, RADEON_FALLBACK_STENCIL, state );
       }
       break;
 
@@ -1995,7 +1823,7 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_TEXTURE_GEN_T:
       /* Picked up in radeonUpdateTextureState.
        */
-      rmesa->recheck_texgen[ctx->Texture.CurrentUnit] = GL_TRUE; 
+      rmesa->recheck_texgen[ctx->Texture.CurrentUnit] = GL_TRUE;
       break;
 
    case GL_COLOR_SUM_EXT:
@@ -2010,11 +1838,11 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
 
 static void radeonLightingSpaceChange( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLboolean tmp;
    RADEON_STATECHANGE( rmesa, tcl );
 
-   if (RADEON_DEBUG & DEBUG_STATE)
+   if (RADEON_DEBUG & RADEON_STATE)
       fprintf(stderr, "%s %d BEFORE %x\n", __FUNCTION__, ctx->_NeedEyeCoords,
 	      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL]);
 
@@ -2029,7 +1857,7 @@ static void radeonLightingSpaceChange( GLcontext *ctx )
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= ~RADEON_RESCALE_NORMALS;
    }
 
-   if (RADEON_DEBUG & DEBUG_STATE) 
+   if (RADEON_DEBUG & RADEON_STATE)
       fprintf(stderr, "%s %d AFTER %x\n", __FUNCTION__, ctx->_NeedEyeCoords,
 	      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL]);
 }
@@ -2039,7 +1867,7 @@ static void radeonLightingSpaceChange( GLcontext *ctx )
  */
 
 
-void radeonUploadTexMatrix( radeonContextPtr rmesa,
+void radeonUploadTexMatrix( r100ContextPtr rmesa,
 			    int unit, GLboolean swapcols )
 {
 /* Here's how this works: on r100, only 3 tex coords can be submitted, so the
@@ -2065,7 +1893,7 @@ void radeonUploadTexMatrix( radeonContextPtr rmesa,
    int idx = TEXMAT_0 + unit;
    float *dest = ((float *)RADEON_DB_STATE( mat[idx] )) + MAT_ELT_0;
    int i;
-   struct gl_texture_unit tUnit = rmesa->glCtx->Texture.Unit[unit];
+   struct gl_texture_unit tUnit = rmesa->radeon.glCtx->Texture.Unit[unit];
    GLfloat *src = rmesa->tmpmat[unit].m;
 
    rmesa->TexMatColSwap &= ~(1 << unit);
@@ -2119,7 +1947,7 @@ void radeonUploadTexMatrix( radeonContextPtr rmesa,
 }
 
 
-static void upload_matrix( radeonContextPtr rmesa, GLfloat *src, int idx )
+static void upload_matrix( r100ContextPtr rmesa, GLfloat *src, int idx )
 {
    float *dest = ((float *)RADEON_DB_STATE( mat[idx] ))+MAT_ELT_0;
    int i;
@@ -2135,7 +1963,7 @@ static void upload_matrix( radeonContextPtr rmesa, GLfloat *src, int idx )
    RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.mat[idx] );
 }
 
-static void upload_matrix_t( radeonContextPtr rmesa, GLfloat *src, int idx )
+static void upload_matrix_t( r100ContextPtr rmesa, GLfloat *src, int idx )
 {
    float *dest = ((float *)RADEON_DB_STATE( mat[idx] ))+MAT_ELT_0;
    memcpy(dest, src, 16*sizeof(float));
@@ -2145,7 +1973,7 @@ static void upload_matrix_t( radeonContextPtr rmesa, GLfloat *src, int idx )
 
 static void update_texturematrix( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   r100ContextPtr rmesa = R100_CONTEXT( ctx );
    GLuint tpc = rmesa->hw.tcl.cmd[TCL_TEXTURE_PROC_CTL];
    GLuint vs = rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL];
    int unit;
@@ -2209,64 +2037,74 @@ static void update_texturematrix( GLcontext *ctx )
    }
 }
 
-
-/**
- * Tell the card where to render (offset, pitch).
- * Effected by glDrawBuffer, etc
- */
-void
-radeonUpdateDrawBuffer(GLcontext *ctx)
+static GLboolean r100ValidateBuffers(GLcontext *ctx)
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   driRenderbuffer *drb;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   struct radeon_renderbuffer *rrb;
+   int i, ret;
 
-   if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
-      /* draw to front */
-      drb = (driRenderbuffer *) fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
-   }
-   else if (fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
-      /* draw to back */
-      drb = (driRenderbuffer *) fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+   radeon_cs_space_reset_bos(rmesa->radeon.cmdbuf.cs);
+
+   rrb = radeon_get_colorbuffer(&rmesa->radeon);
+   /* color buffer */
+   if (rrb && rrb->bo) {
+     radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, rrb->bo,
+				       0, RADEON_GEM_DOMAIN_VRAM);
    }
-   else {
-      /* drawing to multiple buffers, or none */
-      return;
+
+   /* depth buffer */
+   rrb = radeon_get_depthbuffer(&rmesa->radeon);
+   /* depth BO joins the CS space check when one is attached */
+   if (rrb && rrb->bo) {
+     radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, rrb->bo,
+				       0, RADEON_GEM_DOMAIN_VRAM);
    }
 
-   assert(drb);
-   assert(drb->flippedPitch);
+   for (i = 0; i < ctx->Const.MaxTextureImageUnits; ++i) {
+      radeonTexObj *t;
 
-   RADEON_STATECHANGE( rmesa, ctx );
+      if (!ctx->Texture.Unit[i]._ReallyEnabled)
+	 continue;
 
-   /* Note: we used the (possibly) page-flipped values */
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET]
-     = ((drb->flippedOffset + rmesa->radeonScreen->fbLocation)
-	& RADEON_COLOROFFSET_MASK);
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = drb->flippedPitch;
-   if (rmesa->sarea->tiling_enabled) {
-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= RADEON_COLOR_TILE_ENABLE;
+      t = rmesa->state.texture.unit[i].texobj;
+      if (t->image_override && t->bo)
+	radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, t->bo,
+			   RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
+      else if (t->mt->bo)
+	radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, t->mt->bo,
+			   RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
    }
-}
 
+   ret = radeon_cs_space_check_with_bo(rmesa->radeon.cmdbuf.cs, first_elem(&rmesa->radeon.dma.reserved)->bo, RADEON_GEM_DOMAIN_GTT, 0);
+   if (ret)
+       return GL_FALSE;
+   return GL_TRUE;
+}
 
-void radeonValidateState( GLcontext *ctx )
+GLboolean radeonValidateState( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   GLuint new_state = rmesa->NewGLState;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   GLuint new_state = rmesa->radeon.NewGLState;
 
-   if (new_state & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
-     radeonUpdateDrawBuffer(ctx);
+   if (new_state & _NEW_BUFFERS) {
+     _mesa_update_framebuffer(ctx);
+     /* this updates the DrawBuffer's Width/Height if it's a FBO */
+     _mesa_update_draw_buffer_bounds(ctx);
+     RADEON_STATECHANGE(rmesa, ctx);
    }
 
    if (new_state & _NEW_TEXTURE) {
       radeonUpdateTextureState( ctx );
-      new_state |= rmesa->NewGLState; /* may add TEXTURE_MATRIX */
+      new_state |= rmesa->radeon.NewGLState; /* may add TEXTURE_MATRIX */
    }
 
+   /* we need to do a space check here */
+   if (!r100ValidateBuffers(ctx))
+     return GL_FALSE;
+
    /* Need an event driven matrix update?
     */
-   if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION)) 
+   if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION))
       upload_matrix( rmesa, ctx->_ModelProjectMatrix.m, MODEL_PROJ );
 
    /* Need these for lighting (shouldn't upload otherwise)
@@ -2290,12 +2128,14 @@ void radeonValidateState( GLcontext *ctx )
    /* emit all active clip planes if projection matrix changes.
     */
    if (new_state & (_NEW_PROJECTION)) {
-      if (ctx->Transform.ClipPlanesEnabled) 
+      if (ctx->Transform.ClipPlanesEnabled)
 	 radeonUpdateClipPlanes( ctx );
    }
 
 
-   rmesa->NewGLState = 0;
+   rmesa->radeon.NewGLState = 0;
+
+   return GL_TRUE;
 }
 
 
@@ -2306,7 +2146,7 @@ static void radeonInvalidateState( GLcontext *ctx, GLuint new_state )
    _vbo_InvalidateState( ctx, new_state );
    _tnl_InvalidateState( ctx, new_state );
    _ae_invalidate_state( ctx, new_state );
-   RADEON_CONTEXT(ctx)->NewGLState |= new_state;
+   R100_CONTEXT(ctx)->radeon.NewGLState |= new_state;
 }
 
 
@@ -2317,8 +2157,8 @@ static GLboolean check_material( GLcontext *ctx )
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    GLint i;
 
-   for (i = _TNL_ATTRIB_MAT_FRONT_AMBIENT; 
-	i < _TNL_ATTRIB_MAT_BACK_INDEXES; 
+   for (i = _TNL_ATTRIB_MAT_FRONT_AMBIENT;
+	i < _TNL_ATTRIB_MAT_BACK_INDEXES;
 	i++)
       if (tnl->vb.AttribPtr[i] &&
 	  tnl->vb.AttribPtr[i]->stride)
@@ -2326,20 +2166,21 @@ static GLboolean check_material( GLcontext *ctx )
 
    return GL_FALSE;
 }
-      
+
 
 static void radeonWrapRunPipeline( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLboolean has_material;
 
    if (0)
-      fprintf(stderr, "%s, newstate: %x\n", __FUNCTION__, rmesa->NewGLState);
+      fprintf(stderr, "%s, newstate: %x\n", __FUNCTION__, rmesa->radeon.NewGLState);
 
    /* Validate state:
     */
-   if (rmesa->NewGLState)
-      radeonValidateState( ctx );
+   if (rmesa->radeon.NewGLState)
+      if (!radeonValidateState( ctx ))
+	 FALLBACK(rmesa, RADEON_FALLBACK_TEXTURE, GL_TRUE);
 
    has_material = (ctx->Light.Enabled && check_material( ctx ));
 
@@ -2348,7 +2189,7 @@ static void radeonWrapRunPipeline( GLcontext *ctx )
    }
 
    /* Run the pipeline.
-    */ 
+    */
    _tnl_run_pipeline( ctx );
 
    if (has_material) {
@@ -2356,12 +2197,28 @@ static void radeonWrapRunPipeline( GLcontext *ctx )
    }
 }
 
+static void radeonPolygonStipple( GLcontext *ctx, const GLubyte *mask )
+{
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   GLint i;
+
+   radeon_firevertices(&r100->radeon);
+
+   RADEON_STATECHANGE(r100, stp);
+
+   /* Copy the stipple rows in their given order (no vertical flip).
+    */
+   for ( i = 31 ; i >= 0; i--) {
+     r100->hw.stp.cmd[3 + i] = ((GLuint *) mask)[i];
+   }
+}
+
 
 /* Initialize the driver's state functions.
  * Many of the ctx->Driver functions might have been initialized to
  * software defaults in the earlier _mesa_init_driver_functions() call.
  */
-void radeonInitStateFuncs( GLcontext *ctx )
+void radeonInitStateFuncs( GLcontext *ctx , GLboolean dri2 )
 {
    ctx->Driver.UpdateState		= radeonInvalidateState;
    ctx->Driver.LightingSpaceChange      = radeonLightingSpaceChange;
@@ -2394,7 +2251,10 @@ void radeonInitStateFuncs( GLcontext *ctx )
    ctx->Driver.LogicOpcode		= radeonLogicOpCode;
    ctx->Driver.PolygonMode		= radeonPolygonMode;
    ctx->Driver.PolygonOffset		= radeonPolygonOffset;
-   ctx->Driver.PolygonStipple		= radeonPolygonStipple;
+   if (dri2)
+      ctx->Driver.PolygonStipple		= radeonPolygonStipple;
+   else
+      ctx->Driver.PolygonStipple		= radeonPolygonStipplePreKMS;
    ctx->Driver.RenderMode		= radeonRenderMode;
    ctx->Driver.Scissor			= radeonScissor;
    ctx->Driver.ShadeModel		= radeonShadeModel;
diff --git a/src/mesa/drivers/dri/radeon/radeon_state.h b/src/mesa/drivers/dri/radeon/radeon_state.h
index 2171879f75..c780cff0cf 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state.h
+++ b/src/mesa/drivers/dri/radeon/radeon_state.h
@@ -39,30 +39,25 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "radeon_context.h"
 
-extern void radeonInitState( radeonContextPtr rmesa );
-extern void radeonInitStateFuncs( GLcontext *ctx );
+extern void radeonInitState( r100ContextPtr rmesa );
+extern void radeonInitStateFuncs( GLcontext *ctx , GLboolean dri2);
 
 extern void radeonUpdateMaterial( GLcontext *ctx );
 
-extern void radeonSetCliprects( radeonContextPtr rmesa );
-extern void radeonRecalcScissorRects( radeonContextPtr rmesa );
 extern void radeonUpdateViewportOffset( GLcontext *ctx );
 extern void radeonUpdateWindow( GLcontext *ctx );
 extern void radeonUpdateDrawBuffer( GLcontext *ctx );
-extern void radeonUploadTexMatrix( radeonContextPtr rmesa,
+extern void radeonUploadTexMatrix( r100ContextPtr rmesa,
 				   int unit, GLboolean swapcols );
 
-extern void radeonValidateState( GLcontext *ctx );
-
-extern void radeonPrintDirty( radeonContextPtr rmesa,
-			      const char *msg );
+extern GLboolean radeonValidateState( GLcontext *ctx );
 
 
 extern void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode );
 #define FALLBACK( rmesa, bit, mode ) do {				\
    if ( 0 ) fprintf( stderr, "FALLBACK in %s: #%d=%d\n",		\
 		     __FUNCTION__, bit, mode );				\
-   radeonFallback( rmesa->glCtx, bit, mode );				\
+   radeonFallback( rmesa->radeon.glCtx, bit, mode );				\
 } while (0)
 
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_state_init.c b/src/mesa/drivers/dri/radeon/radeon_state_init.c
index 57dc380050..f3ad0dd17a 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state_init.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state_init.c
@@ -38,39 +38,141 @@
 #include "swrast_setup/swrast_setup.h"
 
 #include "radeon_context.h"
+#include "radeon_mipmap_tree.h"
 #include "radeon_ioctl.h"
 #include "radeon_state.h"
 #include "radeon_tcl.h"
 #include "radeon_tex.h"
 #include "radeon_swtcl.h"
+#include "radeon_queryobj.h"
+
+#include "../r200/r200_reg.h"
 
 #include "xmlpool.h"
 
+/* New (1.3) state mechanism.  3 commands (packet, scalar, vector) in
+ * 1.3 cmdbuffers allow all previous state to be updated as well as
+ * the tcl scalar and vector areas.
+ */
+static struct {
+	int start;
+	int len;
+	const char *name;
+} packet[RADEON_MAX_STATE_PACKETS] = {
+	{RADEON_PP_MISC, 7, "RADEON_PP_MISC"},
+	{RADEON_PP_CNTL, 3, "RADEON_PP_CNTL"},
+	{RADEON_RB3D_COLORPITCH, 1, "RADEON_RB3D_COLORPITCH"},
+	{RADEON_RE_LINE_PATTERN, 2, "RADEON_RE_LINE_PATTERN"},
+	{RADEON_SE_LINE_WIDTH, 1, "RADEON_SE_LINE_WIDTH"},
+	{RADEON_PP_LUM_MATRIX, 1, "RADEON_PP_LUM_MATRIX"},
+	{RADEON_PP_ROT_MATRIX_0, 2, "RADEON_PP_ROT_MATRIX_0"},
+	{RADEON_RB3D_STENCILREFMASK, 3, "RADEON_RB3D_STENCILREFMASK"},
+	{RADEON_SE_VPORT_XSCALE, 6, "RADEON_SE_VPORT_XSCALE"},
+	{RADEON_SE_CNTL, 2, "RADEON_SE_CNTL"},
+	{RADEON_SE_CNTL_STATUS, 1, "RADEON_SE_CNTL_STATUS"},
+	{RADEON_RE_MISC, 1, "RADEON_RE_MISC"},
+	{RADEON_PP_TXFILTER_0, 6, "RADEON_PP_TXFILTER_0"},
+	{RADEON_PP_BORDER_COLOR_0, 1, "RADEON_PP_BORDER_COLOR_0"},
+	{RADEON_PP_TXFILTER_1, 6, "RADEON_PP_TXFILTER_1"},
+	{RADEON_PP_BORDER_COLOR_1, 1, "RADEON_PP_BORDER_COLOR_1"},
+	{RADEON_PP_TXFILTER_2, 6, "RADEON_PP_TXFILTER_2"},
+	{RADEON_PP_BORDER_COLOR_2, 1, "RADEON_PP_BORDER_COLOR_2"},
+	{RADEON_SE_ZBIAS_FACTOR, 2, "RADEON_SE_ZBIAS_FACTOR"},
+	{RADEON_SE_TCL_OUTPUT_VTX_FMT, 11, "RADEON_SE_TCL_OUTPUT_VTX_FMT"},
+	{RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED, 17,
+		    "RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED"},
+	{R200_PP_TXCBLEND_0, 4, "R200_PP_TXCBLEND_0"},
+	{R200_PP_TXCBLEND_1, 4, "R200_PP_TXCBLEND_1"},
+	{R200_PP_TXCBLEND_2, 4, "R200_PP_TXCBLEND_2"},
+	{R200_PP_TXCBLEND_3, 4, "R200_PP_TXCBLEND_3"},
+	{R200_PP_TXCBLEND_4, 4, "R200_PP_TXCBLEND_4"},
+	{R200_PP_TXCBLEND_5, 4, "R200_PP_TXCBLEND_5"},
+	{R200_PP_TXCBLEND_6, 4, "R200_PP_TXCBLEND_6"},
+	{R200_PP_TXCBLEND_7, 4, "R200_PP_TXCBLEND_7"},
+	{R200_SE_TCL_LIGHT_MODEL_CTL_0, 6, "R200_SE_TCL_LIGHT_MODEL_CTL_0"},
+	{R200_PP_TFACTOR_0, 6, "R200_PP_TFACTOR_0"},
+	{R200_SE_VTX_FMT_0, 4, "R200_SE_VTX_FMT_0"},
+	{R200_SE_VAP_CNTL, 1, "R200_SE_VAP_CNTL"},
+	{R200_SE_TCL_MATRIX_SEL_0, 5, "R200_SE_TCL_MATRIX_SEL_0"},
+	{R200_SE_TCL_TEX_PROC_CTL_2, 5, "R200_SE_TCL_TEX_PROC_CTL_2"},
+	{R200_SE_TCL_UCP_VERT_BLEND_CTL, 1, "R200_SE_TCL_UCP_VERT_BLEND_CTL"},
+	{R200_PP_TXFILTER_0, 6, "R200_PP_TXFILTER_0"},
+	{R200_PP_TXFILTER_1, 6, "R200_PP_TXFILTER_1"},
+	{R200_PP_TXFILTER_2, 6, "R200_PP_TXFILTER_2"},
+	{R200_PP_TXFILTER_3, 6, "R200_PP_TXFILTER_3"},
+	{R200_PP_TXFILTER_4, 6, "R200_PP_TXFILTER_4"},
+	{R200_PP_TXFILTER_5, 6, "R200_PP_TXFILTER_5"},
+	{R200_PP_TXOFFSET_0, 1, "R200_PP_TXOFFSET_0"},
+	{R200_PP_TXOFFSET_1, 1, "R200_PP_TXOFFSET_1"},
+	{R200_PP_TXOFFSET_2, 1, "R200_PP_TXOFFSET_2"},
+	{R200_PP_TXOFFSET_3, 1, "R200_PP_TXOFFSET_3"},
+	{R200_PP_TXOFFSET_4, 1, "R200_PP_TXOFFSET_4"},
+	{R200_PP_TXOFFSET_5, 1, "R200_PP_TXOFFSET_5"},
+	{R200_SE_VTE_CNTL, 1, "R200_SE_VTE_CNTL"},
+	{R200_SE_TCL_OUTPUT_VTX_COMP_SEL, 1,
+	 "R200_SE_TCL_OUTPUT_VTX_COMP_SEL"},
+	{R200_PP_TAM_DEBUG3, 1, "R200_PP_TAM_DEBUG3"},
+	{R200_PP_CNTL_X, 1, "R200_PP_CNTL_X"},
+	{R200_RB3D_DEPTHXY_OFFSET, 1, "R200_RB3D_DEPTHXY_OFFSET"},
+	{R200_RE_AUX_SCISSOR_CNTL, 1, "R200_RE_AUX_SCISSOR_CNTL"},
+	{R200_RE_SCISSOR_TL_0, 2, "R200_RE_SCISSOR_TL_0"},
+	{R200_RE_SCISSOR_TL_1, 2, "R200_RE_SCISSOR_TL_1"},
+	{R200_RE_SCISSOR_TL_2, 2, "R200_RE_SCISSOR_TL_2"},
+	{R200_SE_VAP_CNTL_STATUS, 1, "R200_SE_VAP_CNTL_STATUS"},
+	{R200_SE_VTX_STATE_CNTL, 1, "R200_SE_VTX_STATE_CNTL"},
+	{R200_RE_POINTSIZE, 1, "R200_RE_POINTSIZE"},
+	{R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0, 4,
+		    "R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0"},
+	{R200_PP_CUBIC_FACES_0, 1, "R200_PP_CUBIC_FACES_0"},	/* 61 */
+	{R200_PP_CUBIC_OFFSET_F1_0, 5, "R200_PP_CUBIC_OFFSET_F1_0"}, /* 62 */
+	{R200_PP_CUBIC_FACES_1, 1, "R200_PP_CUBIC_FACES_1"},
+	{R200_PP_CUBIC_OFFSET_F1_1, 5, "R200_PP_CUBIC_OFFSET_F1_1"},
+	{R200_PP_CUBIC_FACES_2, 1, "R200_PP_CUBIC_FACES_2"},
+	{R200_PP_CUBIC_OFFSET_F1_2, 5, "R200_PP_CUBIC_OFFSET_F1_2"},
+	{R200_PP_CUBIC_FACES_3, 1, "R200_PP_CUBIC_FACES_3"},
+	{R200_PP_CUBIC_OFFSET_F1_3, 5, "R200_PP_CUBIC_OFFSET_F1_3"},
+	{R200_PP_CUBIC_FACES_4, 1, "R200_PP_CUBIC_FACES_4"},
+	{R200_PP_CUBIC_OFFSET_F1_4, 5, "R200_PP_CUBIC_OFFSET_F1_4"},
+	{R200_PP_CUBIC_FACES_5, 1, "R200_PP_CUBIC_FACES_5"},
+	{R200_PP_CUBIC_OFFSET_F1_5, 5, "R200_PP_CUBIC_OFFSET_F1_5"},
+	{RADEON_PP_TEX_SIZE_0, 2, "RADEON_PP_TEX_SIZE_0"},
+	{RADEON_PP_TEX_SIZE_1, 2, "RADEON_PP_TEX_SIZE_1"},
+	{RADEON_PP_TEX_SIZE_2, 2, "RADEON_PP_TEX_SIZE_2"},
+	{R200_RB3D_BLENDCOLOR, 3, "R200_RB3D_BLENDCOLOR"},
+	{R200_SE_TCL_POINT_SPRITE_CNTL, 1, "R200_SE_TCL_POINT_SPRITE_CNTL"},
+	{RADEON_PP_CUBIC_FACES_0, 1, "RADEON_PP_CUBIC_FACES_0"},
+	{RADEON_PP_CUBIC_OFFSET_T0_0, 5, "RADEON_PP_CUBIC_OFFSET_T0_0"},
+	{RADEON_PP_CUBIC_FACES_1, 1, "RADEON_PP_CUBIC_FACES_1"},
+	{RADEON_PP_CUBIC_OFFSET_T1_0, 5, "RADEON_PP_CUBIC_OFFSET_T1_0"},
+	{RADEON_PP_CUBIC_FACES_2, 1, "RADEON_PP_CUBIC_FACES_2"},
+	{RADEON_PP_CUBIC_OFFSET_T2_0, 5, "RADEON_PP_CUBIC_OFFSET_T2_0"},
+	{R200_PP_TRI_PERF, 2, "R200_PP_TRI_PERF"},
+	{R200_PP_TXCBLEND_8, 32, "R200_PP_AFS_0"},     /* 85 */
+	{R200_PP_TXCBLEND_0, 32, "R200_PP_AFS_1"},
+	{R200_PP_TFACTOR_0, 8, "R200_ATF_TFACTOR"},
+	{R200_PP_TXFILTER_0, 8, "R200_PP_TXCTLALL_0"},
+	{R200_PP_TXFILTER_1, 8, "R200_PP_TXCTLALL_1"},
+	{R200_PP_TXFILTER_2, 8, "R200_PP_TXCTLALL_2"},
+	{R200_PP_TXFILTER_3, 8, "R200_PP_TXCTLALL_3"},
+	{R200_PP_TXFILTER_4, 8, "R200_PP_TXCTLALL_4"},
+	{R200_PP_TXFILTER_5, 8, "R200_PP_TXCTLALL_5"},
+	{R200_VAP_PVS_CNTL_1, 2, "R200_VAP_PVS_CNTL"},
+};
+
 /* =============================================================
  * State initialization
  */
-
-void radeonPrintDirty( radeonContextPtr rmesa, const char *msg )
+static int cmdpkt( r100ContextPtr rmesa, int id ) 
 {
-   struct radeon_state_atom *l;
-
-   fprintf(stderr, msg);
-   fprintf(stderr, ": ");
+   drm_radeon_cmd_header_t h;
 
-   foreach(l, &rmesa->hw.atomlist) {
-      if (l->dirty || rmesa->hw.all_dirty)
-	 fprintf(stderr, "%s, ", l->name);
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+     return CP_PACKET0(packet[id].start, packet[id].len - 1);
+   } else {
+     h.i = 0;
+     h.packet.cmd_type = RADEON_CMD_PACKET;
+     h.packet.packet_id = id;
    }
-
-   fprintf(stderr, "\n");
-}
-
-static int cmdpkt( int id ) 
-{
-   drm_radeon_cmd_header_t h;
-   h.i = 0;
-   h.packet.cmd_type = RADEON_CMD_PACKET;
-   h.packet.packet_id = id;
    return h.i;
 }
 
@@ -96,131 +198,523 @@ static int cmdscl( int offset, int stride, int count )
    return h.i;
 }
 
-#define CHECK( NM, FLAG )			\
-static GLboolean check_##NM( GLcontext *ctx )	\
-{						\
-   return FLAG;					\
+#define CHECK( NM, FLAG, ADD )				\
+static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom )	\
+{							\
+   return FLAG ? atom->cmd_size + (ADD) : 0;			\
 }
 
-#define TCL_CHECK( NM, FLAG )				\
-static GLboolean check_##NM( GLcontext *ctx )		\
+#define TCL_CHECK( NM, FLAG, ADD )				\
+static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom )	\
 {							\
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);	\
-   return !rmesa->TclFallback && (FLAG);		\
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);	\
+   return (!rmesa->radeon.TclFallback && (FLAG)) ? atom->cmd_size + (ADD) : 0;	\
 }
 
 
-CHECK( always, GL_TRUE )
-CHECK( never, GL_FALSE )
-CHECK( tex0, ctx->Texture.Unit[0]._ReallyEnabled )
-CHECK( tex1, ctx->Texture.Unit[1]._ReallyEnabled )
+CHECK( always, GL_TRUE, 0 )
+CHECK( always_add2, GL_TRUE, 2 )
+CHECK( always_add4, GL_TRUE, 4 )
+CHECK( never, GL_FALSE, 0 )
+CHECK( tex0_mm, ctx->Texture.Unit[0]._ReallyEnabled, 3 )
+CHECK( tex1_mm, ctx->Texture.Unit[1]._ReallyEnabled, 3 )
 /* need this for the cubic_map on disabled unit 2 bug, maybe r100 only? */
-CHECK( tex2, ctx->Texture._EnabledUnits )
-CHECK( cube0, (ctx->Texture.Unit[0]._ReallyEnabled & TEXTURE_CUBE_BIT))
-CHECK( cube1, (ctx->Texture.Unit[1]._ReallyEnabled & TEXTURE_CUBE_BIT))
-CHECK( cube2, (ctx->Texture.Unit[2]._ReallyEnabled & TEXTURE_CUBE_BIT))
-CHECK( fog, ctx->Fog.Enabled )
-TCL_CHECK( tcl, GL_TRUE )
-TCL_CHECK( tcl_tex0, ctx->Texture.Unit[0]._ReallyEnabled )
-TCL_CHECK( tcl_tex1, ctx->Texture.Unit[1]._ReallyEnabled )
-TCL_CHECK( tcl_tex2, ctx->Texture.Unit[2]._ReallyEnabled )
-TCL_CHECK( tcl_lighting, ctx->Light.Enabled )
-TCL_CHECK( tcl_eyespace_or_lighting, ctx->_NeedEyeCoords || ctx->Light.Enabled )
-TCL_CHECK( tcl_lit0, ctx->Light.Enabled && ctx->Light.Light[0].Enabled )
-TCL_CHECK( tcl_lit1, ctx->Light.Enabled && ctx->Light.Light[1].Enabled )
-TCL_CHECK( tcl_lit2, ctx->Light.Enabled && ctx->Light.Light[2].Enabled )
-TCL_CHECK( tcl_lit3, ctx->Light.Enabled && ctx->Light.Light[3].Enabled )
-TCL_CHECK( tcl_lit4, ctx->Light.Enabled && ctx->Light.Light[4].Enabled )
-TCL_CHECK( tcl_lit5, ctx->Light.Enabled && ctx->Light.Light[5].Enabled )
-TCL_CHECK( tcl_lit6, ctx->Light.Enabled && ctx->Light.Light[6].Enabled )
-TCL_CHECK( tcl_lit7, ctx->Light.Enabled && ctx->Light.Light[7].Enabled )
-TCL_CHECK( tcl_ucp0, (ctx->Transform.ClipPlanesEnabled & 0x1) )
-TCL_CHECK( tcl_ucp1, (ctx->Transform.ClipPlanesEnabled & 0x2) )
-TCL_CHECK( tcl_ucp2, (ctx->Transform.ClipPlanesEnabled & 0x4) )
-TCL_CHECK( tcl_ucp3, (ctx->Transform.ClipPlanesEnabled & 0x8) )
-TCL_CHECK( tcl_ucp4, (ctx->Transform.ClipPlanesEnabled & 0x10) )
-TCL_CHECK( tcl_ucp5, (ctx->Transform.ClipPlanesEnabled & 0x20) )
-TCL_CHECK( tcl_eyespace_or_fog, ctx->_NeedEyeCoords || ctx->Fog.Enabled ) 
-
-CHECK( txr0, (ctx->Texture.Unit[0]._ReallyEnabled & TEXTURE_RECT_BIT))
-CHECK( txr1, (ctx->Texture.Unit[1]._ReallyEnabled & TEXTURE_RECT_BIT))
-CHECK( txr2, (ctx->Texture.Unit[2]._ReallyEnabled & TEXTURE_RECT_BIT))
+CHECK( tex2_mm, ctx->Texture._EnabledUnits, 3 )
+CHECK( tex0, ctx->Texture.Unit[0]._ReallyEnabled, 2 )
+CHECK( tex1, ctx->Texture.Unit[1]._ReallyEnabled, 2 )
+CHECK( tex2, ctx->Texture._EnabledUnits, 2 )
+CHECK( cube0, (ctx->Texture.Unit[0]._ReallyEnabled & TEXTURE_CUBE_BIT), 3 + 3*5 - CUBE_STATE_SIZE )
+CHECK( cube1, (ctx->Texture.Unit[1]._ReallyEnabled & TEXTURE_CUBE_BIT), 3 + 3*5 - CUBE_STATE_SIZE )
+CHECK( cube2, (ctx->Texture.Unit[2]._ReallyEnabled & TEXTURE_CUBE_BIT), 3 + 3*5 - CUBE_STATE_SIZE )
+CHECK( cube0_mm, (ctx->Texture.Unit[0]._ReallyEnabled & TEXTURE_CUBE_BIT), 2 + 4*5 - CUBE_STATE_SIZE )
+CHECK( cube1_mm, (ctx->Texture.Unit[1]._ReallyEnabled & TEXTURE_CUBE_BIT), 2 + 4*5 - CUBE_STATE_SIZE )
+CHECK( cube2_mm, (ctx->Texture.Unit[2]._ReallyEnabled & TEXTURE_CUBE_BIT), 2 + 4*5 - CUBE_STATE_SIZE )
+CHECK( fog, ctx->Fog.Enabled, 0 )
+CHECK( fog_add4, ctx->Fog.Enabled, 4 )
+TCL_CHECK( tcl, GL_TRUE, 0 )
+TCL_CHECK( tcl_add4, GL_TRUE, 4 )
+TCL_CHECK( tcl_tex0, ctx->Texture.Unit[0]._ReallyEnabled, 0 )
+TCL_CHECK( tcl_tex1, ctx->Texture.Unit[1]._ReallyEnabled, 0 )
+TCL_CHECK( tcl_tex2, ctx->Texture.Unit[2]._ReallyEnabled, 0 )
+TCL_CHECK( tcl_tex0_add4, ctx->Texture.Unit[0]._ReallyEnabled, 4 )
+TCL_CHECK( tcl_tex1_add4, ctx->Texture.Unit[1]._ReallyEnabled, 4 )
+TCL_CHECK( tcl_tex2_add4, ctx->Texture.Unit[2]._ReallyEnabled, 4 )
+TCL_CHECK( tcl_lighting, ctx->Light.Enabled, 0 )
+TCL_CHECK( tcl_lighting_add4, ctx->Light.Enabled, 4 )
+TCL_CHECK( tcl_eyespace_or_lighting, ctx->_NeedEyeCoords || ctx->Light.Enabled, 0 )
+TCL_CHECK( tcl_eyespace_or_lighting_add4, ctx->_NeedEyeCoords || ctx->Light.Enabled, 4 )
+TCL_CHECK( tcl_lit0, ctx->Light.Enabled && ctx->Light.Light[0].Enabled, 0 )
+TCL_CHECK( tcl_lit1, ctx->Light.Enabled && ctx->Light.Light[1].Enabled, 0 )
+TCL_CHECK( tcl_lit2, ctx->Light.Enabled && ctx->Light.Light[2].Enabled, 0 )
+TCL_CHECK( tcl_lit3, ctx->Light.Enabled && ctx->Light.Light[3].Enabled, 0 )
+TCL_CHECK( tcl_lit4, ctx->Light.Enabled && ctx->Light.Light[4].Enabled, 0 )
+TCL_CHECK( tcl_lit5, ctx->Light.Enabled && ctx->Light.Light[5].Enabled, 0 )
+TCL_CHECK( tcl_lit6, ctx->Light.Enabled && ctx->Light.Light[6].Enabled, 0 )
+TCL_CHECK( tcl_lit7, ctx->Light.Enabled && ctx->Light.Light[7].Enabled, 0 )
+TCL_CHECK( tcl_lit0_add6, ctx->Light.Enabled && ctx->Light.Light[0].Enabled, 6 )
+TCL_CHECK( tcl_lit1_add6, ctx->Light.Enabled && ctx->Light.Light[1].Enabled, 6 )
+TCL_CHECK( tcl_lit2_add6, ctx->Light.Enabled && ctx->Light.Light[2].Enabled, 6 )
+TCL_CHECK( tcl_lit3_add6, ctx->Light.Enabled && ctx->Light.Light[3].Enabled, 6 )
+TCL_CHECK( tcl_lit4_add6, ctx->Light.Enabled && ctx->Light.Light[4].Enabled, 6 )
+TCL_CHECK( tcl_lit5_add6, ctx->Light.Enabled && ctx->Light.Light[5].Enabled, 6 )
+TCL_CHECK( tcl_lit6_add6, ctx->Light.Enabled && ctx->Light.Light[6].Enabled, 6 )
+TCL_CHECK( tcl_lit7_add6, ctx->Light.Enabled && ctx->Light.Light[7].Enabled, 6 )
+TCL_CHECK( tcl_ucp0, (ctx->Transform.ClipPlanesEnabled & 0x1), 0 )
+TCL_CHECK( tcl_ucp1, (ctx->Transform.ClipPlanesEnabled & 0x2), 0 )
+TCL_CHECK( tcl_ucp2, (ctx->Transform.ClipPlanesEnabled & 0x4), 0 )
+TCL_CHECK( tcl_ucp3, (ctx->Transform.ClipPlanesEnabled & 0x8), 0 )
+TCL_CHECK( tcl_ucp4, (ctx->Transform.ClipPlanesEnabled & 0x10), 0 )
+TCL_CHECK( tcl_ucp5, (ctx->Transform.ClipPlanesEnabled & 0x20), 0 )
+TCL_CHECK( tcl_ucp0_add4, (ctx->Transform.ClipPlanesEnabled & 0x1), 4 )
+TCL_CHECK( tcl_ucp1_add4, (ctx->Transform.ClipPlanesEnabled & 0x2), 4 )
+TCL_CHECK( tcl_ucp2_add4, (ctx->Transform.ClipPlanesEnabled & 0x4), 4 )
+TCL_CHECK( tcl_ucp3_add4, (ctx->Transform.ClipPlanesEnabled & 0x8), 4 )
+TCL_CHECK( tcl_ucp4_add4, (ctx->Transform.ClipPlanesEnabled & 0x10), 4 )
+TCL_CHECK( tcl_ucp5_add4, (ctx->Transform.ClipPlanesEnabled & 0x20), 4 )
+TCL_CHECK( tcl_eyespace_or_fog, ctx->_NeedEyeCoords || ctx->Fog.Enabled, 0 )
+TCL_CHECK( tcl_eyespace_or_fog_add4, ctx->_NeedEyeCoords || ctx->Fog.Enabled, 4 )
+
+CHECK( txr0, (ctx->Texture.Unit[0]._ReallyEnabled & TEXTURE_RECT_BIT), 0 )
+CHECK( txr1, (ctx->Texture.Unit[1]._ReallyEnabled & TEXTURE_RECT_BIT), 0 )
+CHECK( txr2, (ctx->Texture.Unit[2]._ReallyEnabled & TEXTURE_RECT_BIT), 0 )
+
+#define OUT_VEC(hdr, data) do {			\
+    drm_radeon_cmd_header_t h;					\
+    h.i = hdr;								\
+    OUT_BATCH(CP_PACKET0(RADEON_SE_TCL_STATE_FLUSH, 0));		\
+    OUT_BATCH(0);							\
+    OUT_BATCH(CP_PACKET0(R200_SE_TCL_VECTOR_INDX_REG, 0));		\
+    OUT_BATCH(h.vectors.offset | (h.vectors.stride << RADEON_VEC_INDX_OCTWORD_STRIDE_SHIFT)); \
+    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_VECTOR_DATA_REG, h.vectors.count - 1));	\
+    OUT_BATCH_TABLE((data), h.vectors.count);				\
+  } while(0)
+
+#define OUT_SCL(hdr, data) do {					\
+    drm_radeon_cmd_header_t h;						\
+    h.i = hdr;								\
+    OUT_BATCH(CP_PACKET0(R200_SE_TCL_SCALAR_INDX_REG, 0));		\
+    OUT_BATCH((h.scalars.offset) | (h.scalars.stride << RADEON_SCAL_INDX_DWORD_STRIDE_SHIFT)); \
+    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_SCALAR_DATA_REG, h.scalars.count - 1));	\
+    OUT_BATCH_TABLE((data), h.scalars.count);				\
+  } while(0)
+
+static void scl_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   uint32_t dwords = atom->check(ctx, atom);
+   
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_SCL(atom->cmd[0], atom->cmd+1);
+   END_BATCH();
+}
 
 
+static void vec_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   uint32_t dwords = atom->check(ctx, atom);
 
-/* Initialize the context's hardware state.
- */
-void radeonInitState( radeonContextPtr rmesa )
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_VEC(atom->cmd[0], atom->cmd+1);
+   END_BATCH();
+}
+
+
+static void lit_emit(GLcontext *ctx, struct radeon_state_atom *atom)
 {
-   GLcontext *ctx = rmesa->glCtx;
-   GLuint color_fmt, depth_fmt, i;
-   GLint drawPitch, drawOffset;
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   uint32_t dwords = atom->check(ctx, atom);
+
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_VEC(atom->cmd[LIT_CMD_0], atom->cmd+1);
+   OUT_SCL(atom->cmd[LIT_CMD_1], atom->cmd+LIT_CMD_1+1);
+   END_BATCH();
+}
 
-   switch ( rmesa->radeonScreen->cpp ) {
-   case 2:
-      color_fmt = RADEON_COLOR_FORMAT_RGB565;
-      break;
-   case 4:
-      color_fmt = RADEON_COLOR_FORMAT_ARGB8888;
-      break;
-   default:
-      fprintf( stderr, "Error: Unsupported pixel depth... exiting\n" );
-      exit( -1 );
+static void ctx_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   struct radeon_renderbuffer *rrb;
+   uint32_t cbpitch;
+   uint32_t zbpitch, depth_fmt;
+   uint32_t dwords = atom->check(ctx, atom);
+
+   /* output the first 5 dwords of the context atom */
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_BATCH_TABLE(atom->cmd, 5);
+
+   rrb = radeon_get_depthbuffer(&r100->radeon);
+   if (!rrb) {
+     OUT_BATCH(0);
+     OUT_BATCH(0);
+   } else {
+     zbpitch = (rrb->pitch / rrb->cpp);
+     if (r100->using_hyperz)
+       zbpitch |= RADEON_DEPTH_HYPERZ;
+
+     OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+     OUT_BATCH(zbpitch);
+     if (rrb->cpp == 4)
+        depth_fmt = RADEON_DEPTH_FORMAT_24BIT_INT_Z;
+     else
+        depth_fmt = RADEON_DEPTH_FORMAT_16BIT_INT_Z;
+     atom->cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_DEPTH_FORMAT_MASK;
+     atom->cmd[CTX_RB3D_ZSTENCILCNTL] |= depth_fmt;
+   }
+     
+   OUT_BATCH(atom->cmd[CTX_RB3D_ZSTENCILCNTL]);
+   OUT_BATCH(atom->cmd[CTX_CMD_1]);
+   OUT_BATCH(atom->cmd[CTX_PP_CNTL]);
+
+   rrb = radeon_get_colorbuffer(&r100->radeon);
+   if (!rrb || !rrb->bo) {
+      OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]);
+      OUT_BATCH(atom->cmd[CTX_RB3D_COLOROFFSET]);
+   } else {
+      atom->cmd[CTX_RB3D_CNTL] &= ~(0xf << 10);
+      if (rrb->cpp == 4)
+         atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888;
+      else
+         atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565;
+
+      OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]);
+      OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
    }
 
-   rmesa->state.color.clear = 0x00000000;
+   OUT_BATCH(atom->cmd[CTX_CMD_2]);
+
+   if (!rrb || !rrb->bo) {
+     OUT_BATCH(atom->cmd[CTX_RB3D_COLORPITCH]);
+   } else {
+     cbpitch = (rrb->pitch / rrb->cpp);
+     if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
+       cbpitch |= RADEON_COLOR_TILE_ENABLE;
+     OUT_BATCH(cbpitch);
+   }
+
+   END_BATCH();
+}
+
+static int check_always_ctx( GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   struct radeon_renderbuffer *rrb, *drb;
+   uint32_t dwords;
+
+   rrb = radeon_get_colorbuffer(&r100->radeon);
+   if (!rrb || !rrb->bo) {
+      return 0;
+   }
+
+   drb = radeon_get_depthbuffer(&r100->radeon);
+
+   dwords = 10;
+   if (drb)
+     dwords += 6;
+   if (rrb)
+     dwords += 8;
+
+   return dwords;
+}
+
+static void ctx_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   struct radeon_renderbuffer *rrb, *drb;
+   uint32_t cbpitch = 0;
+   uint32_t zbpitch = 0;
+   uint32_t dwords = atom->check(ctx, atom);
+   uint32_t depth_fmt;
+
+   rrb = radeon_get_colorbuffer(&r100->radeon);
+   if (!rrb || !rrb->bo) {
+      fprintf(stderr, "no rrb\n");
+      return;
+   }
+
+   atom->cmd[CTX_RB3D_CNTL] &= ~(0xf << 10);
+   if (rrb->cpp == 4)
+	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888;
+   else switch (rrb->base._ActualFormat) {
+   case GL_RGB5:
+	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565;
+	break;
+   case GL_RGBA4:
+	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB4444;
+	break;
+   case GL_RGB5_A1:
+	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB1555;
+	break;
+   }
+
+   cbpitch = (rrb->pitch / rrb->cpp);
+   if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
+       cbpitch |= R200_COLOR_TILE_ENABLE;
+
+   drb = radeon_get_depthbuffer(&r100->radeon);
+   if (drb) {
+     zbpitch = (drb->pitch / drb->cpp);
+     if (drb->cpp == 4)
+        depth_fmt = RADEON_DEPTH_FORMAT_24BIT_INT_Z;
+     else
+        depth_fmt = RADEON_DEPTH_FORMAT_16BIT_INT_Z;
+     atom->cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_DEPTH_FORMAT_MASK;
+     atom->cmd[CTX_RB3D_ZSTENCILCNTL] |= depth_fmt;
+     
+   }
+
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+
+   /* In the CS case we need to split this up */
+   OUT_BATCH(CP_PACKET0(packet[0].start, 3));
+   OUT_BATCH_TABLE((atom->cmd + 1), 4);
+
+   if (drb) {
+     OUT_BATCH(CP_PACKET0(RADEON_RB3D_DEPTHOFFSET, 0));
+     OUT_BATCH_RELOC(0, drb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+
+     OUT_BATCH(CP_PACKET0(RADEON_RB3D_DEPTHPITCH, 0));
+     OUT_BATCH(zbpitch);
+   }
+
+   OUT_BATCH(CP_PACKET0(RADEON_RB3D_ZSTENCILCNTL, 0));
+   OUT_BATCH(atom->cmd[CTX_RB3D_ZSTENCILCNTL]);
+   OUT_BATCH(CP_PACKET0(RADEON_PP_CNTL, 1));
+   OUT_BATCH(atom->cmd[CTX_PP_CNTL]);
+   OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]);
+
+   if (rrb) {
+     OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLOROFFSET, 0));
+     OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+
+     OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLORPITCH, 0));
+     OUT_BATCH_RELOC(cbpitch, rrb->bo, cbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+   }
+
+   // if (atom->cmd_size == CTX_STATE_SIZE_NEWDRM) {
+   //   OUT_BATCH_TABLE((atom->cmd + 14), 4);
+   // }
+
+   END_BATCH();
+   BEGIN_BATCH_NO_AUTOSTATE(4);
+   OUT_BATCH(CP_PACKET0(RADEON_RE_TOP_LEFT, 0));
+   OUT_BATCH(0);
+   OUT_BATCH(CP_PACKET0(RADEON_RE_WIDTH_HEIGHT, 0));
+   if (rrb) {
+       OUT_BATCH(((rrb->base.Width - 1) << RADEON_RE_WIDTH_SHIFT) |
+                 ((rrb->base.Height - 1) << RADEON_RE_HEIGHT_SHIFT));
+   } else {
+       OUT_BATCH(0);
+   }
+   END_BATCH();
+}
+
+static void cube_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{  /* Emit the cube-map atom of texture unit atom->idx: 3 table dwords followed by 5 face-offset relocations. */
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   uint32_t dwords = atom->check(ctx, atom);   /* batch size is whatever the atom's check callback reports */
+   int i = atom->idx, j;                       /* i: texture unit of this atom, j: face counter */
+   radeonTexObj *t = r100->state.texture.unit[i].texobj;
+   radeon_mipmap_level *lvl;
+
+   if (!(ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_CUBE_BIT))   /* unit has no cube map enabled: emit nothing */
+	return;
+
+   if (!t)        /* no texture object recorded for this unit */
+	return;
+
+   if (!t->mt)    /* no mipmap tree, so there is no bo/offset to relocate against */
+	return;
+
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_BATCH_TABLE(atom->cmd, 3);   /* first three dwords of the atom's command table, copied as-is */
+   lvl = &t->mt->levels[0];         /* face offsets are taken from mip level 0 */
+   for (j = 0; j < 5; j++) {        /* faces[0..4] only; faces[5] is relocated by tex_emit in the cube case */
+	OUT_BATCH_RELOC(lvl->faces[j].offset, t->mt->bo, lvl->faces[j].offset,
+			RADEON_GEM_DOMAIN_VRAM, 0, 0);   /* NOTE(review): VRAM only here, cube_emit_cs passes GTT|VRAM - confirm intended */
+   }
+   END_BATCH();
+}
+
+static void cube_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
+{  /* CS variant of cube_emit: each of the 5 face offsets gets its own CP_PACKET0 header plus relocation. */
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   uint32_t dwords = atom->check(ctx, atom);   /* batch size is whatever the atom's check callback reports */
+   int i = atom->idx, j;                       /* i: texture unit of this atom, j: face counter */
+   radeonTexObj *t = r100->state.texture.unit[i].texobj;
+   radeon_mipmap_level *lvl;
+   uint32_t base_reg;                          /* first cubic-offset register of unit i, chosen below */
+
+   if (!(ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_CUBE_BIT))   /* unit has no cube map enabled: emit nothing */
+	return;
+
+   if (!t)        /* no texture object recorded for this unit */
+	return;
+
+   if (!t->mt)    /* no mipmap tree, so there is no bo/offset to relocate against */
+	return;
+
+   switch(i) {    /* map the unit index to its register block */
+	case 1: base_reg = RADEON_PP_CUBIC_OFFSET_T1_0; break;
+	case 2: base_reg = RADEON_PP_CUBIC_OFFSET_T2_0; break;
+	default:   /* any other index shares the unit-0 register via fall-through */
+	case 0: base_reg = RADEON_PP_CUBIC_OFFSET_T0_0; break;
+   };
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_BATCH_TABLE(atom->cmd, 2);   /* first two dwords of the atom's command table, copied as-is */
+   lvl = &t->mt->levels[0];         /* face offsets are taken from mip level 0 */
+   for (j = 0; j < 5; j++) {        /* faces[0..4] only; faces[5] is relocated by tex_emit_cs in the cube case */
+	OUT_BATCH(CP_PACKET0(base_reg + (4 * j), 0));   /* header for the j-th offset register, 4 bytes past the previous one */
+	OUT_BATCH_RELOC(lvl->faces[j].offset, t->mt->bo, lvl->faces[j].offset,
+			RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+   }
+   END_BATCH();
+}
+
+static void tex_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{  /* Emit the texture atom of unit atom->idx; the texture-offset dword is a relocation, a screen default or an override. */
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   uint32_t dwords = atom->cmd_size;   /* start from the atom's nominal size */
+   int i = atom->idx;                  /* texture unit handled by this atom */
+   radeonTexObj *t = r100->state.texture.unit[i].texobj;
+   radeon_mipmap_level *lvl;
+
+   if (t && t->mt && !t->image_override)   /* same condition as the relocation branch below */
+     dwords += 2;                          /* reserve two extra dwords when a relocation will be emitted */
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+
+   OUT_BATCH_TABLE(atom->cmd, 3);          /* cmd[0..2] copied as-is */
+   if (t && t->mt && !t->image_override) {
+     if ((ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_CUBE_BIT)) {
+   	lvl = &t->mt->levels[0];           /* cube map: the offset comes from faces[5] of mip level 0 */
+	OUT_BATCH_RELOC(lvl->faces[5].offset, t->mt->bo, lvl->faces[5].offset,
+			RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+     } else {
+        OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,   /* non-cube: tile_bits as data, offset 0 into the tree's bo */
+		     RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+     }
+   } else if (!t) {
+     /* workaround for old CS mechanism */
+     OUT_BATCH(r100->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP]);   /* no texture object: use the screen's local-heap offset */
+     //     OUT_BATCH(r100->radeon.radeonScreen);
+   } else {
+     OUT_BATCH(t->override_offset);        /* t exists but has no mipmap tree or image_override is set */
+   }
+
+   OUT_BATCH_TABLE((atom->cmd+4), 5);      /* five dwords from cmd[4] on; cmd[3] is replaced by the dword emitted above */
+   END_BATCH();
+}
+
+static void tex_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
+{  /* CS variant of tex_emit: builds its own CP_PACKET0 headers and emits the offset packet only when a texture is backed. */
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   uint32_t dwords = atom->cmd_size;   /* start from the atom's nominal size, adjusted below */
+   int i = atom->idx;                  /* texture unit handled by this atom */
+   radeonTexObj *t = r100->state.texture.unit[i].texobj;
+   radeon_mipmap_level *lvl;
+   int hastexture = 1;                 /* cleared when there is no texture object or it has neither mt nor bo */
+
+   if (!t)
+	hastexture = 0;
+   else {
+	if (!t->mt && !t->bo)          /* texture object without a mipmap tree and without a bo */
+		hastexture = 0;
+   }
+   dwords += 1;                        /* net size adjustment for the packet layout emitted below (TODO confirm against TEX_STATE_SIZE) */
+   if (hastexture)
+     dwords += 2;                      /* extra room when the offset packet carries a relocation */
+   else
+     dwords -= 2;                      /* offset header and payload are both skipped */
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+
+   OUT_BATCH(CP_PACKET0(RADEON_PP_TXFILTER_0 + (24 * i), 1));   /* unit i's filter register sits 24*i past unit 0's */
+   OUT_BATCH_TABLE((atom->cmd + 1), 2);                         /* cmd[1..2] as payload */
+
+   if (hastexture) {
+     OUT_BATCH(CP_PACKET0(RADEON_PP_TXOFFSET_0 + (24 * i), 0)); /* header for the texture-offset register of unit i */
+     if (t->mt && !t->image_override) {
+        if ((ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_CUBE_BIT)) {
+            lvl = &t->mt->levels[0];   /* cube map: the offset comes from faces[5] of mip level 0 */
+	    OUT_BATCH_RELOC(lvl->faces[5].offset, t->mt->bo, lvl->faces[5].offset,
+			RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+        } else {
+           OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,   /* non-cube: tile_bits as data, offset 0 into the tree's bo */
+		     RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+        }
+      } else {
+	if (t->bo)   /* NOTE(review): if t->mt is set, image_override is set and t->bo is NULL, the header above gets no payload - confirm unreachable */
+            OUT_BATCH_RELOC(t->tile_bits, t->bo, 0,
+                            RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+      }
+   }
+
+   OUT_BATCH(CP_PACKET0(RADEON_PP_TXCBLEND_0 + (i * 24), 1));   /* blend registers use the same 24*i per-unit spacing */
+   OUT_BATCH_TABLE((atom->cmd+4), 2);                           /* cmd[4..5] as payload */
+   OUT_BATCH(CP_PACKET0(RADEON_PP_BORDER_COLOR_0 + (i * 4), 0)); /* border-colour registers are spaced 4 apart per unit */
+   OUT_BATCH((atom->cmd[TEX_PP_BORDER_COLOR]));
+   END_BATCH();
+}
+
+/* Initialize the context's hardware state.
+ */
+void radeonInitState( r100ContextPtr rmesa )
+{
+   GLcontext *ctx = rmesa->radeon.glCtx;
+   GLuint i;
+
+   rmesa->radeon.state.color.clear = 0x00000000;
 
    switch ( ctx->Visual.depthBits ) {
    case 16:
-      rmesa->state.depth.clear = 0x0000ffff;
-      rmesa->state.depth.scale = 1.0 / (GLfloat)0xffff;
-      depth_fmt = RADEON_DEPTH_FORMAT_16BIT_INT_Z;
-      rmesa->state.stencil.clear = 0x00000000;
+      rmesa->radeon.state.depth.clear = 0x0000ffff;
+      rmesa->radeon.state.stencil.clear = 0x00000000;
       break;
    case 24:
-      rmesa->state.depth.clear = 0x00ffffff;
-      rmesa->state.depth.scale = 1.0 / (GLfloat)0xffffff;
-      depth_fmt = RADEON_DEPTH_FORMAT_24BIT_INT_Z;
-      rmesa->state.stencil.clear = 0xffff0000;
+      rmesa->radeon.state.depth.clear = 0x00ffffff;
+      rmesa->radeon.state.stencil.clear = 0xffff0000;
       break;
    default:
-      fprintf( stderr, "Error: Unsupported depth %d... exiting\n",
-	       ctx->Visual.depthBits );
-      exit( -1 );
+      break;
    }
 
-   /* Only have hw stencil when depth buffer is 24 bits deep */
-   rmesa->state.stencil.hwBuffer = ( ctx->Visual.stencilBits > 0 &&
-				     ctx->Visual.depthBits == 24 );
+   rmesa->radeon.Fallback = 0;
 
-   rmesa->Fallback = 0;
 
-   if ( ctx->Visual.doubleBufferMode && rmesa->sarea->pfCurrentPage == 0 ) {
-      drawOffset = rmesa->radeonScreen->backOffset;
-      drawPitch  = rmesa->radeonScreen->backPitch;
-   } else {
-      drawOffset = rmesa->radeonScreen->frontOffset;
-      drawPitch  = rmesa->radeonScreen->frontPitch;
-   }
+   rmesa->radeon.hw.max_state_size = 0;
 
-   rmesa->hw.max_state_size = 0;
-
-#define ALLOC_STATE( ATOM, CHK, SZ, NM, FLAG )				\
+#define ALLOC_STATE_IDX( ATOM, CHK, SZ, NM, FLAG, IDX )		\
    do {								\
       rmesa->hw.ATOM.cmd_size = SZ;				\
-      rmesa->hw.ATOM.cmd = (int *)CALLOC(SZ * sizeof(int));	\
-      rmesa->hw.ATOM.lastcmd = (int *)CALLOC(SZ * sizeof(int));	\
-      rmesa->hw.ATOM.name = NM;					\
+      rmesa->hw.ATOM.cmd = (GLuint *)CALLOC(SZ * sizeof(int));	\
+      rmesa->hw.ATOM.lastcmd = (GLuint *)CALLOC(SZ * sizeof(int)); \
+      rmesa->hw.ATOM.name = NM;						\
       rmesa->hw.ATOM.is_tcl = FLAG;					\
       rmesa->hw.ATOM.check = check_##CHK;				\
-      rmesa->hw.ATOM.dirty = GL_TRUE;				\
-      rmesa->hw.max_state_size += SZ * sizeof(int);		\
+      rmesa->hw.ATOM.dirty = GL_TRUE;					\
+      rmesa->hw.ATOM.idx = IDX;					\
+      rmesa->radeon.hw.max_state_size += SZ * sizeof(int);		\
    } while (0)
-      
-      
+
+#define ALLOC_STATE( ATOM, CHK, SZ, NM, FLAG )		\
+   ALLOC_STATE_IDX(ATOM, CHK, SZ, NM, FLAG, 0)
+
    /* Allocate state buffers:
     */
-   ALLOC_STATE( ctx, always, CTX_STATE_SIZE, "CTX/context", 0 );
+   ALLOC_STATE( ctx, always_add4, CTX_STATE_SIZE, "CTX/context", 0 );
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+     rmesa->hw.ctx.emit = ctx_emit_cs;
+     rmesa->hw.ctx.check = check_always_ctx;
+   } else
+     rmesa->hw.ctx.emit = ctx_emit;
    ALLOC_STATE( lin, always, LIN_STATE_SIZE, "LIN/line", 0 );
    ALLOC_STATE( msk, always, MSK_STATE_SIZE, "MSK/mask", 0 );
    ALLOC_STATE( vpt, always, VPT_STATE_SIZE, "VPT/viewport", 0 );
@@ -229,82 +723,133 @@ void radeonInitState( radeonContextPtr rmesa )
    ALLOC_STATE( zbs, always, ZBS_STATE_SIZE, "ZBS/zbias", 0 );
    ALLOC_STATE( tcl, always, TCL_STATE_SIZE, "TCL/tcl", 1 );
    ALLOC_STATE( mtl, tcl_lighting, MTL_STATE_SIZE, "MTL/material", 1 );
-   ALLOC_STATE( grd, always, GRD_STATE_SIZE, "GRD/guard-band", 1 );
-   ALLOC_STATE( fog, fog, FOG_STATE_SIZE, "FOG/fog", 1 );
-   ALLOC_STATE( glt, tcl_lighting, GLT_STATE_SIZE, "GLT/light-global", 1 );
-   ALLOC_STATE( eye, tcl_lighting, EYE_STATE_SIZE, "EYE/eye-vector", 1 );
-   ALLOC_STATE( tex[0], tex0, TEX_STATE_SIZE, "TEX/tex-0", 0 );
-   ALLOC_STATE( tex[1], tex1, TEX_STATE_SIZE, "TEX/tex-1", 0 );
-   ALLOC_STATE( tex[2], tex2, TEX_STATE_SIZE, "TEX/tex-2", 0 );
-   if (rmesa->radeonScreen->drmSupportsCubeMapsR100)
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+      ALLOC_STATE( grd, always_add2, GRD_STATE_SIZE, "GRD/guard-band", 1 );
+      ALLOC_STATE( fog, fog_add4, FOG_STATE_SIZE, "FOG/fog", 1 );
+      ALLOC_STATE( glt, tcl_lighting_add4, GLT_STATE_SIZE, "GLT/light-global", 1 );
+      ALLOC_STATE( eye, tcl_lighting_add4, EYE_STATE_SIZE, "EYE/eye-vector", 1 );
+      ALLOC_STATE_IDX( tex[0], tex0_mm, TEX_STATE_SIZE, "TEX/tex-0", 0, 0);
+      ALLOC_STATE_IDX( tex[1], tex1_mm, TEX_STATE_SIZE, "TEX/tex-1", 0, 1);
+      ALLOC_STATE_IDX( tex[2], tex2_mm, TEX_STATE_SIZE, "TEX/tex-2", 0, 2);
+      ALLOC_STATE( mat[0], tcl_add4, MAT_STATE_SIZE, "MAT/modelproject", 1 );
+      ALLOC_STATE( mat[1], tcl_eyespace_or_fog_add4, MAT_STATE_SIZE, "MAT/modelview", 1 );
+      ALLOC_STATE( mat[2], tcl_eyespace_or_lighting_add4, MAT_STATE_SIZE, "MAT/it-modelview", 1 );
+      ALLOC_STATE( mat[3], tcl_tex0_add4, MAT_STATE_SIZE, "MAT/texmat0", 1 );
+      ALLOC_STATE( mat[4], tcl_tex1_add4, MAT_STATE_SIZE, "MAT/texmat1", 1 );
+      ALLOC_STATE( mat[5], tcl_tex2_add4, MAT_STATE_SIZE, "MAT/texmat2", 1 );
+      ALLOC_STATE( lit[0], tcl_lit0_add6, LIT_STATE_SIZE, "LIT/light-0", 1 );
+      ALLOC_STATE( lit[1], tcl_lit1_add6, LIT_STATE_SIZE, "LIT/light-1", 1 );
+      ALLOC_STATE( lit[2], tcl_lit2_add6, LIT_STATE_SIZE, "LIT/light-2", 1 );
+      ALLOC_STATE( lit[3], tcl_lit3_add6, LIT_STATE_SIZE, "LIT/light-3", 1 );
+      ALLOC_STATE( lit[4], tcl_lit4_add6, LIT_STATE_SIZE, "LIT/light-4", 1 );
+      ALLOC_STATE( lit[5], tcl_lit5_add6, LIT_STATE_SIZE, "LIT/light-5", 1 );
+      ALLOC_STATE( lit[6], tcl_lit6_add6, LIT_STATE_SIZE, "LIT/light-6", 1 );
+      ALLOC_STATE( lit[7], tcl_lit7_add6, LIT_STATE_SIZE, "LIT/light-7", 1 );
+      ALLOC_STATE( ucp[0], tcl_ucp0_add4, UCP_STATE_SIZE, "UCP/userclip-0", 1 );
+      ALLOC_STATE( ucp[1], tcl_ucp1_add4, UCP_STATE_SIZE, "UCP/userclip-1", 1 );
+      ALLOC_STATE( ucp[2], tcl_ucp2_add4, UCP_STATE_SIZE, "UCP/userclip-2", 1 );
+      ALLOC_STATE( ucp[3], tcl_ucp3_add4, UCP_STATE_SIZE, "UCP/userclip-3", 1 );
+      ALLOC_STATE( ucp[4], tcl_ucp4_add4, UCP_STATE_SIZE, "UCP/userclip-4", 1 );
+      ALLOC_STATE( ucp[5], tcl_ucp5_add4, UCP_STATE_SIZE, "UCP/userclip-5", 1 );
+   } else {
+      ALLOC_STATE( grd, always, GRD_STATE_SIZE, "GRD/guard-band", 1 );
+      ALLOC_STATE( fog, fog, FOG_STATE_SIZE, "FOG/fog", 1 );
+      ALLOC_STATE( glt, tcl_lighting, GLT_STATE_SIZE, "GLT/light-global", 1 );
+      ALLOC_STATE( eye, tcl_lighting, EYE_STATE_SIZE, "EYE/eye-vector", 1 );
+      ALLOC_STATE_IDX( tex[0], tex0, TEX_STATE_SIZE, "TEX/tex-0", 0, 0);
+      ALLOC_STATE_IDX( tex[1], tex1, TEX_STATE_SIZE, "TEX/tex-1", 0, 1);
+      ALLOC_STATE_IDX( tex[2], tex2, TEX_STATE_SIZE, "TEX/tex-2", 0, 2);
+      ALLOC_STATE( mat[0], tcl, MAT_STATE_SIZE, "MAT/modelproject", 1 );
+      ALLOC_STATE( mat[1], tcl_eyespace_or_fog, MAT_STATE_SIZE, "MAT/modelview", 1 );
+      ALLOC_STATE( mat[2], tcl_eyespace_or_lighting, MAT_STATE_SIZE, "MAT/it-modelview", 1 );
+      ALLOC_STATE( mat[3], tcl_tex0, MAT_STATE_SIZE, "MAT/texmat0", 1 );
+      ALLOC_STATE( mat[4], tcl_tex1, MAT_STATE_SIZE, "MAT/texmat1", 1 );
+      ALLOC_STATE( mat[5], tcl_tex2, MAT_STATE_SIZE, "MAT/texmat2", 1 );
+      ALLOC_STATE( lit[0], tcl_lit0, LIT_STATE_SIZE, "LIT/light-0", 1 );
+      ALLOC_STATE( lit[1], tcl_lit1, LIT_STATE_SIZE, "LIT/light-1", 1 );
+      ALLOC_STATE( lit[2], tcl_lit2, LIT_STATE_SIZE, "LIT/light-2", 1 );
+      ALLOC_STATE( lit[3], tcl_lit3, LIT_STATE_SIZE, "LIT/light-3", 1 );
+      ALLOC_STATE( lit[4], tcl_lit4, LIT_STATE_SIZE, "LIT/light-4", 1 );
+      ALLOC_STATE( lit[5], tcl_lit5, LIT_STATE_SIZE, "LIT/light-5", 1 );
+      ALLOC_STATE( lit[6], tcl_lit6, LIT_STATE_SIZE, "LIT/light-6", 1 );
+      ALLOC_STATE( lit[7], tcl_lit7, LIT_STATE_SIZE, "LIT/light-7", 1 );
+      ALLOC_STATE( ucp[0], tcl_ucp0, UCP_STATE_SIZE, "UCP/userclip-0", 1 );
+      ALLOC_STATE( ucp[1], tcl_ucp1, UCP_STATE_SIZE, "UCP/userclip-1", 1 );
+      ALLOC_STATE( ucp[2], tcl_ucp2, UCP_STATE_SIZE, "UCP/userclip-2", 1 );
+      ALLOC_STATE( ucp[3], tcl_ucp3, UCP_STATE_SIZE, "UCP/userclip-3", 1 );
+      ALLOC_STATE( ucp[4], tcl_ucp4, UCP_STATE_SIZE, "UCP/userclip-4", 1 );
+      ALLOC_STATE( ucp[5], tcl_ucp5, UCP_STATE_SIZE, "UCP/userclip-5", 1 );
+   }
+
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+       ALLOC_STATE( stp, always, STP_STATE_SIZE, "STP/stp", 0 );
+   }
+   
+   for (i = 0; i < 3; i++) {
+      if (rmesa->radeon.radeonScreen->kernel_mm)
+          rmesa->hw.tex[i].emit = tex_emit_cs;
+      else
+          rmesa->hw.tex[i].emit = tex_emit;
+   }
+   if (rmesa->radeon.radeonScreen->drmSupportsCubeMapsR100)
    {
-      ALLOC_STATE( cube[0], cube0, CUBE_STATE_SIZE, "CUBE/cube-0", 0 );
-      ALLOC_STATE( cube[1], cube1, CUBE_STATE_SIZE, "CUBE/cube-1", 0 );
-      ALLOC_STATE( cube[2], cube2, CUBE_STATE_SIZE, "CUBE/cube-2", 0 );
+      if (rmesa->radeon.radeonScreen->kernel_mm) {
+         ALLOC_STATE_IDX( cube[0], cube0_mm, CUBE_STATE_SIZE, "CUBE/cube-0", 0, 0 );
+         ALLOC_STATE_IDX( cube[1], cube1_mm, CUBE_STATE_SIZE, "CUBE/cube-1", 0, 1 );
+         ALLOC_STATE_IDX( cube[2], cube2_mm, CUBE_STATE_SIZE, "CUBE/cube-2", 0, 2 );
+         for (i = 0; i < 3; i++)
+            rmesa->hw.cube[i].emit = cube_emit_cs;
+      } else {
+         ALLOC_STATE_IDX( cube[0], cube0, CUBE_STATE_SIZE, "CUBE/cube-0", 0, 0 );
+         ALLOC_STATE_IDX( cube[1], cube1, CUBE_STATE_SIZE, "CUBE/cube-1", 0, 1 );
+         ALLOC_STATE_IDX( cube[2], cube2, CUBE_STATE_SIZE, "CUBE/cube-2", 0, 2 );
+         for (i = 0; i < 3; i++)
+            rmesa->hw.cube[i].emit = cube_emit;
+      }
    }
    else
    {
-      ALLOC_STATE( cube[0], never, CUBE_STATE_SIZE, "CUBE/cube-0", 0 );
-      ALLOC_STATE( cube[1], never, CUBE_STATE_SIZE, "CUBE/cube-1", 0 );
-      ALLOC_STATE( cube[2], never, CUBE_STATE_SIZE, "CUBE/cube-2", 0 );
-   }
-   ALLOC_STATE( mat[0], tcl, MAT_STATE_SIZE, "MAT/modelproject", 1 );
-   ALLOC_STATE( mat[1], tcl_eyespace_or_fog, MAT_STATE_SIZE, "MAT/modelview", 1 );
-   ALLOC_STATE( mat[2], tcl_eyespace_or_lighting, MAT_STATE_SIZE, "MAT/it-modelview", 1 );
-   ALLOC_STATE( mat[3], tcl_tex0, MAT_STATE_SIZE, "MAT/texmat0", 1 );
-   ALLOC_STATE( mat[4], tcl_tex1, MAT_STATE_SIZE, "MAT/texmat1", 1 );
-   ALLOC_STATE( mat[5], tcl_tex2, MAT_STATE_SIZE, "MAT/texmat2", 1 );
-   ALLOC_STATE( ucp[0], tcl_ucp0, UCP_STATE_SIZE, "UCP/userclip-0", 1 );
-   ALLOC_STATE( ucp[1], tcl_ucp1, UCP_STATE_SIZE, "UCP/userclip-1", 1 );
-   ALLOC_STATE( ucp[2], tcl_ucp2, UCP_STATE_SIZE, "UCP/userclip-2", 1 );
-   ALLOC_STATE( ucp[3], tcl_ucp3, UCP_STATE_SIZE, "UCP/userclip-3", 1 );
-   ALLOC_STATE( ucp[4], tcl_ucp4, UCP_STATE_SIZE, "UCP/userclip-4", 1 );
-   ALLOC_STATE( ucp[5], tcl_ucp5, UCP_STATE_SIZE, "UCP/userclip-5", 1 );
-   ALLOC_STATE( lit[0], tcl_lit0, LIT_STATE_SIZE, "LIT/light-0", 1 );
-   ALLOC_STATE( lit[1], tcl_lit1, LIT_STATE_SIZE, "LIT/light-1", 1 );
-   ALLOC_STATE( lit[2], tcl_lit2, LIT_STATE_SIZE, "LIT/light-2", 1 );
-   ALLOC_STATE( lit[3], tcl_lit3, LIT_STATE_SIZE, "LIT/light-3", 1 );
-   ALLOC_STATE( lit[4], tcl_lit4, LIT_STATE_SIZE, "LIT/light-4", 1 );
-   ALLOC_STATE( lit[5], tcl_lit5, LIT_STATE_SIZE, "LIT/light-5", 1 );
-   ALLOC_STATE( lit[6], tcl_lit6, LIT_STATE_SIZE, "LIT/light-6", 1 );
-   ALLOC_STATE( lit[7], tcl_lit7, LIT_STATE_SIZE, "LIT/light-7", 1 );
-   ALLOC_STATE( txr[0], txr0, TXR_STATE_SIZE, "TXR/txr-0", 0 );
-   ALLOC_STATE( txr[1], txr1, TXR_STATE_SIZE, "TXR/txr-1", 0 );
-   ALLOC_STATE( txr[2], txr2, TXR_STATE_SIZE, "TXR/txr-2", 0 );
+      ALLOC_STATE_IDX( cube[0], never, CUBE_STATE_SIZE, "CUBE/cube-0", 0, 0 );
+      ALLOC_STATE_IDX( cube[1], never, CUBE_STATE_SIZE, "CUBE/cube-1", 0, 1 );
+      ALLOC_STATE_IDX( cube[2], never, CUBE_STATE_SIZE, "CUBE/cube-2", 0, 2 );
+   }
+   ALLOC_STATE_IDX( txr[0], txr0, TXR_STATE_SIZE, "TXR/txr-0", 0, 0 );
+   ALLOC_STATE_IDX( txr[1], txr1, TXR_STATE_SIZE, "TXR/txr-1", 0, 1 );
+   ALLOC_STATE_IDX( txr[2], txr2, TXR_STATE_SIZE, "TXR/txr-2", 0, 2 );
 
    radeonSetUpAtomList( rmesa );
 
    /* Fill in the packet headers:
     */
-   rmesa->hw.ctx.cmd[CTX_CMD_0] = cmdpkt(RADEON_EMIT_PP_MISC);
-   rmesa->hw.ctx.cmd[CTX_CMD_1] = cmdpkt(RADEON_EMIT_PP_CNTL);
-   rmesa->hw.ctx.cmd[CTX_CMD_2] = cmdpkt(RADEON_EMIT_RB3D_COLORPITCH);
-   rmesa->hw.lin.cmd[LIN_CMD_0] = cmdpkt(RADEON_EMIT_RE_LINE_PATTERN);
-   rmesa->hw.lin.cmd[LIN_CMD_1] = cmdpkt(RADEON_EMIT_SE_LINE_WIDTH);
-   rmesa->hw.msk.cmd[MSK_CMD_0] = cmdpkt(RADEON_EMIT_RB3D_STENCILREFMASK);
-   rmesa->hw.vpt.cmd[VPT_CMD_0] = cmdpkt(RADEON_EMIT_SE_VPORT_XSCALE);
-   rmesa->hw.set.cmd[SET_CMD_0] = cmdpkt(RADEON_EMIT_SE_CNTL);
-   rmesa->hw.set.cmd[SET_CMD_1] = cmdpkt(RADEON_EMIT_SE_CNTL_STATUS);
-   rmesa->hw.msc.cmd[MSC_CMD_0] = cmdpkt(RADEON_EMIT_RE_MISC);
-   rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(RADEON_EMIT_PP_TXFILTER_0);
-   rmesa->hw.tex[0].cmd[TEX_CMD_1] = cmdpkt(RADEON_EMIT_PP_BORDER_COLOR_0);
-   rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(RADEON_EMIT_PP_TXFILTER_1);
-   rmesa->hw.tex[1].cmd[TEX_CMD_1] = cmdpkt(RADEON_EMIT_PP_BORDER_COLOR_1);
-   rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(RADEON_EMIT_PP_TXFILTER_2);
-   rmesa->hw.tex[2].cmd[TEX_CMD_1] = cmdpkt(RADEON_EMIT_PP_BORDER_COLOR_2);
-   rmesa->hw.cube[0].cmd[CUBE_CMD_0] = cmdpkt(RADEON_EMIT_PP_CUBIC_FACES_0);
-   rmesa->hw.cube[0].cmd[CUBE_CMD_1] = cmdpkt(RADEON_EMIT_PP_CUBIC_OFFSETS_T0);
-   rmesa->hw.cube[1].cmd[CUBE_CMD_0] = cmdpkt(RADEON_EMIT_PP_CUBIC_FACES_1);
-   rmesa->hw.cube[1].cmd[CUBE_CMD_1] = cmdpkt(RADEON_EMIT_PP_CUBIC_OFFSETS_T1);
-   rmesa->hw.cube[2].cmd[CUBE_CMD_0] = cmdpkt(RADEON_EMIT_PP_CUBIC_FACES_2);
-   rmesa->hw.cube[2].cmd[CUBE_CMD_1] = cmdpkt(RADEON_EMIT_PP_CUBIC_OFFSETS_T2);
-   rmesa->hw.zbs.cmd[ZBS_CMD_0] = cmdpkt(RADEON_EMIT_SE_ZBIAS_FACTOR);
-   rmesa->hw.tcl.cmd[TCL_CMD_0] = cmdpkt(RADEON_EMIT_SE_TCL_OUTPUT_VTX_FMT);
+   rmesa->hw.ctx.cmd[CTX_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_MISC);
+   rmesa->hw.ctx.cmd[CTX_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_CNTL);
+   rmesa->hw.ctx.cmd[CTX_CMD_2] = cmdpkt(rmesa, RADEON_EMIT_RB3D_COLORPITCH);
+   rmesa->hw.lin.cmd[LIN_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RE_LINE_PATTERN);
+   rmesa->hw.lin.cmd[LIN_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_SE_LINE_WIDTH);
+   rmesa->hw.msk.cmd[MSK_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RB3D_STENCILREFMASK);
+   rmesa->hw.vpt.cmd[VPT_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_VPORT_XSCALE);
+   rmesa->hw.set.cmd[SET_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_CNTL);
+   rmesa->hw.set.cmd[SET_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_SE_CNTL_STATUS);
+   rmesa->hw.msc.cmd[MSC_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RE_MISC);
+   rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TXFILTER_0);
+   rmesa->hw.tex[0].cmd[TEX_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_BORDER_COLOR_0);
+   rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TXFILTER_1);
+   rmesa->hw.tex[1].cmd[TEX_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_BORDER_COLOR_1);
+   rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TXFILTER_2);
+   rmesa->hw.tex[2].cmd[TEX_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_BORDER_COLOR_2);
+   rmesa->hw.cube[0].cmd[CUBE_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_FACES_0);
+   rmesa->hw.cube[0].cmd[CUBE_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_OFFSETS_T0);
+   rmesa->hw.cube[1].cmd[CUBE_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_FACES_1);
+   rmesa->hw.cube[1].cmd[CUBE_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_OFFSETS_T1);
+   rmesa->hw.cube[2].cmd[CUBE_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_FACES_2);
+   rmesa->hw.cube[2].cmd[CUBE_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_OFFSETS_T2);
+   rmesa->hw.zbs.cmd[ZBS_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_ZBIAS_FACTOR);
+   rmesa->hw.tcl.cmd[TCL_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_TCL_OUTPUT_VTX_FMT);
    rmesa->hw.mtl.cmd[MTL_CMD_0] = 
-      cmdpkt(RADEON_EMIT_SE_TCL_MATERIAL_EMMISSIVE_RED);
-   rmesa->hw.txr[0].cmd[TXR_CMD_0] = cmdpkt(RADEON_EMIT_PP_TEX_SIZE_0);
-   rmesa->hw.txr[1].cmd[TXR_CMD_0] = cmdpkt(RADEON_EMIT_PP_TEX_SIZE_1);
-   rmesa->hw.txr[2].cmd[TXR_CMD_0] = cmdpkt(RADEON_EMIT_PP_TEX_SIZE_2);
+      cmdpkt(rmesa, RADEON_EMIT_SE_TCL_MATERIAL_EMMISSIVE_RED);
+   rmesa->hw.txr[0].cmd[TXR_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TEX_SIZE_0);
+   rmesa->hw.txr[1].cmd[TXR_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TEX_SIZE_1);
+   rmesa->hw.txr[2].cmd[TXR_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TEX_SIZE_2);
    rmesa->hw.grd.cmd[GRD_CMD_0] = 
       cmdscl( RADEON_SS_VERT_GUARD_CLIP_ADJ_ADDR, 1, 4 );
    rmesa->hw.fog.cmd[FOG_CMD_0] = 
@@ -331,6 +876,26 @@ void radeonInitState( radeonContextPtr rmesa )
 	 cmdvec( RADEON_VS_UCP_ADDR + i, 1, 4 );
    }
 
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+      rmesa->hw.stp.cmd[STP_CMD_0] = CP_PACKET0(RADEON_RE_STIPPLE_ADDR, 0);
+      rmesa->hw.stp.cmd[STP_DATA_0] = 0;
+      rmesa->hw.stp.cmd[STP_CMD_1] = CP_PACKET0_ONE(RADEON_RE_STIPPLE_DATA, 31);
+
+      rmesa->hw.grd.emit = scl_emit;
+      rmesa->hw.fog.emit = vec_emit;
+      rmesa->hw.glt.emit = vec_emit;
+      rmesa->hw.eye.emit = vec_emit;
+      
+      for (i = 0; i < 6; i++)
+	 rmesa->hw.mat[i].emit = vec_emit;
+
+      for (i = 0; i < 8; i++)
+	 rmesa->hw.lit[i].emit = lit_emit;
+
+      for (i = 0; i < 6; i++)
+	 rmesa->hw.ucp[i].emit = vec_emit;
+   }
+
    rmesa->last_ReallyEnabled = -1;
 
    /* Initial Harware state:
@@ -352,19 +917,7 @@ void radeonInitState( radeonContextPtr rmesa )
 					    RADEON_SRC_BLEND_GL_ONE |
 					    RADEON_DST_BLEND_GL_ZERO );
 
-   rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHOFFSET] =
-      rmesa->radeonScreen->depthOffset + rmesa->radeonScreen->fbLocation;
-
-   rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHPITCH] = 
-      ((rmesa->radeonScreen->depthPitch &
-	RADEON_DEPTHPITCH_MASK) |
-       RADEON_DEPTH_ENDIAN_NO_SWAP);
-       
-   if (rmesa->using_hyperz)
-       rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHPITCH] |= RADEON_DEPTH_HYPERZ;
-
-   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] = (depth_fmt |
-					       RADEON_Z_TEST_LESS |
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] = (RADEON_Z_TEST_LESS |
 					       RADEON_STENCIL_TEST_ALWAYS |
 					       RADEON_STENCIL_FAIL_KEEP |
 					       RADEON_STENCIL_ZPASS_KEEP |
@@ -374,7 +927,7 @@ void radeonInitState( radeonContextPtr rmesa )
    if (rmesa->using_hyperz) {
        rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_COMPRESSION_ENABLE |
 						   RADEON_Z_DECOMPRESSION_ENABLE;
-      if (rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+      if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
 	 /* works for q3, but slight rendering errors with glxgears ? */
 /*	 rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_HIERARCHY_ENABLE;*/
 	 /* need this otherwise get lots of lockups with q3 ??? */
@@ -386,10 +939,9 @@ void radeonInitState( radeonContextPtr rmesa )
 				     RADEON_ANTI_ALIAS_NONE);
 
    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = (RADEON_PLANE_MASK_ENABLE |
-				       color_fmt |
 				       RADEON_ZBLOCK16);
 
-   switch ( driQueryOptioni( &rmesa->optionCache, "dither_mode" ) ) {
+   switch ( driQueryOptioni( &rmesa->radeon.optionCache, "dither_mode" ) ) {
    case DRI_CONF_DITHER_XERRORDIFFRESET:
       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= RADEON_DITHER_INIT;
       break;
@@ -397,30 +949,17 @@ void radeonInitState( radeonContextPtr rmesa )
       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= RADEON_SCALE_DITHER_ENABLE;
       break;
    }
-   if ( driQueryOptioni( &rmesa->optionCache, "round_mode" ) ==
+   if ( driQueryOptioni( &rmesa->radeon.optionCache, "round_mode" ) ==
 	DRI_CONF_ROUND_ROUND )
-      rmesa->state.color.roundEnable = RADEON_ROUND_ENABLE;
+      rmesa->radeon.state.color.roundEnable = RADEON_ROUND_ENABLE;
    else
-      rmesa->state.color.roundEnable = 0;
-   if ( driQueryOptioni (&rmesa->optionCache, "color_reduction" ) ==
+      rmesa->radeon.state.color.roundEnable = 0;
+   if ( driQueryOptioni (&rmesa->radeon.optionCache, "color_reduction" ) ==
 	DRI_CONF_COLOR_REDUCTION_DITHER )
       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= RADEON_DITHER_ENABLE;
    else
-      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= rmesa->state.color.roundEnable;
-
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] = ((drawOffset +
-					       rmesa->radeonScreen->fbLocation)
-					      & RADEON_COLOROFFSET_MASK);
+      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= rmesa->radeon.state.color.roundEnable;
 
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = ((drawPitch &
-					      RADEON_COLORPITCH_MASK) |
-					     RADEON_COLOR_ENDIAN_NO_SWAP);
-
-
-   /* (fixed size) sarea is initialized to zero afaics so can omit version check. Phew! */
-   if (rmesa->sarea->tiling_enabled) {
-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= RADEON_COLOR_TILE_ENABLE;
-   }
 
    rmesa->hw.set.cmd[SET_SE_CNTL] = (RADEON_FFACE_CULL_CCW |
 				     RADEON_BFACE_SOLID |
@@ -444,7 +983,7 @@ void radeonInitState( radeonContextPtr rmesa )
   					    RADEON_VC_NO_SWAP;
 #endif
 
-   if (!(rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
+   if (!(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
      rmesa->hw.set.cmd[SET_SE_CNTL_STATUS] |= RADEON_TCL_BYPASS;
    }
 
@@ -491,8 +1030,8 @@ void radeonInitState( radeonContextPtr rmesa )
 	   (2 << RADEON_TXFORMAT_HEIGHT_SHIFT));
 
       /* Initialize the texture offset to the start of the card texture heap */
-      rmesa->hw.tex[i].cmd[TEX_PP_TXOFFSET] =
-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+      //      rmesa->hw.tex[i].cmd[TEX_PP_TXOFFSET] =
+      //	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
 
       rmesa->hw.tex[i].cmd[TEX_PP_BORDER_COLOR] = 0;
       rmesa->hw.tex[i].cmd[TEX_PP_TXCBLEND] =  
@@ -513,15 +1052,15 @@ void radeonInitState( radeonContextPtr rmesa )
 
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_FACES] = 0;
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_0] =
-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_1] =
-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_2] =
-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_3] =
-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_4] =
-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
    }
 
    /* Can only add ST1 at the time of doing some multitex but can keep
@@ -612,6 +1151,14 @@ void radeonInitState( radeonContextPtr rmesa )
    rmesa->hw.eye.cmd[EYE_Y] = 0;
    rmesa->hw.eye.cmd[EYE_Z] = IEEE_ONE;
    rmesa->hw.eye.cmd[EYE_RESCALE_FACTOR] = IEEE_ONE;
-   
-   rmesa->hw.all_dirty = GL_TRUE;
+
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+      radeon_init_query_stateobj(&rmesa->radeon, R100_QUERYOBJ_CMDSIZE);
+      rmesa->radeon.query.queryobj.cmd[R100_QUERYOBJ_CMD_0] = CP_PACKET0(RADEON_RB3D_ZPASS_DATA, 0);
+      rmesa->radeon.query.queryobj.cmd[R100_QUERYOBJ_DATA_0] = 0;
+   }
+     
+   rmesa->radeon.hw.all_dirty = GL_TRUE;
+
+   rcommonInitCmdBuf(&rmesa->radeon);
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.c b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
index ebea1fecdc..e61f59eaea 100644
--- a/src/mesa/drivers/dri/radeon/radeon_swtcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
@@ -38,6 +38,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/enums.h"
 #include "main/imports.h"
 #include "main/macros.h"
+#include "main/simple_list.h"
 
 #include "swrast_setup/swrast_setup.h"
 #include "math/m_translate.h"
@@ -50,10 +51,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_state.h"
 #include "radeon_swtcl.h"
 #include "radeon_tcl.h"
+#include "radeon_debug.h"
 
 
-static void flush_last_swtcl_prim( radeonContextPtr rmesa  );
-
 /* R100: xyzw, c0, c1/fog, stq[0..2]  = 4+1+1+3*3 = 15  right? */
 /* R200: xyzw, c0, c1/fog, strq[0..5] = 4+1+1+4*6 = 30 */
 #define RADEON_MAX_TNL_VERTEX_SIZE (15 * sizeof(GLfloat))	/* for mesa _tnl stage */
@@ -64,18 +64,18 @@ static void flush_last_swtcl_prim( radeonContextPtr rmesa  );
 
 #define EMIT_ATTR( ATTR, STYLE, F0 )					\
 do {									\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = (ATTR);	\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = (STYLE);	\
-   rmesa->swtcl.vertex_attr_count++;					\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = (ATTR);	\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = (STYLE);	\
+   rmesa->radeon.swtcl.vertex_attr_count++;					\
    fmt_0 |= F0;								\
 } while (0)
 
 #define EMIT_PAD( N )							\
 do {									\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = 0;		\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = EMIT_PAD;	\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].offset = (N);		\
-   rmesa->swtcl.vertex_attr_count++;					\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = 0;		\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = EMIT_PAD;	\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].offset = (N);		\
+   rmesa->radeon.swtcl.vertex_attr_count++;					\
 } while (0)
 
 static GLuint radeon_cp_vc_frmts[3][2] =
@@ -87,7 +87,7 @@ static GLuint radeon_cp_vc_frmts[3][2] =
 
 static void radeonSetVertexFormat( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   r100ContextPtr rmesa = R100_CONTEXT( ctx );
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    struct vertex_buffer *VB = &tnl->vb;
    DECLARE_RENDERINPUTS(index_bitset);
@@ -106,7 +106,7 @@ static void radeonSetVertexFormat( GLcontext *ctx )
    }
 
    assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
-   rmesa->swtcl.vertex_attr_count = 0;
+   rmesa->radeon.swtcl.vertex_attr_count = 0;
 
    /* EMIT_ATTR's must be in order as they tell t_vertex.c how to
     * build up a hardware vertex.
@@ -204,33 +204,52 @@ static void radeonSetVertexFormat( GLcontext *ctx )
       }
    }
 
-   if (!RENDERINPUTS_EQUAL( rmesa->tnl_index_bitset, index_bitset ) ||
+   if (!RENDERINPUTS_EQUAL( rmesa->radeon.tnl_index_bitset, index_bitset ) ||
 	fmt_0 != rmesa->swtcl.vertex_format) {
       RADEON_NEWPRIM(rmesa);
       rmesa->swtcl.vertex_format = fmt_0;
-      rmesa->swtcl.vertex_size =
+      rmesa->radeon.swtcl.vertex_size =
 	  _tnl_install_attrs( ctx,
-			      rmesa->swtcl.vertex_attrs, 
-			      rmesa->swtcl.vertex_attr_count,
+			      rmesa->radeon.swtcl.vertex_attrs, 
+			      rmesa->radeon.swtcl.vertex_attr_count,
 			      NULL, 0 );
-      rmesa->swtcl.vertex_size /= 4;
-      RENDERINPUTS_COPY( rmesa->tnl_index_bitset, index_bitset );
-      if (RADEON_DEBUG & DEBUG_VERTS)
-	 fprintf( stderr, "%s: vertex_size= %d floats\n",
-		  __FUNCTION__, rmesa->swtcl.vertex_size);
+      rmesa->radeon.swtcl.vertex_size /= 4;
+      RENDERINPUTS_COPY( rmesa->radeon.tnl_index_bitset, index_bitset );
+      radeon_print(RADEON_SWRENDER, RADEON_VERBOSE,
+	  "%s: vertex_size= %d floats\n",  __FUNCTION__, rmesa->radeon.swtcl.vertex_size);
    }
 }
 
+static void radeon_predict_emit_size( r100ContextPtr rmesa )
+{
+    /* Store in swtcl.emit_prediction the command-stream position (cs->cdw) that the next swtcl flush is expected to stay below. */
+    if (!rmesa->radeon.swtcl.emit_prediction) { /* 0 means no prediction is pending; r100_swtcl_flush() resets it to 0 */
+        const int state_size = radeonCountStateEmitSize( &rmesa->radeon );
+        const int scissor_size = 8; /* fixed allowances; presumably in the same units as cs->cdw -- TODO confirm */
+        const int prims_size = 8;
+        const int vertex_size = 7;
+
+        if (rcommonEnsureCmdBufSpace(&rmesa->radeon,
+                    state_size +
+                    (scissor_size + prims_size + vertex_size),
+                    __FUNCTION__)) /* nonzero result: recount the state size (presumably the buffer was flushed -- confirm) */
+            rmesa->radeon.swtcl.emit_prediction = radeonCountStateEmitSize( &rmesa->radeon );
+        else
+            rmesa->radeon.swtcl.emit_prediction = state_size;
+        rmesa->radeon.swtcl.emit_prediction += scissor_size + prims_size + vertex_size
+            + rmesa->radeon.cmdbuf.cs->cdw; /* adding the current cdw turns the size into an absolute end position */
+    }
+}
 
 static void radeonRenderStart( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+    r100ContextPtr rmesa = R100_CONTEXT( ctx );
 
-   radeonSetVertexFormat( ctx );
-   
-   if (rmesa->dma.flush != 0 && 
-       rmesa->dma.flush != flush_last_swtcl_prim)
-      rmesa->dma.flush( rmesa );
+    radeonSetVertexFormat( ctx );
+
+    if (rmesa->radeon.dma.flush != 0 &&
+            rmesa->radeon.dma.flush != rcommon_flush_last_swtcl_prim)
+        rmesa->radeon.dma.flush( ctx );
 }
 
 
@@ -241,7 +260,7 @@ static void radeonRenderStart( GLcontext *ctx )
  */
 void radeonChooseVertexState( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   r100ContextPtr rmesa = R100_CONTEXT( ctx );
    TNLcontext *tnl = TNL_CONTEXT(ctx);
 
    GLuint se_coord_fmt = rmesa->hw.set.cmd[SET_SE_COORDFMT];
@@ -254,7 +273,7 @@ void radeonChooseVertexState( GLcontext *ctx )
     * rasterization fallback.  As this function will be called again when we
     * leave a rasterization fallback, we can just skip it for now.
     */
-   if (rmesa->Fallback != 0)
+   if (rmesa->radeon.Fallback != 0)
       return;
 
    /* HW perspective divide is a win, but tiny vertex formats are a
@@ -281,80 +300,33 @@ void radeonChooseVertexState( GLcontext *ctx )
    }
 }
 
-
-/* Flush vertices in the current dma region.
- */
-static void flush_last_swtcl_prim( radeonContextPtr rmesa  )
+void r100_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
 {
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
-   rmesa->dma.flush = NULL;
 
-   if (rmesa->dma.current.buf) {
-      struct radeon_dma_region *current = &rmesa->dma.current;
-      GLuint current_offset = (rmesa->radeonScreen->gart_buffer_offset +
-			       current->buf->buf->idx * RADEON_BUFFER_SIZE + 
-			       current->start);
 
-      assert (!(rmesa->swtcl.hw_primitive & RADEON_CP_VC_CNTL_PRIM_WALK_IND));
+   radeonEmitState(&rmesa->radeon);
+   radeonEmitVertexAOS( rmesa,
+			rmesa->radeon.swtcl.vertex_size,
+			first_elem(&rmesa->radeon.dma.reserved)->bo,
+			current_offset);
 
-      assert (current->start + 
-	      rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-	      current->ptr);
+		      
+   radeonEmitVbufPrim( rmesa,
+		       rmesa->swtcl.vertex_format,
+		       rmesa->radeon.swtcl.hw_primitive,
+		       rmesa->radeon.swtcl.numverts);
+   if ( rmesa->radeon.swtcl.emit_prediction < rmesa->radeon.cmdbuf.cs->cdw )
+     WARN_ONCE("Rendering was %d commands larger than predicted size."
+	 " We might overflow  command buffer.\n",
+	 rmesa->radeon.cmdbuf.cs->cdw - rmesa->radeon.swtcl.emit_prediction );
 
-      if (rmesa->dma.current.start != rmesa->dma.current.ptr) {
-	 radeonEnsureCmdBufSpace( rmesa, VERT_AOS_BUFSZ +
-			          rmesa->hw.max_state_size + VBUF_BUFSZ );
 
-	 radeonEmitVertexAOS( rmesa,
-			      rmesa->swtcl.vertex_size,
-			      current_offset);
+   rmesa->radeon.swtcl.emit_prediction = 0;
 
-	 radeonEmitVbufPrim( rmesa,
-			     rmesa->swtcl.vertex_format,
-			     rmesa->swtcl.hw_primitive,
-			     rmesa->swtcl.numverts);
-      }
-
-      rmesa->swtcl.numverts = 0;
-      current->start = current->ptr;
-   }
 }
 
-
-/* Alloc space in the current dma region.
- */
-static INLINE void *
-radeonAllocDmaLowVerts( radeonContextPtr rmesa, int nverts, int vsize )
-{
-   GLuint bytes = vsize * nverts;
-
-   if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
-      radeonRefillCurrentDmaRegion( rmesa );
-
-   if (!rmesa->dma.flush) {
-      rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-      rmesa->dma.flush = flush_last_swtcl_prim;
-   }
-
-   assert( vsize == rmesa->swtcl.vertex_size * 4 );
-   assert( rmesa->dma.flush == flush_last_swtcl_prim );
-   assert (rmesa->dma.current.start + 
-	   rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-	   rmesa->dma.current.ptr);
-
-
-   {
-      GLubyte *head = (GLubyte *)(rmesa->dma.current.address + rmesa->dma.current.ptr);
-      rmesa->dma.current.ptr += bytes;
-      rmesa->swtcl.numverts += nverts;
-      return head;
-   }
-
-}
-
-
 /*
  * Render unclipped vertex buffers by emitting vertices directly to
  * dma buffers.  Use strip/fan hardware primitives where possible.
@@ -387,22 +359,31 @@ static const GLuint hw_prim[GL_POLYGON+1] = {
 };
 
 static INLINE void
-radeonDmaPrimitive( radeonContextPtr rmesa, GLenum prim )
+radeonDmaPrimitive( r100ContextPtr rmesa, GLenum prim )
 {
    RADEON_NEWPRIM( rmesa );
-   rmesa->swtcl.hw_primitive = hw_prim[prim];
-   assert(rmesa->dma.current.ptr == rmesa->dma.current.start);
+   rmesa->radeon.swtcl.hw_primitive = hw_prim[prim];
+   //   assert(rmesa->radeon.dma.current.ptr == rmesa->radeon.dma.current.start);
+}
+
+static void* radeon_alloc_verts( r100ContextPtr rmesa , GLuint nr, GLuint size )
+{ /* Allocate DMA space for nr vertices; the ALLOC_VERTS callers pass size as vertex_size * 4. Never returns NULL. */
+   void *rv; /* pointer handed back by rcommonAllocDmaLowVerts() */
+   do { /* refresh the emit-size prediction before every allocation attempt */
+     radeon_predict_emit_size( rmesa );
+     rv = rcommonAllocDmaLowVerts( &rmesa->radeon, nr, size );
+   } while (!rv); /* NOTE(review): no retry bound -- presumably a NULL result means the next attempt succeeds; confirm */
+   return rv;
+}
 
-#define LOCAL_VARS radeonContextPtr rmesa = RADEON_CONTEXT(ctx)
+#define LOCAL_VARS r100ContextPtr rmesa = R100_CONTEXT(ctx)
 #define INIT( prim ) radeonDmaPrimitive( rmesa, prim )
 #define FLUSH()  RADEON_NEWPRIM( rmesa )
-#define GET_CURRENT_VB_MAX_VERTS() \
-  (((int)rmesa->dma.current.end - (int)rmesa->dma.current.ptr) / (rmesa->swtcl.vertex_size*4))
+#define GET_CURRENT_VB_MAX_VERTS()					10\
+//  (((int)rmesa->radeon.dma.current.end - (int)rmesa->radeon.dma.current.ptr) / (rmesa->radeon.swtcl.vertex_size*4))
 #define GET_SUBSEQUENT_VB_MAX_VERTS() \
-  ((RADEON_BUFFER_SIZE) / (rmesa->swtcl.vertex_size*4))
-#define ALLOC_VERTS( nr ) \
-  radeonAllocDmaLowVerts( rmesa, nr, rmesa->swtcl.vertex_size * 4 )
+  ((RADEON_BUFFER_SIZE) / (rmesa->radeon.swtcl.vertex_size*4))
+#define ALLOC_VERTS( nr ) radeon_alloc_verts( rmesa, nr, rmesa->radeon.swtcl.vertex_size * 4 )
 #define EMIT_VERTS( ctx, j, nr, buf ) \
   _tnl_emit_vertices_to_buffer(ctx, j, (j)+(nr), buf)
 
@@ -418,16 +399,13 @@ radeonDmaPrimitive( radeonContextPtr rmesa, GLenum prim )
 static GLboolean radeon_run_render( GLcontext *ctx,
 				    struct tnl_pipeline_stage *stage )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    struct vertex_buffer *VB = &tnl->vb;
    tnl_render_func *tab = TAG(render_tab_verts);
    GLuint i;
 
-   if (rmesa->swtcl.indexed_verts.buf) 
-      RELEASE_ELT_VERTS();
-   	
-   if (rmesa->swtcl.RenderIndex != 0 ||   
+   if (rmesa->radeon.swtcl.RenderIndex != 0 ||   
        !radeon_dma_validate_render( ctx, VB ))
       return GL_TRUE;		
 
@@ -442,8 +420,8 @@ static GLboolean radeon_run_render( GLcontext *ctx,
       if (!length)
 	 continue;
 
-      if (RADEON_DEBUG & DEBUG_PRIMS)
-	 fprintf(stderr, "radeon_render.c: prim %s %d..%d\n", 
+      radeon_print(RADEON_SWRENDER, RADEON_NORMAL,
+	  "radeon_render.c: prim %s %d..%d\n",
 		 _mesa_lookup_enum_by_nr(prim & PRIM_MODE_MASK), 
 		 start, start+length);
 
@@ -496,13 +474,13 @@ static void radeonResetLineStipple( GLcontext *ctx );
 
 #undef LOCAL_VARS
 #undef ALLOC_VERTS
-#define CTX_ARG radeonContextPtr rmesa
-#define GET_VERTEX_DWORDS() rmesa->swtcl.vertex_size
-#define ALLOC_VERTS( n, size ) radeonAllocDmaLowVerts( rmesa, n, (size) * 4 )
+#define CTX_ARG r100ContextPtr rmesa
+#define GET_VERTEX_DWORDS() rmesa->radeon.swtcl.vertex_size
+#define ALLOC_VERTS( n, size ) radeon_alloc_verts( rmesa, n, (size) * 4 )
 #undef LOCAL_VARS
 #define LOCAL_VARS						\
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);		\
-   const char *radeonverts = (char *)rmesa->swtcl.verts;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);		\
+   const char *radeonverts = (char *)rmesa->radeon.swtcl.verts;
 #define VERT(x) (radeonVertex *)(radeonverts + ((x) * (vertsize) * sizeof(int)))
 #define VERTEX radeonVertex 
 #undef TAG
@@ -560,7 +538,7 @@ static struct {
 #define VERT_Y(_v) _v->v.y
 #define VERT_Z(_v) _v->v.z
 #define AREA_IS_CCW( a ) (a < 0)
-#define GET_VERTEX(e) (rmesa->swtcl.verts + ((e) * rmesa->swtcl.vertex_size * sizeof(int)))
+#define GET_VERTEX(e) (rmesa->radeon.swtcl.verts + ((e) * rmesa->radeon.swtcl.vertex_size * sizeof(int)))
 
 #define VERT_SET_RGBA( v, c )  					\
 do {								\
@@ -606,8 +584,8 @@ do {							\
 #undef INIT
 
 #define LOCAL_VARS(n)							\
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);			\
-   GLuint color[n], spec[n];						\
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);			\
+   GLuint color[n] = {0}, spec[n] = {0};						\
    GLuint coloroffset = rmesa->swtcl.coloroffset;	\
    GLuint specoffset = rmesa->swtcl.specoffset;			\
    (void) color; (void) spec; (void) coloroffset; (void) specoffset;
@@ -617,7 +595,7 @@ do {							\
  ***********************************************************************/
 
 #define RASTERIZE(x) radeonRasterPrimitive( ctx, reduced_hw_prim[x] )
-#define RENDER_PRIMITIVE rmesa->swtcl.render_primitive
+#define RENDER_PRIMITIVE rmesa->radeon.swtcl.render_primitive
 #undef TAG
 #define TAG(x) x
 #include "tnl_dd/t_dd_unfilled.h"
@@ -673,9 +651,9 @@ static void init_rast_tab( void )
 } while (0)
 #undef LOCAL_VARS
 #define LOCAL_VARS						\
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);		\
-   const GLuint vertsize = rmesa->swtcl.vertex_size;		\
-   const char *radeonverts = (char *)rmesa->swtcl.verts;		\
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);		\
+   const GLuint vertsize = rmesa->radeon.swtcl.vertex_size;		\
+   const char *radeonverts = (char *)rmesa->radeon.swtcl.verts;		\
    const GLuint * const elt = TNL_CONTEXT(ctx)->vb.Elts;	\
    const GLboolean stipple = ctx->Line.StippleFlag;		\
    (void) elt; (void) stipple;
@@ -700,17 +678,17 @@ static void init_rast_tab( void )
 void radeonChooseRenderState( GLcontext *ctx )
 {
    TNLcontext *tnl = TNL_CONTEXT(ctx);
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint index = 0;
    GLuint flags = ctx->_TriangleCaps;
 
-   if (!rmesa->TclFallback || rmesa->Fallback) 
+   if (!rmesa->radeon.TclFallback || rmesa->radeon.Fallback) 
       return;
 
    if (flags & DD_TRI_LIGHT_TWOSIDE) index |= RADEON_TWOSIDE_BIT;
    if (flags & DD_TRI_UNFILLED)      index |= RADEON_UNFILLED_BIT;
 
-   if (index != rmesa->swtcl.RenderIndex) {
+   if (index != rmesa->radeon.swtcl.RenderIndex) {
       tnl->Driver.Render.Points = rast_tab[index].points;
       tnl->Driver.Render.Line = rast_tab[index].line;
       tnl->Driver.Render.ClippedLine = rast_tab[index].line;
@@ -727,7 +705,7 @@ void radeonChooseRenderState( GLcontext *ctx )
 	 tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
       }
 
-      rmesa->swtcl.RenderIndex = index;
+      rmesa->radeon.swtcl.RenderIndex = index;
    }
 }
 
@@ -739,18 +717,18 @@ void radeonChooseRenderState( GLcontext *ctx )
 
 static void radeonRasterPrimitive( GLcontext *ctx, GLuint hwprim )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
-   if (rmesa->swtcl.hw_primitive != hwprim) {
+   if (rmesa->radeon.swtcl.hw_primitive != hwprim) {
       RADEON_NEWPRIM( rmesa );
-      rmesa->swtcl.hw_primitive = hwprim;
+      rmesa->radeon.swtcl.hw_primitive = hwprim;
    }
 }
 
 static void radeonRenderPrimitive( GLcontext *ctx, GLenum prim )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   rmesa->swtcl.render_primitive = prim;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   rmesa->radeon.swtcl.render_primitive = prim;
    if (prim < GL_TRIANGLES || !(ctx->_TriangleCaps & DD_TRI_UNFILLED)) 
       radeonRasterPrimitive( ctx, reduced_hw_prim[prim] );
 }
@@ -761,7 +739,7 @@ static void radeonRenderFinish( GLcontext *ctx )
 
 static void radeonResetLineStipple( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    RADEON_STATECHANGE( rmesa, lin );
 }
 
@@ -795,25 +773,25 @@ static const char *getFallbackString(GLuint bit)
 
 void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    TNLcontext *tnl = TNL_CONTEXT(ctx);
-   GLuint oldfallback = rmesa->Fallback;
+   GLuint oldfallback = rmesa->radeon.Fallback;
 
    if (mode) {
-      rmesa->Fallback |= bit;
+      rmesa->radeon.Fallback |= bit;
       if (oldfallback == 0) {
-	 RADEON_FIREVERTICES( rmesa );
+	 radeon_firevertices(&rmesa->radeon);
 	 TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_RASTER, GL_TRUE );
 	 _swsetup_Wakeup( ctx );
-	 rmesa->swtcl.RenderIndex = ~0;
-         if (RADEON_DEBUG & DEBUG_FALLBACKS) {
+	 rmesa->radeon.swtcl.RenderIndex = ~0;
+         if (RADEON_DEBUG & RADEON_FALLBACKS) {
             fprintf(stderr, "Radeon begin rasterization fallback: 0x%x %s\n",
                     bit, getFallbackString(bit));
          }
       }
    }
    else {
-      rmesa->Fallback &= ~bit;
+      rmesa->radeon.Fallback &= ~bit;
       if (oldfallback == bit) {
 	 _swrast_flush( ctx );
 	 tnl->Driver.Render.Start = radeonRenderStart;
@@ -826,18 +804,18 @@ void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 
 	 tnl->Driver.Render.ResetLineStipple = radeonResetLineStipple;
 	 TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_RASTER, GL_FALSE );
-	 if (rmesa->TclFallback) {
-	    /* These are already done if rmesa->TclFallback goes to
+	 if (rmesa->radeon.TclFallback) {
+	    /* These are already done if rmesa->radeon.TclFallback goes to
 	     * zero above. But not if it doesn't (RADEON_NO_TCL for
 	     * example?)
 	     */
 	    _tnl_invalidate_vertex_state( ctx, ~0 );
 	    _tnl_invalidate_vertices( ctx, ~0 );
-	    RENDERINPUTS_ZERO( rmesa->tnl_index_bitset );
+	    RENDERINPUTS_ZERO( rmesa->radeon.tnl_index_bitset );
 	    radeonChooseVertexState( ctx );
 	    radeonChooseRenderState( ctx );
 	 }
-         if (RADEON_DEBUG & DEBUG_FALLBACKS) {
+         if (RADEON_DEBUG & RADEON_FALLBACKS) {
             fprintf(stderr, "Radeon end rasterization fallback: 0x%x %s\n",
                     bit, getFallbackString(bit));
          }
@@ -853,13 +831,14 @@ void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 void radeonInitSwtcl( GLcontext *ctx )
 {
    TNLcontext *tnl = TNL_CONTEXT(ctx);
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    static int firsttime = 1;
 
    if (firsttime) {
       init_rast_tab();
       firsttime = 0;
    }
+   rmesa->radeon.swtcl.emit_prediction = 0;
 
    tnl->Driver.Render.Start = radeonRenderStart;
    tnl->Driver.Render.Finish = radeonRenderFinish;
@@ -872,18 +851,9 @@ void radeonInitSwtcl( GLcontext *ctx )
    _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12, 
 		       RADEON_MAX_TNL_VERTEX_SIZE);
    
-   rmesa->swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
-   rmesa->swtcl.RenderIndex = ~0;
-   rmesa->swtcl.render_primitive = GL_TRIANGLES;
-   rmesa->swtcl.hw_primitive = 0;
+   rmesa->radeon.swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
+   rmesa->radeon.swtcl.RenderIndex = ~0;
+   rmesa->radeon.swtcl.render_primitive = GL_TRIANGLES;
+   rmesa->radeon.swtcl.hw_primitive = 0;
 }
 
-
-void radeonDestroySwtcl( GLcontext *ctx )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   if (rmesa->swtcl.indexed_verts.buf) 
-      radeonReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, 
-			      __FUNCTION__ );
-}
diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.h b/src/mesa/drivers/dri/radeon/radeon_swtcl.h
index e485052ad7..da89158eeb 100644
--- a/src/mesa/drivers/dri/radeon/radeon_swtcl.h
+++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.h
@@ -40,7 +40,6 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_context.h"
 
 extern void radeonInitSwtcl( GLcontext *ctx );
-extern void radeonDestroySwtcl( GLcontext *ctx );
 
 extern void radeonChooseRenderState( GLcontext *ctx );
 extern void radeonChooseVertexState( GLcontext *ctx );
@@ -63,5 +62,5 @@ extern void radeon_translate_vertex( GLcontext *ctx,
 
 extern void radeon_print_vertex( GLcontext *ctx, const radeonVertex *v );
 
-
+extern void r100_swtcl_flush(GLcontext *ctx, uint32_t current_offset);
 #endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_tcl.c b/src/mesa/drivers/dri/radeon/radeon_tcl.c
index 779e9ae5df..b334ea05e5 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tcl.c
@@ -42,6 +42,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/tnl.h"
 #include "tnl/t_pipeline.h"
 
+#include "radeon_common.h"
 #include "radeon_context.h"
 #include "radeon_state.h"
 #include "radeon_ioctl.h"
@@ -49,6 +50,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_tcl.h"
 #include "radeon_swtcl.h"
 #include "radeon_maos.h"
+#include "radeon_common_context.h"
 
 
 
@@ -104,7 +106,7 @@ static GLboolean discrete_prim[0x10] = {
 };
    
 
-#define LOCAL_VARS radeonContextPtr rmesa = RADEON_CONTEXT(ctx)
+#define LOCAL_VARS r100ContextPtr rmesa = R100_CONTEXT(ctx)
 #define ELT_TYPE  GLushort
 
 #define ELT_INIT(prim, hw_prim) \
@@ -125,7 +127,7 @@ static GLboolean discrete_prim[0x10] = {
 
 #define RESET_STIPPLE() do {			\
    RADEON_STATECHANGE( rmesa, lin );		\
-   radeonEmitState( rmesa );			\
+   radeonEmitState(&rmesa->radeon);			\
 } while (0)
 
 #define AUTO_STIPPLE( mode )  do {		\
@@ -136,31 +138,26 @@ static GLboolean discrete_prim[0x10] = {
    else						\
       rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] &=	\
 	 ~RADEON_LINE_PATTERN_AUTO_RESET;	\
-   radeonEmitState( rmesa );			\
+   radeonEmitState(&rmesa->radeon);		\
 } while (0)
 
 
 
 #define ALLOC_ELTS(nr)	radeonAllocElts( rmesa, nr )
 
-static GLushort *radeonAllocElts( radeonContextPtr rmesa, GLuint nr ) 
+static GLushort *radeonAllocElts( r100ContextPtr rmesa, GLuint nr ) 
 {
-   if (rmesa->dma.flush)
-      rmesa->dma.flush( rmesa );
+      if (rmesa->radeon.dma.flush)
+	 rmesa->radeon.dma.flush( rmesa->radeon.glCtx );
 
-   radeonEnsureCmdBufSpace(rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
-			   rmesa->hw.max_state_size + ELTS_BUFSZ(nr));
+      radeonEmitAOS( rmesa,
+		     rmesa->radeon.tcl.aos_count, 0 );
 
-   radeonEmitAOS( rmesa,
-		rmesa->tcl.aos_components,
-		rmesa->tcl.nr_aos_components, 0 );
-
-   return radeonAllocEltsOpenEnded( rmesa,
-				    rmesa->tcl.vertex_format, 
-				    rmesa->tcl.hw_primitive, nr );
+      return radeonAllocEltsOpenEnded( rmesa, rmesa->tcl.vertex_format,
+				       rmesa->tcl.hw_primitive, nr );
 }
 
-#define CLOSE_ELTS()  RADEON_NEWPRIM( rmesa )
+#define CLOSE_ELTS() if (0)  RADEON_NEWPRIM( rmesa )
 
 
 
@@ -174,15 +171,11 @@ static void radeonEmitPrim( GLcontext *ctx,
 		       GLuint start, 
 		       GLuint count)	
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   r100ContextPtr rmesa = R100_CONTEXT( ctx );
    radeonTclPrimitive( ctx, prim, hwprim );
    
-   radeonEnsureCmdBufSpace( rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
-			    rmesa->hw.max_state_size + VBUF_BUFSZ );
-
    radeonEmitAOS( rmesa,
-		  rmesa->tcl.aos_components,
-		  rmesa->tcl.nr_aos_components,
+		  rmesa->radeon.tcl.aos_count,
 		  start );
    
    /* Why couldn't this packet have taken an offset param?
@@ -197,6 +190,8 @@ static void radeonEmitPrim( GLcontext *ctx,
    radeonEmitPrim( ctx, prim, hwprim, start, count );           \
    (void) rmesa; } while (0)
 
+#define MAX_CONVERSION_SIZE 40
+
 /* Try & join small primitives
  */
 #if 0
@@ -254,7 +249,7 @@ void radeonTclPrimitive( GLcontext *ctx,
 			 GLenum prim,
 			 int hw_prim )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint se_cntl;
    GLuint newprim = hw_prim | RADEON_CP_VC_CNTL_TCL_ENABLE;
 
@@ -361,6 +356,73 @@ radeonComputeFogBlendFactor( GLcontext *ctx, GLfloat fogcoord )
    }
 }
 
+/**
+ * Predict the total emit size of the next rendering operation (inputs is a VERT_BIT_* mask) so that no flush happens in the middle of rendering.
+ * The prediction should be as tight as possible while never falling below the worst case. Returns the predicted primitive size plus state size.
+ */
+static GLuint radeonEnsureEmitSize( GLcontext * ctx , GLuint inputs )
+{
+  r100ContextPtr rmesa = R100_CONTEXT(ctx);
+  TNLcontext *tnl = TNL_CONTEXT(ctx);
+  struct vertex_buffer *VB = &tnl->vb;
+  GLuint space_required;
+  GLuint state_size;
+  GLuint nr_aos = 1; /* radeonEmitArrays always emits one */
+  int i;
+  /* input flags that each allocate an aos object */
+  const GLuint flags_to_check[] = {
+    VERT_BIT_NORMAL,
+    VERT_BIT_COLOR0,
+    VERT_BIT_COLOR1,
+    VERT_BIT_FOG
+  };
+  /* predict the number of aos objects to emit */
+  for (i=0; i < sizeof(flags_to_check)/sizeof(flags_to_check[0]); ++i)
+  {
+    if (inputs & flags_to_check[i])
+      ++nr_aos;
+  }
+  for (i = 0; i < ctx->Const.MaxTextureUnits; ++i) /* one more aos per enabled texture-coordinate input */
+  {
+    if (inputs & VERT_BIT_TEX(i))
+      ++nr_aos;
+  }
+
+  {
+    /* predict the state size */
+    space_required = 0;
+    state_size = radeonCountStateEmitSize( &rmesa->radeon );
+    /* tcl may be changed in radeonEmitArrays, so account for it if it is not already dirty */
+    if (!rmesa->hw.tcl.dirty)
+      state_size += rmesa->hw.tcl.check( rmesa->radeon.glCtx, &rmesa->hw.tcl );
+    /* predict the size needed for each non-empty primitive */
+    for (i = 0; i < VB->PrimitiveCount; ++i)
+    {
+      if (!VB->Primitive[i].count)
+	continue;
+      /* If primitive.count is less than MAX_CONVERSION_SIZE the
+	 rendering code may decide to convert to elts.
+	 In that case we have to make a pessimistic prediction
+	 and use the larger of the 2 paths. */
+      const GLuint elts = ELTS_BUFSZ(nr_aos);
+      const GLuint index = INDEX_BUFSZ;
+      const GLuint vbuf = VBUF_BUFSZ;
+      if ( (!VB->Elts && VB->Primitive[i].count >= MAX_CONVERSION_SIZE)
+	  || vbuf > index + elts)
+	space_required += vbuf;
+      else
+	space_required += index + elts;
+      space_required += AOS_BUFSZ(nr_aos);
+    }
+    space_required += SCISSOR_BUFSZ;
+  }
+  /* Flush the buffer in case we need more than is left. NOTE(review): only space_required is requested here, state_size is not added -- confirm the callee accounts for state itself. */
+  if (rcommonEnsureCmdBufSpace(&rmesa->radeon, space_required, __FUNCTION__))
+    return space_required + radeonCountStateEmitSize( &rmesa->radeon ); /* nonzero result: state size is recounted (presumably after a flush -- confirm) */
+  else
+    return space_required + state_size;
+}
+
 /**********************************************************************/
 /*                          Render pipeline stage                     */
 /**********************************************************************/
@@ -371,7 +433,7 @@ radeonComputeFogBlendFactor( GLcontext *ctx, GLfloat fogcoord )
 static GLboolean radeon_run_tcl_render( GLcontext *ctx,
 					struct tnl_pipeline_stage *stage )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    struct vertex_buffer *VB = &tnl->vb;
    GLuint inputs = VERT_BIT_POS | VERT_BIT_COLOR0;
@@ -379,7 +441,7 @@ static GLboolean radeon_run_tcl_render( GLcontext *ctx,
 
    /* TODO: separate this from the swtnl pipeline 
     */
-   if (rmesa->TclFallback)
+   if (rmesa->radeon.TclFallback)
       return GL_TRUE;	/* fallback to software t&l */
 
    if (VB->Count == 0)
@@ -411,6 +473,8 @@ static GLboolean radeon_run_tcl_render( GLcontext *ctx,
    }
 
    radeonReleaseArrays( ctx, ~0 );
+   GLuint emit_end = radeonEnsureEmitSize( ctx, inputs )
+     + rmesa->radeon.cmdbuf.cs->cdw;
    radeonEmitArrays( ctx, inputs );
 
    rmesa->tcl.Elts = VB->Elts;
@@ -430,6 +494,10 @@ static GLboolean radeon_run_tcl_render( GLcontext *ctx,
 	 radeonEmitPrimitive( ctx, start, start+length, prim );
    }
 
+   if (emit_end < rmesa->radeon.cmdbuf.cs->cdw)
+      WARN_ONCE("Rendering was %d commands larger than predicted size."
+	  " We might overflow  command buffer.\n", rmesa->radeon.cmdbuf.cs->cdw - emit_end);
+
    return GL_FALSE;		/* finished the pipe */
 }
 
@@ -461,7 +529,7 @@ const struct tnl_pipeline_stage _radeon_tcl_stage =
 
 static void transition_to_swtnl( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    GLuint se_cntl;
 
@@ -490,7 +558,7 @@ static void transition_to_swtnl( GLcontext *ctx )
 
 static void transition_to_hwtnl( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    GLuint se_coord_fmt = rmesa->hw.set.cmd[SET_SE_COORDFMT];
 
@@ -509,17 +577,17 @@ static void transition_to_hwtnl( GLcontext *ctx )
 
    tnl->Driver.NotifyMaterialChange = radeonUpdateMaterial;
 
-   if ( rmesa->dma.flush )			
-      rmesa->dma.flush( rmesa );	
+   if ( rmesa->radeon.dma.flush )			
+      rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	
 
-   rmesa->dma.flush = NULL;
+   rmesa->radeon.dma.flush = NULL;
    rmesa->swtcl.vertex_format = 0;
    
-   if (rmesa->swtcl.indexed_verts.buf) 
-      radeonReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, 
-			      __FUNCTION__ );
+   //   if (rmesa->swtcl.indexed_verts.buf) 
+   //      radeonReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, 
+   //			      __FUNCTION__ );
 
-   if (RADEON_DEBUG & DEBUG_FALLBACKS) 
+   if (RADEON_DEBUG & RADEON_FALLBACKS)
       fprintf(stderr, "Radeon end tcl fallback\n");
 }
 
@@ -550,22 +618,22 @@ static char *getFallbackString(GLuint bit)
 
 void radeonTclFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   GLuint oldfallback = rmesa->TclFallback;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   GLuint oldfallback = rmesa->radeon.TclFallback;
 
    if (mode) {
-      rmesa->TclFallback |= bit;
+      rmesa->radeon.TclFallback |= bit;
       if (oldfallback == 0) {
-	 if (RADEON_DEBUG & DEBUG_FALLBACKS) 
+	 if (RADEON_DEBUG & RADEON_FALLBACKS)
 	    fprintf(stderr, "Radeon begin tcl fallback %s\n",
 		    getFallbackString( bit ));
 	 transition_to_swtnl( ctx );
       }
    }
    else {
-      rmesa->TclFallback &= ~bit;
+      rmesa->radeon.TclFallback &= ~bit;
       if (oldfallback == bit) {
-	 if (RADEON_DEBUG & DEBUG_FALLBACKS) 
+	 if (RADEON_DEBUG & RADEON_FALLBACKS)
 	    fprintf(stderr, "Radeon end tcl fallback %s\n",
 		    getFallbackString( bit ));
 	 transition_to_hwtnl( ctx );
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.c b/src/mesa/drivers/dri/radeon/radeon_tex.c
index b0aec21670..99865fff27 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tex.c
@@ -44,6 +44,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/texobj.h"
 
 #include "radeon_context.h"
+#include "radeon_mipmap_tree.h"
 #include "radeon_state.h"
 #include "radeon_ioctl.h"
 #include "radeon_swtcl.h"
@@ -170,10 +171,13 @@ static void radeonSetTexFilter( radeonTexObjPtr t, GLenum minf, GLenum magf )
 {
    GLuint anisotropy = (t->pp_txfilter & RADEON_MAX_ANISO_MASK);
 
+   /* Force revalidation to account for switches from/to mipmapping. */
+   t->validated = GL_FALSE;
+
    t->pp_txfilter &= ~(RADEON_MIN_FILTER_MASK | RADEON_MAG_FILTER_MASK);
 
    /* r100 chips can't handle mipmaps/aniso for cubemap/volume textures */
-   if ( t->base.tObj->Target == GL_TEXTURE_CUBE_MAP ) {
+   if ( t->base.Target == GL_TEXTURE_CUBE_MAP ) {
       switch ( minf ) {
       case GL_NEAREST:
       case GL_NEAREST_MIPMAP_NEAREST:
@@ -239,442 +243,27 @@ static void radeonSetTexFilter( radeonTexObjPtr t, GLenum minf, GLenum magf )
    }
 }
 
-static void radeonSetTexBorderColor( radeonTexObjPtr t, GLubyte c[4] )
+static void radeonSetTexBorderColor( radeonTexObjPtr t, const GLfloat color[4] )
 {
+   GLubyte c[4];
+   CLAMPED_FLOAT_TO_UBYTE(c[0], color[0]);
+   CLAMPED_FLOAT_TO_UBYTE(c[1], color[1]);
+   CLAMPED_FLOAT_TO_UBYTE(c[2], color[2]);
+   CLAMPED_FLOAT_TO_UBYTE(c[3], color[3]);
    t->pp_border_color = radeonPackColor( 4, c[0], c[1], c[2], c[3] );
 }
 
-
-/**
- * Allocate space for and load the mesa images into the texture memory block.
- * This will happen before drawing with a new texture, or drawing with a
- * texture after it was swapped out or teximaged again.
- */
-
-static radeonTexObjPtr radeonAllocTexObj( struct gl_texture_object *texObj )
-{
-   radeonTexObjPtr t;
-
-   t = CALLOC_STRUCT( radeon_tex_obj );
-   texObj->DriverData = t;
-   if ( t != NULL ) {
-      if ( RADEON_DEBUG & DEBUG_TEXTURE ) {
-	 fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, (void *)texObj, (void *)t );
-      }
-
-      /* Initialize non-image-dependent parts of the state:
-       */
-      t->base.tObj = texObj;
-      t->border_fallback = GL_FALSE;
-
-      t->pp_txfilter = RADEON_BORDER_MODE_OGL;
-      t->pp_txformat = (RADEON_TXFORMAT_ENDIAN_NO_SWAP |
-			RADEON_TXFORMAT_PERSPECTIVE_ENABLE);
-
-      make_empty_list( & t->base );
-
-      radeonSetTexWrap( t, texObj->WrapS, texObj->WrapT );
-      radeonSetTexMaxAnisotropy( t, texObj->MaxAnisotropy );
-      radeonSetTexFilter( t, texObj->MinFilter, texObj->MagFilter );
-      radeonSetTexBorderColor( t, texObj->_BorderChan );
-   }
-
-   return t;
-}
-
-
-static const struct gl_texture_format *
-radeonChooseTextureFormat( GLcontext *ctx, GLint internalFormat,
-                           GLenum format, GLenum type )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   const GLboolean do32bpt =
-       ( rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_32 );
-   const GLboolean force16bpt =
-       ( rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FORCE_16 );
-   (void) format;
-
-   switch ( internalFormat ) {
-   case 4:
-   case GL_RGBA:
-   case GL_COMPRESSED_RGBA:
-      switch ( type ) {
-      case GL_UNSIGNED_INT_10_10_10_2:
-      case GL_UNSIGNED_INT_2_10_10_10_REV:
-	 return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_argb1555;
-      case GL_UNSIGNED_SHORT_4_4_4_4:
-      case GL_UNSIGNED_SHORT_4_4_4_4_REV:
-	 return _dri_texformat_argb4444;
-      case GL_UNSIGNED_SHORT_5_5_5_1:
-      case GL_UNSIGNED_SHORT_1_5_5_5_REV:
-	 return _dri_texformat_argb1555;
-      default:
-         return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_argb4444;
-      }
-
-   case 3:
-   case GL_RGB:
-   case GL_COMPRESSED_RGB:
-      switch ( type ) {
-      case GL_UNSIGNED_SHORT_4_4_4_4:
-      case GL_UNSIGNED_SHORT_4_4_4_4_REV:
-	 return _dri_texformat_argb4444;
-      case GL_UNSIGNED_SHORT_5_5_5_1:
-      case GL_UNSIGNED_SHORT_1_5_5_5_REV:
-	 return _dri_texformat_argb1555;
-      case GL_UNSIGNED_SHORT_5_6_5:
-      case GL_UNSIGNED_SHORT_5_6_5_REV:
-	 return _dri_texformat_rgb565;
-      default:
-         return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_rgb565;
-      }
-
-   case GL_RGBA8:
-   case GL_RGB10_A2:
-   case GL_RGBA12:
-   case GL_RGBA16:
-      return !force16bpt ?
-	  _dri_texformat_argb8888 : _dri_texformat_argb4444;
-
-   case GL_RGBA4:
-   case GL_RGBA2:
-      return _dri_texformat_argb4444;
-
-   case GL_RGB5_A1:
-      return _dri_texformat_argb1555;
-
-   case GL_RGB8:
-   case GL_RGB10:
-   case GL_RGB12:
-   case GL_RGB16:
-      return !force16bpt ? _dri_texformat_argb8888 : _dri_texformat_rgb565;
-
-   case GL_RGB5:
-   case GL_RGB4:
-   case GL_R3_G3_B2:
-      return _dri_texformat_rgb565;
-
-   case GL_ALPHA:
-   case GL_ALPHA4:
-   case GL_ALPHA8:
-   case GL_ALPHA12:
-   case GL_ALPHA16:
-   case GL_COMPRESSED_ALPHA:
-      return _dri_texformat_a8;
-
-   case 1:
-   case GL_LUMINANCE:
-   case GL_LUMINANCE4:
-   case GL_LUMINANCE8:
-   case GL_LUMINANCE12:
-   case GL_LUMINANCE16:
-   case GL_COMPRESSED_LUMINANCE:
-      return _dri_texformat_l8;
-
-   case 2:
-   case GL_LUMINANCE_ALPHA:
-   case GL_LUMINANCE4_ALPHA4:
-   case GL_LUMINANCE6_ALPHA2:
-   case GL_LUMINANCE8_ALPHA8:
-   case GL_LUMINANCE12_ALPHA4:
-   case GL_LUMINANCE12_ALPHA12:
-   case GL_LUMINANCE16_ALPHA16:
-   case GL_COMPRESSED_LUMINANCE_ALPHA:
-      return _dri_texformat_al88;
-
-   case GL_INTENSITY:
-   case GL_INTENSITY4:
-   case GL_INTENSITY8:
-   case GL_INTENSITY12:
-   case GL_INTENSITY16:
-   case GL_COMPRESSED_INTENSITY:
-      return _dri_texformat_i8;
-
-   case GL_YCBCR_MESA:
-      if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
-          type == GL_UNSIGNED_BYTE)
-         return &_mesa_texformat_ycbcr;
-      else
-         return &_mesa_texformat_ycbcr_rev;
-
-   case GL_RGB_S3TC:
-   case GL_RGB4_S3TC:
-   case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
-      return &_mesa_texformat_rgb_dxt1;
-
-   case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
-      return &_mesa_texformat_rgba_dxt1;
-
-   case GL_RGBA_S3TC:
-   case GL_RGBA4_S3TC:
-   case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
-      return &_mesa_texformat_rgba_dxt3;
-
-   case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
-      return &_mesa_texformat_rgba_dxt5;
-
-   default:
-      _mesa_problem(ctx, "unexpected texture format in %s", __FUNCTION__);
-      return NULL;
-   }
-
-   return NULL; /* never get here */
-}
-
-
-static void radeonTexImage1D( GLcontext *ctx, GLenum target, GLint level,
-                              GLint internalFormat,
-                              GLint width, GLint border,
-                              GLenum format, GLenum type, const GLvoid *pixels,
-                              const struct gl_pixelstore_attrib *packing,
-                              struct gl_texture_object *texObj,
-                              struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) radeonAllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage1D");
-         return;
-      }
-   }
-
-   /* Note, this will call ChooseTextureFormat */
-   _mesa_store_teximage1d(ctx, target, level, internalFormat,
-                          width, border, format, type, pixels,
-                          &ctx->Unpack, texObj, texImage);
-
-   t->dirty_images[0] |= (1 << level);
-}
-
-
-static void radeonTexSubImage1D( GLcontext *ctx, GLenum target, GLint level,
-                                 GLint xoffset,
-                                 GLsizei width,
-                                 GLenum format, GLenum type,
-                                 const GLvoid *pixels,
-                                 const struct gl_pixelstore_attrib *packing,
-                                 struct gl_texture_object *texObj,
-                                 struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-
-   assert( t ); /* this _should_ be true */
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) radeonAllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage1D");
-         return;
-      }
-   }
-
-   _mesa_store_texsubimage1d(ctx, target, level, xoffset, width,
-			     format, type, pixels, packing, texObj,
-			     texImage);
-
-   t->dirty_images[0] |= (1 << level);
-}
-
-
-static void radeonTexImage2D( GLcontext *ctx, GLenum target, GLint level,
-                              GLint internalFormat,
-                              GLint width, GLint height, GLint border,
-                              GLenum format, GLenum type, const GLvoid *pixels,
-                              const struct gl_pixelstore_attrib *packing,
-                              struct gl_texture_object *texObj,
-                              struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-   GLuint face;
-
-   /* which cube face or ordinary 2D image */
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-      ASSERT(face < 6);
-      break;
-   default:
-      face = 0;
-   }
-
-   if ( t != NULL ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) radeonAllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage2D");
-         return;
-      }
-   }
-
-   /* Note, this will call ChooseTextureFormat */
-   _mesa_store_teximage2d(ctx, target, level, internalFormat,
-                          width, height, border, format, type, pixels,
-                          &ctx->Unpack, texObj, texImage);
-
-   t->dirty_images[face] |= (1 << level);
-}
-
-
-static void radeonTexSubImage2D( GLcontext *ctx, GLenum target, GLint level,
-                                 GLint xoffset, GLint yoffset,
-                                 GLsizei width, GLsizei height,
-                                 GLenum format, GLenum type,
-                                 const GLvoid *pixels,
-                                 const struct gl_pixelstore_attrib *packing,
-                                 struct gl_texture_object *texObj,
-                                 struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-   GLuint face;
-
-   /* which cube face or ordinary 2D image */
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-      ASSERT(face < 6);
-      break;
-   default:
-      face = 0;
-   }
-
-   assert( t ); /* this _should_ be true */
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) radeonAllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage2D");
-         return;
-      }
-   }
-
-   _mesa_store_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
-			     height, format, type, pixels, packing, texObj,
-			     texImage);
-
-   t->dirty_images[face] |= (1 << level);
-}
-
-static void radeonCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
-                              GLint internalFormat,
-                              GLint width, GLint height, GLint border,
-                              GLsizei imageSize, const GLvoid *data,
-                              struct gl_texture_object *texObj,
-                              struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-   GLuint face;
-
-   /* which cube face or ordinary 2D image */
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-      ASSERT(face < 6);
-      break;
-   default:
-      face = 0;
-   }
-
-   if ( t != NULL ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) radeonAllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexImage2D");
-         return;
-      }
-   }
-
-   /* Note, this will call ChooseTextureFormat */
-   _mesa_store_compressed_teximage2d(ctx, target, level, internalFormat, width,
-                                 height, border, imageSize, data, texObj, texImage);
-
-   t->dirty_images[face] |= (1 << level);
-}
-
-
-static void radeonCompressedTexSubImage2D( GLcontext *ctx, GLenum target, GLint level,
-                                 GLint xoffset, GLint yoffset,
-                                 GLsizei width, GLsizei height,
-                                 GLenum format,
-                                 GLsizei imageSize, const GLvoid *data,
-                                 struct gl_texture_object *texObj,
-                                 struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-   GLuint face;
-
-
-   /* which cube face or ordinary 2D image */
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-      ASSERT(face < 6);
-      break;
-   default:
-      face = 0;
-   }
-
-   assert( t ); /* this _should_ be true */
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) radeonAllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexSubImage2D");
-         return;
-      }
-   }
-
-   _mesa_store_compressed_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
-                                 height, format, imageSize, data, texObj, texImage);
-
-   t->dirty_images[face] |= (1 << level);
-}
-
 #define SCALED_FLOAT_TO_BYTE( x, scale ) \
 		(((GLuint)((255.0F / scale) * (x))) / 2)
 
 static void radeonTexEnv( GLcontext *ctx, GLenum target,
 			  GLenum pname, const GLfloat *param )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint unit = ctx->Texture.CurrentUnit;
    struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
 
-   if ( RADEON_DEBUG & DEBUG_STATE ) {
+   if ( RADEON_DEBUG & RADEON_STATE ) {
       fprintf( stderr, "%s( %s )\n",
 	       __FUNCTION__, _mesa_lookup_enum_by_nr( pname ) );
    }
@@ -701,7 +290,7 @@ static void radeonTexEnv( GLcontext *ctx, GLenum target,
        * functions, one mapping [-1.0,0.0] to [-128,0] and one mapping
        * [0.0,4.0] to [0,127].
        */
-      min = driQueryOptionb (&rmesa->optionCache, "no_neg_lod_bias") ?
+      min = driQueryOptionb (&rmesa->radeon.optionCache, "no_neg_lod_bias") ?
 	  0.0 : -1.0;
       bias = CLAMP( *param, min, 4.0 );
       if ( bias == 0 ) {
@@ -734,12 +323,10 @@ static void radeonTexParameter( GLcontext *ctx, GLenum target,
 				struct gl_texture_object *texObj,
 				GLenum pname, const GLfloat *params )
 {
-   radeonTexObjPtr t = (radeonTexObjPtr) texObj->DriverData;
+   radeonTexObj* t = radeon_tex_obj(texObj);
 
-   if ( RADEON_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
-      fprintf( stderr, "%s( %s )\n", __FUNCTION__,
+   radeon_print(RADEON_TEXTURE, RADEON_VERBOSE, "%s( %s )\n", __FUNCTION__,
 	       _mesa_lookup_enum_by_nr( pname ) );
-   }
 
    switch ( pname ) {
    case GL_TEXTURE_MIN_FILTER:
@@ -755,64 +342,57 @@ static void radeonTexParameter( GLcontext *ctx, GLenum target,
       break;
 
    case GL_TEXTURE_BORDER_COLOR:
-      radeonSetTexBorderColor( t, texObj->_BorderChan );
+      radeonSetTexBorderColor( t, texObj->BorderColor );
       break;
 
    case GL_TEXTURE_BASE_LEVEL:
    case GL_TEXTURE_MAX_LEVEL:
    case GL_TEXTURE_MIN_LOD:
    case GL_TEXTURE_MAX_LOD:
+
       /* This isn't the most efficient solution but there doesn't appear to
        * be a nice alternative.  Since there's no LOD clamping,
        * we just have to rely on loading the right subset of mipmap levels
        * to simulate a clamped LOD.
        */
-      driSwapOutTextureObject( (driTextureObject *) t );
+      if (t->mt) {
+         radeon_miptree_unreference(t->mt);
+	 t->mt = 0;
+	 t->validated = GL_FALSE;
+      }
       break;
 
    default:
       return;
    }
-
-   /* Mark this texobj as dirty (one bit per tex unit)
-    */
-   t->dirty_state = TEX_ALL;
-}
-
-
-static void radeonBindTexture( GLcontext *ctx, GLenum target,
-			       struct gl_texture_object *texObj )
-{
-   if ( RADEON_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
-      fprintf( stderr, "%s( %p ) unit=%d\n", __FUNCTION__, (void *)texObj,
-	       ctx->Texture.CurrentUnit );
-   }
-
-   assert( (target != GL_TEXTURE_1D && target != GL_TEXTURE_2D &&
-            target != GL_TEXTURE_RECTANGLE_NV && target != GL_TEXTURE_CUBE_MAP) ||
-           (texObj->DriverData != NULL) );
 }
 
-
 static void radeonDeleteTexture( GLcontext *ctx,
 				 struct gl_texture_object *texObj )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   radeonTexObj* t = radeon_tex_obj(texObj);
+   int i;
 
-   if ( RADEON_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
-      fprintf( stderr, "%s( %p (target = %s) )\n", __FUNCTION__, (void *)texObj,
+   radeon_print(RADEON_TEXTURE, RADEON_NORMAL,
+	 "%s( %p (target = %s) )\n", __FUNCTION__, (void *)texObj,
 	       _mesa_lookup_enum_by_nr( texObj->Target ) );
-   }
-
-   if ( t != NULL ) {
-      if ( rmesa ) {
-         RADEON_FIREVERTICES( rmesa );
-      }
 
-      driDestroyTextureObject( t );
+   if ( rmesa ) {
+     radeon_firevertices(&rmesa->radeon);
+     for ( i = 0 ; i < rmesa->radeon.glCtx->Const.MaxTextureUnits ; i++ ) {
+       if ( t == rmesa->state.texture.unit[i].texobj ) {
+	 rmesa->state.texture.unit[i].texobj = NULL;
+	 rmesa->hw.tex[i].dirty = GL_FALSE;
+	 rmesa->hw.cube[i].dirty = GL_FALSE;
+       }
+     }
    }
 
+   if (t->mt) {
+      radeon_miptree_unreference(t->mt);
+      t->mt = 0;
+   }
    /* Free mipmap images and the texture object itself */
    _mesa_delete_texture_object(ctx, texObj);
 }
@@ -832,7 +412,7 @@ static void radeonTexGen( GLcontext *ctx,
 			  GLenum pname,
 			  const GLfloat *params )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint unit = ctx->Texture.CurrentUnit;
    rmesa->recheck_texgen[unit] = GL_TRUE;
 }
@@ -846,29 +426,40 @@ static void radeonTexGen( GLcontext *ctx,
 static struct gl_texture_object *
 radeonNewTextureObject( GLcontext *ctx, GLuint name, GLenum target )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   struct gl_texture_object *obj;
-   obj = _mesa_new_texture_object(ctx, name, target);
-   if (!obj)
-      return NULL;
-   obj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
-   radeonAllocTexObj( obj );
-   return obj;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   radeonTexObj* t = CALLOC_STRUCT(radeon_tex_obj);
+
+   _mesa_initialize_texture_object(&t->base, name, target);
+   t->base.MaxAnisotropy = rmesa->radeon.initialMaxAnisotropy;
+
+   t->border_fallback = GL_FALSE;
+
+   t->pp_txfilter = RADEON_BORDER_MODE_OGL;
+   t->pp_txformat = (RADEON_TXFORMAT_ENDIAN_NO_SWAP |
+		     RADEON_TXFORMAT_PERSPECTIVE_ENABLE);
+   
+   radeonSetTexWrap( t, t->base.WrapS, t->base.WrapT );
+   radeonSetTexMaxAnisotropy( t, t->base.MaxAnisotropy );
+   radeonSetTexFilter( t, t->base.MinFilter, t->base.MagFilter );
+   radeonSetTexBorderColor( t, t->base.BorderColor );
+   return &t->base;
 }
 
 
+
 void radeonInitTextureFuncs( struct dd_function_table *functions )
 {
-   functions->ChooseTextureFormat	= radeonChooseTextureFormat;
+   functions->ChooseTextureFormat	= radeonChooseTextureFormat_mesa;
    functions->TexImage1D		= radeonTexImage1D;
    functions->TexImage2D		= radeonTexImage2D;
    functions->TexSubImage1D		= radeonTexSubImage1D;
    functions->TexSubImage2D		= radeonTexSubImage2D;
+   functions->GetTexImage               = radeonGetTexImage;
+   functions->GetCompressedTexImage     = radeonGetCompressedTexImage;
 
    functions->NewTextureObject		= radeonNewTextureObject;
-   functions->BindTexture		= radeonBindTexture;
+   //   functions->BindTexture		= radeonBindTexture;
    functions->DeleteTexture		= radeonDeleteTexture;
-   functions->IsTextureResident		= driIsTextureResident;
 
    functions->TexEnv			= radeonTexEnv;
    functions->TexParameter		= radeonTexParameter;
@@ -877,5 +468,12 @@ void radeonInitTextureFuncs( struct dd_function_table *functions )
    functions->CompressedTexImage2D	= radeonCompressedTexImage2D;
    functions->CompressedTexSubImage2D	= radeonCompressedTexSubImage2D;
 
+   functions->GenerateMipmap = radeonGenerateMipmap;
+
+   functions->NewTextureImage = radeonNewTextureImage;
+   functions->FreeTexImageData = radeonFreeTexImageData;
+   functions->MapTexture = radeonMapTexture;
+   functions->UnmapTexture = radeonUnmapTexture;
+
    driInitTextureFormats();
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.h b/src/mesa/drivers/dri/radeon/radeon_tex.h
index 8000880828..a4aaddc74f 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex.h
+++ b/src/mesa/drivers/dri/radeon/radeon_tex.h
@@ -41,12 +41,16 @@ extern void radeonSetTexOffset(__DRIcontext *pDRICtx, GLint texname,
                                unsigned long long offset, GLint depth,
                                GLuint pitch);
 
+extern void radeonSetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv);
+extern void radeonSetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_format,
+			       __DRIdrawable *dPriv);
+
 extern void radeonUpdateTextureState( GLcontext *ctx );
 
-extern int radeonUploadTexImages( radeonContextPtr rmesa, radeonTexObjPtr t,
+extern int radeonUploadTexImages( r100ContextPtr rmesa, radeonTexObjPtr t,
 				  GLuint face );
 
-extern void radeonDestroyTexObj( radeonContextPtr rmesa, radeonTexObjPtr t );
+extern void radeonDestroyTexObj( r100ContextPtr rmesa, radeonTexObjPtr t );
 
 extern void radeonInitTextureFuncs( struct dd_function_table *functions );
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_texmem.c b/src/mesa/drivers/dri/radeon/radeon_texmem.c
deleted file mode 100644
index 5f7bbe6a4c..0000000000
--- a/src/mesa/drivers/dri/radeon/radeon_texmem.c
+++ /dev/null
@@ -1,404 +0,0 @@
-/**************************************************************************
-
-Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
-                     VA Linux Systems Inc., Fremont, California.
-
-All Rights Reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation on the rights to use, copy, modify, merge, publish,
-distribute, sub license, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NON-INFRINGEMENT. IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR THEIR
-SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
-IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Kevin E. Martin <martin@valinux.com>
- *   Gareth Hughes <gareth@valinux.com>
- *
- */
-#include <errno.h> 
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/context.h"
-#include "main/macros.h"
-
-#include "radeon_context.h"
-#include "radeon_ioctl.h"
-#include "radeon_tex.h"
-
-#include <unistd.h>  /* for usleep() */
-
-
-/**
- * Destroy any device-dependent state associated with the texture.  This may
- * include NULLing out hardware state that points to the texture.
- */
-void
-radeonDestroyTexObj( radeonContextPtr rmesa, radeonTexObjPtr t )
-{
-   if ( RADEON_DEBUG & DEBUG_TEXTURE ) {
-      fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, (void *)t, (void *)t->base.tObj );
-   }
-
-   if ( rmesa != NULL ) {
-      unsigned   i;
-
-
-      for ( i = 0 ; i < rmesa->glCtx->Const.MaxTextureUnits ; i++ ) {
-	 if ( t == rmesa->state.texture.unit[i].texobj ) {
-	    rmesa->state.texture.unit[i].texobj = NULL;
-	 }
-      }
-   }
-}
-
-
-/* ------------------------------------------------------------
- * Texture image conversions
- */
-
-
-static void radeonUploadRectSubImage( radeonContextPtr rmesa,
-				      radeonTexObjPtr t, 
-				      struct gl_texture_image *texImage,
-				      GLint x, GLint y, 
-				      GLint width, GLint height )
-{
-   const struct gl_texture_format *texFormat = texImage->TexFormat;
-   int blit_format, dstPitch, done;
-
-   switch ( texFormat->TexelBytes ) {
-   case 1:
-      blit_format = RADEON_GMC_DST_8BPP_CI;
-      break;
-   case 2:
-      blit_format = RADEON_GMC_DST_16BPP;
-      break;
-   case 4:
-      blit_format = RADEON_GMC_DST_32BPP;
-      break;
-   default:
-      fprintf( stderr, "radeonUploadRectSubImage: unknown blit_format (texelbytes=%d)\n", 
-      	       texFormat->TexelBytes);
-      return;
-   }
-
-   t->image[0][0].data = texImage->Data;
-
-   /* Currently don't need to cope with small pitches.
-    */
-   width = texImage->Width;
-   height = texImage->Height;
-   dstPitch = t->pp_txpitch + 32;
-
-   {	/* FIXME: prefer GART-texturing if possible */
-      /* Data not in GART memory, or bad pitch.
-       */
-      for (done = 0; done < height ; ) {
-	 struct radeon_dma_region region;
-	 int lines = MIN2( height - done, RADEON_BUFFER_SIZE / dstPitch );
-	 int src_pitch;
-	 char *tex;
-
-         src_pitch = texImage->RowStride * texFormat->TexelBytes;
-
-	 tex = (char *)texImage->Data + done * src_pitch;
-
-	 memset(&region, 0, sizeof(region));
-	 radeonAllocDmaRegion( rmesa, &region, lines * dstPitch, 1024 );
-
-	 /* Copy texdata to dma:
-	  */
-	 if (0)
-	    fprintf(stderr, "%s: src_pitch %d dst_pitch %d\n",
-		    __FUNCTION__, src_pitch, dstPitch);
-
-	 if (src_pitch == dstPitch) {
-	    memcpy( region.address + region.start, tex, lines * src_pitch );
-	 } 
-	 else {
-	    char *buf = region.address + region.start;
-	    int i;
-	    for (i = 0 ; i < lines ; i++) {
-	       memcpy( buf, tex, src_pitch );
-	       buf += dstPitch;
-	       tex += src_pitch;
-	    }
-	 }
-
-	 radeonEmitWait( rmesa, RADEON_WAIT_3D );
-
-	 
-
-	 /* Blit to framebuffer
-	  */
-	 radeonEmitBlit( rmesa,
-		       blit_format,
-		       dstPitch, GET_START( &region ),
-		       dstPitch, t->bufAddr,
-		       0, 0,
-		       0, done,
-		       width, lines );
-	 
-	 radeonEmitWait( rmesa, RADEON_WAIT_2D );
-
-	 radeonReleaseDmaRegion( rmesa, &region, __FUNCTION__ );
-	 done += lines;
-      }
-   }
-}
-
-
-/**
- * Upload the texture image associated with texture \a t at the specified
- * level at the address relative to \a start.
- */
-static void uploadSubImage( radeonContextPtr rmesa, radeonTexObjPtr t, 
-			    GLint hwlevel,
-			    GLint x, GLint y, GLint width, GLint height,
-			    GLuint face )
-{
-   struct gl_texture_image *texImage = NULL;
-   GLuint offset;
-   GLint imageWidth, imageHeight;
-   GLint ret;
-   drm_radeon_texture_t tex;
-   drm_radeon_tex_image_t tmp;
-   const int level = hwlevel + t->base.firstLevel;
-
-   if ( RADEON_DEBUG & DEBUG_TEXTURE ) {
-      fprintf( stderr, "%s( %p, %p ) level/width/height/face = %d/%d/%d/%u\n", 
-	       __FUNCTION__, (void *)t, (void *)t->base.tObj, level, width, height, face );
-   }
-
-   ASSERT(face < 6);
-
-   /* Ensure we have a valid texture to upload */
-   if ( ( hwlevel < 0 ) || ( hwlevel >= RADEON_MAX_TEXTURE_LEVELS ) ) {
-      _mesa_problem(NULL, "bad texture level in %s", __FUNCTION__);
-      return;
-   }
-
-   texImage = t->base.tObj->Image[face][level];
-
-   if ( !texImage ) {
-      if ( RADEON_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: texImage %d is NULL!\n", __FUNCTION__, level );
-      return;
-   }
-   if ( !texImage->Data ) {
-      if ( RADEON_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: image data is NULL!\n", __FUNCTION__ );
-      return;
-   }
-
-
-   if (t->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-      assert(level == 0);
-      assert(hwlevel == 0);
-      if ( RADEON_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: image data is rectangular\n", __FUNCTION__);
-      radeonUploadRectSubImage( rmesa, t, texImage, x, y, width, height );
-      return;
-   }
-
-   imageWidth = texImage->Width;
-   imageHeight = texImage->Height;
-
-   offset = t->bufAddr + t->base.totalSize * face / 6;
-
-   if ( RADEON_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
-      GLint imageX = 0;
-      GLint imageY = 0;
-      GLint blitX = t->image[face][hwlevel].x;
-      GLint blitY = t->image[face][hwlevel].y;
-      GLint blitWidth = t->image[face][hwlevel].width;
-      GLint blitHeight = t->image[face][hwlevel].height;
-      fprintf( stderr, "   upload image: %d,%d at %d,%d\n",
-	       imageWidth, imageHeight, imageX, imageY );
-      fprintf( stderr, "   upload  blit: %d,%d at %d,%d\n",
-	       blitWidth, blitHeight, blitX, blitY );
-      fprintf( stderr, "       blit ofs: 0x%07x level: %d/%d\n",
-	       (GLuint)offset, hwlevel, level );
-   }
-
-   t->image[face][hwlevel].data = texImage->Data;
-
-   /* Init the DRM_RADEON_TEXTURE command / drm_radeon_texture_t struct.
-    * NOTE: we're always use a 1KB-wide blit and I8 texture format.
-    * We used to use 1, 2 and 4-byte texels and used to use the texture
-    * width to dictate the blit width - but that won't work for compressed
-    * textures. (Brian)
-    * NOTE: can't do that with texture tiling. (sroland)
-    */
-   tex.offset = offset;
-   tex.image = &tmp;
-   /* copy (x,y,width,height,data) */
-   memcpy( &tmp, &t->image[face][hwlevel], sizeof(drm_radeon_tex_image_t) );
-
-   if (texImage->TexFormat->TexelBytes) {
-      /* use multi-byte upload scheme */
-      tex.height = imageHeight;
-      tex.width = imageWidth;
-      tex.format = t->pp_txformat & RADEON_TXFORMAT_FORMAT_MASK;
-      tex.pitch = MAX2((texImage->Width * texImage->TexFormat->TexelBytes) / 64, 1);
-      tex.offset += tmp.x & ~1023;
-      tmp.x = tmp.x % 1024;
-      if (t->tile_bits & RADEON_TXO_MICRO_TILE_X2) {
-	 /* need something like "tiled coordinates" ? */
-	 tmp.y = tmp.x / (tex.pitch * 128) * 2;
-	 tmp.x = tmp.x % (tex.pitch * 128) / 2 / texImage->TexFormat->TexelBytes;
-	 tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
-      }
-      else {
-	 tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
-      }
-      if ((t->tile_bits & RADEON_TXO_MACRO_TILE) &&
-	 (texImage->Width * texImage->TexFormat->TexelBytes >= 256)) {
-	 /* radeon switches off macro tiling for small textures/mipmaps it seems */
-	 tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
-      }
-   }
-   else {
-      /* In case of for instance 8x8 texture (2x2 dxt blocks), padding after the first two blocks is
-         needed (only with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). */
-      /* set tex.height to 1/4 since 1 "macropixel" (dxt-block) has 4 real pixels. Needed
-         so the kernel module reads the right amount of data. */
-      tex.format = RADEON_TXFORMAT_I8; /* any 1-byte texel format */
-      tex.pitch = (BLIT_WIDTH_BYTES / 64);
-      tex.height = (imageHeight + 3) / 4;
-      tex.width = (imageWidth + 3) / 4;
-      switch (t->pp_txformat & RADEON_TXFORMAT_FORMAT_MASK) {
-      case RADEON_TXFORMAT_DXT1:
-         tex.width *= 8;
-         break;
-      case RADEON_TXFORMAT_DXT23:
-      case RADEON_TXFORMAT_DXT45:
-         tex.width *= 16;
-         break;
-      }
-   }
-
-   LOCK_HARDWARE( rmesa );
-   do {
-      ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_TEXTURE,
-                                 &tex, sizeof(drm_radeon_texture_t) );
-   } while ( ret == -EAGAIN );
-
-   UNLOCK_HARDWARE( rmesa );
-
-   if ( ret ) {
-      fprintf( stderr, "DRM_RADEON_TEXTURE: return = %d\n", ret );
-      fprintf( stderr, "   offset=0x%08x\n",
-	       offset );
-      fprintf( stderr, "   image width=%d height=%d\n",
-	       imageWidth, imageHeight );
-      fprintf( stderr, "    blit width=%d height=%d data=%p\n",
-	       t->image[face][hwlevel].width, t->image[face][hwlevel].height,
-	       t->image[face][hwlevel].data );
-      exit( 1 );
-   }
-}
-
-
-/**
- * Upload the texture images associated with texture \a t.  This might
- * require the allocation of texture memory.
- * 
- * \param rmesa Context pointer
- * \param t Texture to be uploaded
- * \param face Cube map face to be uploaded.  Zero for non-cube maps.
- */
-
-int radeonUploadTexImages( radeonContextPtr rmesa, radeonTexObjPtr t, GLuint face )
-{
-   int numLevels;
-
-   if ( !t || t->base.totalSize == 0 || t->image_override )
-      return 0;
-
-   if ( RADEON_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
-      fprintf( stderr, "%s( %p, %p ) sz=%d lvls=%d-%d\n", __FUNCTION__,
-	       (void *)rmesa->glCtx, (void *)t->base.tObj, t->base.totalSize,
-	       t->base.firstLevel, t->base.lastLevel );
-   }
-
-   numLevels = t->base.lastLevel - t->base.firstLevel + 1;
-
-   if (RADEON_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
-      radeonFinish( rmesa->glCtx );
-   }
-
-   LOCK_HARDWARE( rmesa );
-
-   if ( t->base.memBlock == NULL ) {
-      int heap;
-
-      heap = driAllocateTexture( rmesa->texture_heaps, rmesa->nr_heaps,
-				 (driTextureObject *) t );
-      if ( heap == -1 ) {
-	 UNLOCK_HARDWARE( rmesa );
-	 return -1;
-      }
-
-      /* Set the base offset of the texture image */
-      t->bufAddr = rmesa->radeonScreen->texOffset[heap] 
-	   + t->base.memBlock->ofs;
-      t->pp_txoffset = t->bufAddr;
-
-      if (!(t->base.tObj->Image[0][0]->IsClientData)) {
-	 /* hope it's safe to add that here... */
-	 t->pp_txoffset |= t->tile_bits;
-      }
-
-      /* Mark this texobj as dirty on all units:
-       */
-      t->dirty_state = TEX_ALL;
-   }
-
-
-   /* Let the world know we've used this memory recently.
-    */
-   driUpdateTextureLRU( (driTextureObject *) t );
-   UNLOCK_HARDWARE( rmesa );
-
-
-   /* Upload any images that are new */
-   if (t->base.dirty_images[face]) {
-      int i;
-      for ( i = 0 ; i < numLevels ; i++ ) {
-         if ( (t->base.dirty_images[face] & (1 << (i+t->base.firstLevel))) != 0 ) {
-            uploadSubImage( rmesa, t, i, 0, 0, t->image[face][i].width,
-			    t->image[face][i].height, face );
-         }
-      }
-      t->base.dirty_images[face] = 0;
-   }
-
-   if (RADEON_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
-      radeonFinish( rmesa->glCtx );
-   }
-
-   return 0;
-}
diff --git a/src/mesa/drivers/dri/radeon/radeon_texstate.c b/src/mesa/drivers/dri/radeon/radeon_texstate.c
index 1e2f654add..9d252aa74c 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texstate.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texstate.c
@@ -39,10 +39,12 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/context.h"
 #include "main/macros.h"
 #include "main/texformat.h"
+#include "main/teximage.h"
 #include "main/texobj.h"
 #include "main/enums.h"
 
 #include "radeon_context.h"
+#include "radeon_mipmap_tree.h"
 #include "radeon_state.h"
 #include "radeon_ioctl.h"
 #include "radeon_swtcl.h"
@@ -75,10 +77,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define VALID_FORMAT(f) ( ((f) <= MESA_FORMAT_RGBA_DXT5) \
 			     && (tx_table[f].format != 0xffffffff) )
 
-static const struct {
+struct tx_table {
    GLuint format, filter;
-}
-tx_table[] =
+};
+
+static const struct tx_table tx_table[] =
 {
    _ALPHA(RGBA8888),
    _ALPHA_REV(RGBA8888),
@@ -111,252 +114,6 @@ tx_table[] =
 #undef _ALPHA
 #undef _INVALID
 
-/**
- * This function computes the number of bytes of storage needed for
- * the given texture object (all mipmap levels, all cube faces).
- * The \c image[face][level].x/y/width/height parameters for upload/blitting
- * are computed here.  \c pp_txfilter, \c pp_txformat, etc. will be set here
- * too.
- * 
- * \param rmesa Context pointer
- * \param tObj GL texture object whose images are to be posted to
- *                 hardware state.
- */
-static void radeonSetTexImages( radeonContextPtr rmesa,
-				struct gl_texture_object *tObj )
-{
-   radeonTexObjPtr t = (radeonTexObjPtr)tObj->DriverData;
-   const struct gl_texture_image *baseImage = tObj->Image[0][tObj->BaseLevel];
-   GLint curOffset, blitWidth;
-   GLint i, texelBytes;
-   GLint numLevels;
-   GLint log2Width, log2Height, log2Depth;
-
-   /* Set the hardware texture format
-    */
-   if ( !t->image_override ) {
-      t->pp_txformat &= ~(RADEON_TXFORMAT_FORMAT_MASK |
-                          RADEON_TXFORMAT_ALPHA_IN_MAP);
-      t->pp_txfilter &= ~RADEON_YUV_TO_RGB;
-
-      if ( VALID_FORMAT( baseImage->TexFormat->MesaFormat ) ) {
-         t->pp_txformat |= tx_table[ baseImage->TexFormat->MesaFormat ].format;
-         t->pp_txfilter |= tx_table[ baseImage->TexFormat->MesaFormat ].filter;
-      }
-      else {
-         _mesa_problem(NULL, "unexpected texture format in %s", __FUNCTION__);
-         return;
-      }
-   }
-
-   texelBytes = baseImage->TexFormat->TexelBytes;
-
-   /* Compute which mipmap levels we really want to send to the hardware.
-    */
-
-   if (tObj->Target != GL_TEXTURE_CUBE_MAP)
-      driCalculateTextureFirstLastLevel( (driTextureObject *) t );
-   else {
-      /* r100 can't handle mipmaps for cube/3d textures, so don't waste
-         memory for them */
-      t->base.firstLevel = t->base.lastLevel = tObj->BaseLevel;
-   }
-   log2Width  = tObj->Image[0][t->base.firstLevel]->WidthLog2;
-   log2Height = tObj->Image[0][t->base.firstLevel]->HeightLog2;
-   log2Depth  = tObj->Image[0][t->base.firstLevel]->DepthLog2;
-
-   numLevels = t->base.lastLevel - t->base.firstLevel + 1;
-
-   assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
-
-   /* Calculate mipmap offsets and dimensions for blitting (uploading)
-    * The idea is that we lay out the mipmap levels within a block of
-    * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
-    */
-   curOffset = 0;
-   blitWidth = BLIT_WIDTH_BYTES;
-   t->tile_bits = 0;
-
-   /* figure out if this texture is suitable for tiling. */
-   if (texelBytes && (tObj->Target != GL_TEXTURE_RECTANGLE_NV)) {
-      if (rmesa->texmicrotile && (baseImage->Height > 1)) {
-	 /* allow 32 (bytes) x 1 mip (which will use two times the space
-	    the non-tiled version would use) max if base texture is large enough */
-	 if ((numLevels == 1) ||
-	   (((baseImage->Width * texelBytes / baseImage->Height) <= 32) &&
-	       (baseImage->Width * texelBytes > 64)) ||
-	    ((baseImage->Width * texelBytes / baseImage->Height) <= 16)) {
-	    /* R100 has two microtile bits (only the txoffset reg, not the blitter)
-	       weird: X2 + OPT: 32bit correct, 16bit completely hosed
-		      X2: 32bit correct, 16bit correct
-		      OPT: 32bit large mips correct, small mips hosed, 16bit completely hosed */
-	    t->tile_bits |= RADEON_TXO_MICRO_TILE_X2 /*| RADEON_TXO_MICRO_TILE_OPT*/;
-	 }
-      }
-      if ((baseImage->Width * texelBytes >= 256) && (baseImage->Height >= 16)) {
-	 /* R100 disables macro tiling only if mip width is smaller than 256 bytes, and not
-	    in the case if height is smaller than 16 (not 100% sure), as does the r200,
-	    so need to disable macro tiling in that case */
-	 if ((numLevels == 1) || ((baseImage->Width * texelBytes / baseImage->Height) <= 4)) {
-	    t->tile_bits |= RADEON_TXO_MACRO_TILE;
-	 }
-      }
-   }
-
-   for (i = 0; i < numLevels; i++) {
-      const struct gl_texture_image *texImage;
-      GLuint size;
-
-      texImage = tObj->Image[0][i + t->base.firstLevel];
-      if ( !texImage )
-	 break;
-
-      /* find image size in bytes */
-      if (texImage->IsCompressed) {
-      /* need to calculate the size AFTER padding even though the texture is
-         submitted without padding.
-         Only handle pot textures currently - don't know if npot is even possible,
-         size calculation would certainly need (trivial) adjustments.
-         Align (and later pad) to 32byte, not sure what that 64byte blit width is
-         good for? */
-         if ((t->pp_txformat & RADEON_TXFORMAT_FORMAT_MASK) == RADEON_TXFORMAT_DXT1) {
-            /* RGB_DXT1/RGBA_DXT1, 8 bytes per block */
-            if ((texImage->Width + 3) < 8) /* width one block */
-               size = texImage->CompressedSize * 4;
-            else if ((texImage->Width + 3) < 16)
-               size = texImage->CompressedSize * 2;
-            else size = texImage->CompressedSize;
-         }
-         else /* DXT3/5, 16 bytes per block */
-            if ((texImage->Width + 3) < 8)
-               size = texImage->CompressedSize * 2;
-            else size = texImage->CompressedSize;
-      }
-      else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-	 size = ((texImage->Width * texelBytes + 63) & ~63) * texImage->Height;
-      }
-      else if (t->tile_bits & RADEON_TXO_MICRO_TILE_X2) {
-	 /* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
-	    though the actual offset may be different (if texture is less than
-	    32 bytes width) to the untiled case */
-	 int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
-	 size = (w * ((texImage->Height + 1) / 2)) * texImage->Depth;
-	 blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-      }
-      else {
-	 int w = (texImage->Width * texelBytes + 31) & ~31;
-	 size = w * texImage->Height * texImage->Depth;
-	 blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-      }
-      assert(size > 0);
-
-      /* Align to 32-byte offset.  It is faster to do this unconditionally
-       * (no branch penalty).
-       */
-
-      curOffset = (curOffset + 0x1f) & ~0x1f;
-
-      if (texelBytes) {
-	 t->image[0][i].x = curOffset; /* fix x and y coords up later together with offset */
-	 t->image[0][i].y = 0;
-	 t->image[0][i].width = MIN2(size / texelBytes, blitWidth);
-	 t->image[0][i].height = (size / texelBytes) / t->image[0][i].width;
-      }
-      else {
-         t->image[0][i].x = curOffset % BLIT_WIDTH_BYTES;
-         t->image[0][i].y = curOffset / BLIT_WIDTH_BYTES;
-         t->image[0][i].width  = MIN2(size, BLIT_WIDTH_BYTES);
-         t->image[0][i].height = size / t->image[0][i].width;     
-      }
-
-#if 0
-      /* for debugging only and only  applicable to non-rectangle targets */
-      assert(size % t->image[0][i].width == 0);
-      assert(t->image[0][i].x == 0
-             || (size < BLIT_WIDTH_BYTES && t->image[0][i].height == 1));
-#endif
-
-      if (0)
-         fprintf(stderr,
-                 "level %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
-                 i, texImage->Width, texImage->Height,
-                 t->image[0][i].x, t->image[0][i].y,
-                 t->image[0][i].width, t->image[0][i].height, size, curOffset);
-
-      curOffset += size;
-
-   }
-
-   /* Align the total size of texture memory block.
-    */
-   t->base.totalSize = (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
-
-   /* Setup remaining cube face blits, if needed */
-   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
-      const GLuint faceSize = t->base.totalSize;
-      GLuint face;
-      /* reuse face 0 x/y/width/height - just update the offset when uploading */
-      for (face = 1; face < 6; face++) {
-         for (i = 0; i < numLevels; i++) {
-            t->image[face][i].x =  t->image[0][i].x;
-            t->image[face][i].y =  t->image[0][i].y;
-            t->image[face][i].width  = t->image[0][i].width;
-            t->image[face][i].height = t->image[0][i].height;
-         }
-      }
-      t->base.totalSize = 6 * faceSize; /* total texmem needed */
-   }
-
-   /* Hardware state:
-    */
-   t->pp_txfilter &= ~RADEON_MAX_MIP_LEVEL_MASK;
-   t->pp_txfilter |= (numLevels - 1) << RADEON_MAX_MIP_LEVEL_SHIFT;
-
-   t->pp_txformat &= ~(RADEON_TXFORMAT_WIDTH_MASK |
-		       RADEON_TXFORMAT_HEIGHT_MASK |
-                       RADEON_TXFORMAT_CUBIC_MAP_ENABLE |
-                       RADEON_TXFORMAT_F5_WIDTH_MASK |
-                       RADEON_TXFORMAT_F5_HEIGHT_MASK);
-   t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_WIDTH_SHIFT) |
-		      (log2Height << RADEON_TXFORMAT_HEIGHT_SHIFT));
-
-   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
-      assert(log2Width == log2Height);
-      t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_F5_WIDTH_SHIFT) |
-                         (log2Height << RADEON_TXFORMAT_F5_HEIGHT_SHIFT) |
-                         (RADEON_TXFORMAT_CUBIC_MAP_ENABLE));
-      t->pp_cubic_faces = ((log2Width << RADEON_FACE_WIDTH_1_SHIFT) |
-                           (log2Height << RADEON_FACE_HEIGHT_1_SHIFT) |
-                           (log2Width << RADEON_FACE_WIDTH_2_SHIFT) |
-                           (log2Height << RADEON_FACE_HEIGHT_2_SHIFT) |
-                           (log2Width << RADEON_FACE_WIDTH_3_SHIFT) |
-                           (log2Height << RADEON_FACE_HEIGHT_3_SHIFT) |
-                           (log2Width << RADEON_FACE_WIDTH_4_SHIFT) |
-                           (log2Height << RADEON_FACE_HEIGHT_4_SHIFT));
-   }
-
-   t->pp_txsize = (((tObj->Image[0][t->base.firstLevel]->Width - 1) << 0) |
-                   ((tObj->Image[0][t->base.firstLevel]->Height - 1) << 16));
-
-   /* Only need to round to nearest 32 for textures, but the blitter
-    * requires 64-byte aligned pitches, and we may/may not need the
-    * blitter.   NPOT only!
-    */
-   if ( !t->image_override ) {
-      if (baseImage->IsCompressed)
-         t->pp_txpitch = (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
-      else
-         t->pp_txpitch = ((tObj->Image[0][t->base.firstLevel]->Width * texelBytes) + 63) & ~(63);
-      t->pp_txpitch -= 32;
-   }
-
-   t->dirty_state = TEX_ALL;
-
-   /* FYI: radeonUploadTexImages( rmesa, t ); used to be called here */
-}
-
-
-
 /* ================================================================
  * Texture combine functions
  */
@@ -503,7 +260,7 @@ do {							\
 
 static GLboolean radeonUpdateTextureEnv( GLcontext *ctx, int unit )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
    GLuint color_combine, alpha_combine;
    const GLuint color_combine0 = RADEON_COLOR_ARG_A_ZERO | RADEON_COLOR_ARG_B_ZERO
@@ -520,7 +277,7 @@ static GLboolean radeonUpdateTextureEnv( GLcontext *ctx, int unit )
    assert( (texUnit->_ReallyEnabled == 0)
 	   || (texUnit->_Current != NULL) );
 
-   if ( RADEON_DEBUG & DEBUG_TEXTURE ) {
+   if ( RADEON_DEBUG & RADEON_TEXTURE ) {
       fprintf( stderr, "%s( %p, %d )\n", __FUNCTION__, (void *)ctx, unit );
    }
 
@@ -846,22 +603,21 @@ static GLboolean radeonUpdateTextureEnv( GLcontext *ctx, int unit )
 void radeonSetTexOffset(__DRIcontext * pDRICtx, GLint texname,
                         unsigned long long offset, GLint depth, GLuint pitch)
 {
-	radeonContextPtr rmesa = pDRICtx->driverPrivate;
+	r100ContextPtr rmesa = pDRICtx->driverPrivate;
 	struct gl_texture_object *tObj =
-	    _mesa_lookup_texture(rmesa->glCtx, texname);
-	radeonTexObjPtr t;
+	    _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
+	radeonTexObjPtr t = radeon_tex_obj(tObj);
 
 	if (tObj == NULL)
 		return;
 
-	t = (radeonTexObjPtr) tObj->DriverData;
-
 	t->image_override = GL_TRUE;
 
 	if (!offset)
 		return;
-
-	t->pp_txoffset = offset;
+	
+	t->bo = NULL;
+	t->override_offset = offset;
 	t->pp_txpitch = pitch - 32;
 
 	switch (depth) {
@@ -881,6 +637,125 @@ void radeonSetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 	}
 }
 
+void radeonSetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_format,
+			 __DRIdrawable *dPriv)
+{
+	struct gl_texture_unit *texUnit;
+	struct gl_texture_object *texObj;
+	struct gl_texture_image *texImage;
+	struct radeon_renderbuffer *rb;
+	radeon_texture_image *rImage;
+	radeonContextPtr radeon;
+	r100ContextPtr rmesa;
+	struct radeon_framebuffer *rfb;
+	radeonTexObjPtr t;
+	uint32_t pitch_val;
+	uint32_t internalFormat, type, format;
+
+	type = GL_BGRA;
+	format = GL_UNSIGNED_BYTE;
+	internalFormat = (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT ? 3 : 4);
+
+	radeon = pDRICtx->driverPrivate;
+	rmesa = pDRICtx->driverPrivate;
+
+	rfb = dPriv->driverPrivate;
+        texUnit = &radeon->glCtx->Texture.Unit[radeon->glCtx->Texture.CurrentUnit];
+	texObj = _mesa_select_tex_object(radeon->glCtx, texUnit, target);
+        texImage = _mesa_get_tex_image(radeon->glCtx, texObj, target, 0);
+
+	rImage = get_radeon_texture_image(texImage);
+	t = radeon_tex_obj(texObj);
+        if (t == NULL) {
+    	    return;
+    	}
+
+	radeon_update_renderbuffers(pDRICtx, dPriv);
+	/* back & depth buffers are useless here; free them right away */
+	rb = (void*)rfb->base.Attachment[BUFFER_DEPTH].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+        rb->bo = NULL;
+	}
+	rb = (void*)rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+		rb->bo = NULL;
+	}
+	rb = rfb->color_rb[0];
+	if (rb->bo == NULL) {
+		/* Failed to get a BO for the buffer */
+		return;
+	}
+	
+	_mesa_lock_texture(radeon->glCtx, texObj);
+	if (t->bo) {
+		radeon_bo_unref(t->bo);
+		t->bo = NULL;
+	}
+	if (rImage->bo) {
+		radeon_bo_unref(rImage->bo);
+		rImage->bo = NULL;
+	}
+	if (t->mt) {
+		radeon_miptree_unreference(t->mt);
+		t->mt = NULL;
+	}
+	if (rImage->mt) {
+		radeon_miptree_unreference(rImage->mt);
+		rImage->mt = NULL;
+	}
+	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
+				   rb->base.Width, rb->base.Height, 1, 0, rb->cpp);
+	texImage->RowStride = rb->pitch / rb->cpp;
+	texImage->TexFormat = radeonChooseTextureFormat(radeon->glCtx,
+							internalFormat,
+							type, format, 0);
+	rImage->bo = rb->bo;
+	radeon_bo_ref(rImage->bo);
+	t->bo = rb->bo;
+	radeon_bo_ref(t->bo);
+	t->tile_bits = 0;
+	t->image_override = GL_TRUE;
+	t->override_offset = 0;
+	t->pp_txpitch &= (1 << 13) -1;
+	pitch_val = rb->pitch;
+	switch (rb->cpp) {
+	case 4:
+		if (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT)
+			t->pp_txformat = tx_table[MESA_FORMAT_RGB888].format;
+		else
+			t->pp_txformat = tx_table[MESA_FORMAT_ARGB8888].format;
+		t->pp_txfilter |= tx_table[MESA_FORMAT_ARGB8888].filter;
+		break;
+	case 3:
+	default:
+		t->pp_txformat = tx_table[MESA_FORMAT_RGB888].format;
+		t->pp_txfilter |= tx_table[MESA_FORMAT_RGB888].filter;
+		break;
+	case 2:
+		t->pp_txformat = tx_table[MESA_FORMAT_RGB565].format;
+		t->pp_txfilter |= tx_table[MESA_FORMAT_RGB565].filter;
+		break;
+	}
+        t->pp_txsize = ((rb->base.Width - 1) << RADEON_TEX_USIZE_SHIFT)
+		   | ((rb->base.Height - 1) << RADEON_TEX_VSIZE_SHIFT);
+        t->pp_txformat |= RADEON_TXFORMAT_NON_POWER2;
+	t->pp_txpitch = pitch_val;
+        t->pp_txpitch -= 32;
+
+	t->validated = GL_TRUE;
+	_mesa_unlock_texture(radeon->glCtx, texObj);
+	return;
+}
+
+
+void radeonSetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
+{
+        radeonSetTexBuffer2(pDRICtx, target, GLX_TEXTURE_FORMAT_RGBA_EXT, dPriv);
+}
+
+
 #define TEXOBJ_TXFILTER_MASK (RADEON_MAX_MIP_LEVEL_MASK |	\
 			      RADEON_MIN_FILTER_MASK | 		\
 			      RADEON_MAG_FILTER_MASK |		\
@@ -901,12 +776,53 @@ void radeonSetTexOffset(__DRIcontext * pDRICtx, GLint texname,
                               RADEON_TXFORMAT_NON_POWER2)
 
 
-static void import_tex_obj_state( radeonContextPtr rmesa,
+static void disable_tex_obj_state( r100ContextPtr rmesa, 
+				   int unit )
+{
+   RADEON_STATECHANGE( rmesa, tex[unit] );
+
+   RADEON_STATECHANGE( rmesa, tcl );
+   rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] &= ~(RADEON_ST_BIT(unit) |
+					     RADEON_Q_BIT(unit));
+   
+   if (rmesa->radeon.TclFallback & (RADEON_TCL_FALLBACK_TEXGEN_0<<unit)) {
+     TCL_FALLBACK( rmesa->radeon.glCtx, (RADEON_TCL_FALLBACK_TEXGEN_0<<unit), GL_FALSE);
+     rmesa->recheck_texgen[unit] = GL_TRUE;
+   }
+
+   if (rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] & RADEON_TXFORMAT_CUBIC_MAP_ENABLE) {
+     /* this seems to be a genuine (r100 only?) hw bug. Need to remove the
+	cubic_map bit on unit 2 when the unit is disabled, otherwise every
+	2nd (2d) mipmap on unit 0 will be broken (may not be needed for other
+	units, better be safe than sorry though).*/
+     RADEON_STATECHANGE( rmesa, tex[unit] );
+     rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] &= ~RADEON_TXFORMAT_CUBIC_MAP_ENABLE;
+   }
+
+   {
+      GLuint inputshift = RADEON_TEXGEN_0_INPUT_SHIFT + unit*4;
+      GLuint tmp = rmesa->TexGenEnabled;
+
+      rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_TEXMAT_0_ENABLE<<unit);
+      rmesa->TexGenEnabled &= ~(RADEON_TEXMAT_0_ENABLE<<unit);
+      rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_INPUT_MASK<<inputshift);
+      rmesa->TexGenNeedNormals[unit] = 0;
+      rmesa->TexGenEnabled |= 
+	(RADEON_TEXGEN_INPUT_TEXCOORD_0+unit) << inputshift;
+
+      if (tmp != rmesa->TexGenEnabled) {
+	rmesa->recheck_texgen[unit] = GL_TRUE;
+	rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
+      }
+   }
+}
+
+static void import_tex_obj_state( r100ContextPtr rmesa,
 				  int unit,
 				  radeonTexObjPtr texobj )
 {
 /* do not use RADEON_DB_STATE to avoid stale texture caches */
-   int *cmd = &rmesa->hw.tex[unit].cmd[TEX_CMD_0];
+   uint32_t *cmd = &rmesa->hw.tex[unit].cmd[TEX_CMD_0];
    GLuint se_coord_fmt = rmesa->hw.set.cmd[SET_SE_COORDFMT];
 
    RADEON_STATECHANGE( rmesa, tex[unit] );
@@ -915,10 +831,9 @@ static void import_tex_obj_state( radeonContextPtr rmesa,
    cmd[TEX_PP_TXFILTER] |= texobj->pp_txfilter & TEXOBJ_TXFILTER_MASK;
    cmd[TEX_PP_TXFORMAT] &= ~TEXOBJ_TXFORMAT_MASK;
    cmd[TEX_PP_TXFORMAT] |= texobj->pp_txformat & TEXOBJ_TXFORMAT_MASK;
-   cmd[TEX_PP_TXOFFSET] = texobj->pp_txoffset;
    cmd[TEX_PP_BORDER_COLOR] = texobj->pp_border_color;
 
-   if (texobj->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+   if (texobj->base.Target == GL_TEXTURE_RECTANGLE_NV) {
       GLuint *txr_cmd = RADEON_DB_STATE( txr[unit] );
       txr_cmd[TXR_PP_TEX_SIZE] = texobj->pp_txsize; /* NPOT only! */
       txr_cmd[TXR_PP_TEX_PITCH] = texobj->pp_txpitch; /* NPOT only! */
@@ -928,22 +843,12 @@ static void import_tex_obj_state( radeonContextPtr rmesa,
    else {
       se_coord_fmt &= ~(RADEON_VTX_ST0_NONPARAMETRIC << unit);
 
-      if (texobj->base.tObj->Target == GL_TEXTURE_CUBE_MAP) {
-	 int *cube_cmd = &rmesa->hw.cube[unit].cmd[CUBE_CMD_0];
-	 GLuint bytesPerFace = texobj->base.totalSize / 6;
-	 ASSERT(texobj->base.totalSize % 6 == 0);
+      if (texobj->base.Target == GL_TEXTURE_CUBE_MAP) {
+	 uint32_t *cube_cmd = &rmesa->hw.cube[unit].cmd[CUBE_CMD_0];
 
 	 RADEON_STATECHANGE( rmesa, cube[unit] );
 	 cube_cmd[CUBE_PP_CUBIC_FACES] = texobj->pp_cubic_faces;
-	 /* dont know if this setup conforms to OpenGL.. 
-	  * at least it matches the behavior of mesa software renderer
-	  */
-	 cube_cmd[CUBE_PP_CUBIC_OFFSET_0] = texobj->pp_txoffset; /* right */
-	 cube_cmd[CUBE_PP_CUBIC_OFFSET_1] = texobj->pp_txoffset + 1 * bytesPerFace; /* left */
-	 cube_cmd[CUBE_PP_CUBIC_OFFSET_2] = texobj->pp_txoffset + 2 * bytesPerFace; /* top */
-	 cube_cmd[CUBE_PP_CUBIC_OFFSET_3] = texobj->pp_txoffset + 3 * bytesPerFace; /* bottom */
-	 cube_cmd[CUBE_PP_CUBIC_OFFSET_4] = texobj->pp_txoffset + 4 * bytesPerFace; /* front */
-	 cmd[TEX_PP_TXOFFSET] = texobj->pp_txoffset + 5 * bytesPerFace; /* back */
+	 /* state filled out in the cube_emit */
       }
    }
 
@@ -952,13 +857,11 @@ static void import_tex_obj_state( radeonContextPtr rmesa,
       rmesa->hw.set.cmd[SET_SE_COORDFMT] = se_coord_fmt;
    }
 
-   texobj->dirty_state &= ~(1<<unit);
+   rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
 }
 
 
-
-
-static void set_texgen_matrix( radeonContextPtr rmesa, 
+static void set_texgen_matrix( r100ContextPtr rmesa, 
 			       GLuint unit,
 			       const GLfloat *s_plane,
 			       const GLfloat *t_plane,
@@ -986,14 +889,14 @@ static void set_texgen_matrix( radeonContextPtr rmesa,
    rmesa->TexGenMatrix[unit].m[15] = q_plane[3];
 
    rmesa->TexGenEnabled |= RADEON_TEXMAT_0_ENABLE << unit;
-   rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+   rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
 }
 
 /* Returns GL_FALSE if fallback required.
  */
 static GLboolean radeon_validate_texgen( GLcontext *ctx, GLuint unit )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
    GLuint inputshift = RADEON_TEXGEN_0_INPUT_SHIFT + unit*4;
    GLuint tmp = rmesa->TexGenEnabled;
@@ -1023,14 +926,14 @@ static GLboolean radeon_validate_texgen( GLcontext *ctx, GLuint unit )
     */
    else if ( (texUnit->TexGenEnabled & S_BIT) &&
 	     (texUnit->TexGenEnabled & T_BIT) &&
-	     (texUnit->GenModeS == texUnit->GenModeT) ) {
+	     (texUnit->GenS.Mode == texUnit->GenT.Mode) ) {
       if ( ((texUnit->TexGenEnabled & R_BIT) &&
-	    (texUnit->GenModeS != texUnit->GenModeR)) ||
+	    (texUnit->GenS.Mode != texUnit->GenR.Mode)) ||
 	   ((texUnit->TexGenEnabled & Q_BIT) &&
-	    (texUnit->GenModeS != texUnit->GenModeQ)) ) {
+	    (texUnit->GenS.Mode != texUnit->GenQ.Mode)) ) {
 	 /* Mixed modes, fallback:
 	  */
-	 if (RADEON_DEBUG & DEBUG_FALLBACKS)
+	 if (RADEON_DEBUG & RADEON_FALLBACKS)
 	    fprintf(stderr, "fallback mixed texgen\n");
 	 return GL_FALSE;
       }
@@ -1038,7 +941,7 @@ static GLboolean radeon_validate_texgen( GLcontext *ctx, GLuint unit )
    }
    else {
    /* some texgen mode not including both S and T bits */
-      if (RADEON_DEBUG & DEBUG_FALLBACKS)
+      if (RADEON_DEBUG & RADEON_FALLBACKS)
 	 fprintf(stderr, "fallback mixed texgen/nontexgen\n");
       return GL_FALSE;
    }
@@ -1051,23 +954,23 @@ static GLboolean radeon_validate_texgen( GLcontext *ctx, GLuint unit )
       rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_Q_BIT(unit);
    }
 
-   switch (texUnit->GenModeS) {
+   switch (texUnit->GenS.Mode) {
    case GL_OBJECT_LINEAR:
       rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_OBJ << inputshift;
       set_texgen_matrix( rmesa, unit,
-			 texUnit->ObjectPlaneS,
-			 texUnit->ObjectPlaneT,
-			 texUnit->ObjectPlaneR,
-			 texUnit->ObjectPlaneQ);
+			 texUnit->GenS.ObjectPlane,
+			 texUnit->GenT.ObjectPlane,
+			 texUnit->GenR.ObjectPlane,
+			 texUnit->GenQ.ObjectPlane);
       break;
 
    case GL_EYE_LINEAR:
       rmesa->TexGenEnabled |= RADEON_TEXGEN_INPUT_EYE << inputshift;
       set_texgen_matrix( rmesa, unit,
-			 texUnit->EyePlaneS,
-			 texUnit->EyePlaneT,
-			 texUnit->EyePlaneR,
-			 texUnit->EyePlaneQ);
+			 texUnit->GenS.EyePlane,
+			 texUnit->GenT.EyePlane,
+			 texUnit->GenR.EyePlane,
+			 texUnit->GenQ.EyePlane);
       break;
 
    case GL_REFLECTION_MAP_NV:
@@ -1088,289 +991,195 @@ static GLboolean radeon_validate_texgen( GLcontext *ctx, GLuint unit )
    default:
       /* Unsupported mode, fallback:
        */
-      if (RADEON_DEBUG & DEBUG_FALLBACKS) 
+      if (RADEON_DEBUG & RADEON_FALLBACKS)
 	 fprintf(stderr, "fallback GL_SPHERE_MAP\n");
       return GL_FALSE;
    }
 
    if (tmp != rmesa->TexGenEnabled) {
-      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+      rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
    }
 
    return GL_TRUE;
 }
 
-
-static void disable_tex( GLcontext *ctx, int unit )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   if (rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (RADEON_TEX_0_ENABLE<<unit)) {
-      /* Texture unit disabled */
-      if ( rmesa->state.texture.unit[unit].texobj != NULL ) {
-	 /* The old texture is no longer bound to this texture unit.
-	  * Mark it as such.
-	  */
-
-	 rmesa->state.texture.unit[unit].texobj->base.bound &= ~(1UL << unit);
-	 rmesa->state.texture.unit[unit].texobj = NULL;
-      }
-
-      RADEON_STATECHANGE( rmesa, ctx );
-      rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= 
-	  ~((RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE) << unit);
-
-      RADEON_STATECHANGE( rmesa, tcl );
-      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] &= ~(RADEON_ST_BIT(unit) |
-						RADEON_Q_BIT(unit));
-
-      if (rmesa->TclFallback & (RADEON_TCL_FALLBACK_TEXGEN_0<<unit)) {
-	 TCL_FALLBACK( ctx, (RADEON_TCL_FALLBACK_TEXGEN_0<<unit), GL_FALSE);
-	 rmesa->recheck_texgen[unit] = GL_TRUE;
-      }
-
-      if (rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] & RADEON_TXFORMAT_CUBIC_MAP_ENABLE) {
-      /* this seems to be a genuine (r100 only?) hw bug. Need to remove the
-         cubic_map bit on unit 2 when the unit is disabled, otherwise every
-	 2nd (2d) mipmap on unit 0 will be broken (may not be needed for other
-	 units, better be safe than sorry though).*/
-	 RADEON_STATECHANGE( rmesa, tex[unit] );
-	 rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] &= ~RADEON_TXFORMAT_CUBIC_MAP_ENABLE;
-      }
-
-      {
-	 GLuint inputshift = RADEON_TEXGEN_0_INPUT_SHIFT + unit*4;
-	 GLuint tmp = rmesa->TexGenEnabled;
-
-	 rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_TEXMAT_0_ENABLE<<unit);
-	 rmesa->TexGenEnabled &= ~(RADEON_TEXMAT_0_ENABLE<<unit);
-	 rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_INPUT_MASK<<inputshift);
-	 rmesa->TexGenNeedNormals[unit] = 0;
-	 rmesa->TexGenEnabled |= 
-	     (RADEON_TEXGEN_INPUT_TEXCOORD_0+unit) << inputshift;
-
-	 if (tmp != rmesa->TexGenEnabled) {
-	    rmesa->recheck_texgen[unit] = GL_TRUE;
-	    rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
-	 }
-      }
-   }
-}
-
-static GLboolean enable_tex_2d( GLcontext *ctx, int unit )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
-
-   /* Need to load the 2d images associated with this unit.
-    */
-   if (t->pp_txformat & RADEON_TXFORMAT_NON_POWER2) {
-      t->pp_txformat &= ~RADEON_TXFORMAT_NON_POWER2;
-      t->base.dirty_images[0] = ~0;
-   }
-
-   ASSERT(tObj->Target == GL_TEXTURE_2D || tObj->Target == GL_TEXTURE_1D);
-
-   if ( t->base.dirty_images[0] ) {
-      RADEON_FIREVERTICES( rmesa );
-      radeonSetTexImages( rmesa, tObj );
-      radeonUploadTexImages( rmesa, (radeonTexObjPtr) tObj->DriverData, 0 );
-      if ( !t->base.memBlock && !t->image_override ) 
-	return GL_FALSE;
-   }
-
-   return GL_TRUE;
-}
-
-static GLboolean enable_tex_cube( GLcontext *ctx, int unit )
+/**
+ * Compute the cached hardware register values for the given texture object.
+ *
+ * \param rmesa Context pointer
+ * \param t the radeon texture object
+ */
+static GLboolean setup_hardware_state(r100ContextPtr rmesa, radeonTexObj *t, int unit)
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
-   GLuint face;
+   const struct gl_texture_image *firstImage;
+   GLint log2Width, log2Height, log2Depth, texelBytes;
 
-   /* Need to load the 2d images associated with this unit.
-    */
-   if (t->pp_txformat & RADEON_TXFORMAT_NON_POWER2) {
-      t->pp_txformat &= ~RADEON_TXFORMAT_NON_POWER2;
-      for (face = 0; face < 6; face++)
-         t->base.dirty_images[face] = ~0;
+   if ( t->bo ) {
+	return GL_TRUE;
    }
 
-   ASSERT(tObj->Target == GL_TEXTURE_CUBE_MAP);
+   firstImage = t->base.Image[0][t->mt->firstLevel];   
 
-   if ( t->base.dirty_images[0] || t->base.dirty_images[1] ||
-        t->base.dirty_images[2] || t->base.dirty_images[3] ||
-        t->base.dirty_images[4] || t->base.dirty_images[5] ) {
-      /* flush */
-      RADEON_FIREVERTICES( rmesa );
-      /* layout memory space, once for all faces */
-      radeonSetTexImages( rmesa, tObj );
+   if (firstImage->Border > 0) {
+      fprintf(stderr, "%s: border\n", __FUNCTION__);
+      return GL_FALSE;
    }
 
-   /* upload (per face) */
-   for (face = 0; face < 6; face++) {
-      if (t->base.dirty_images[face]) {
-         radeonUploadTexImages( rmesa, (radeonTexObjPtr) tObj->DriverData, face );
+   log2Width  = firstImage->WidthLog2;
+   log2Height = firstImage->HeightLog2;
+   log2Depth  = firstImage->DepthLog2;
+   texelBytes = firstImage->TexFormat->TexelBytes;
+
+   if (!t->image_override) {
+      if (VALID_FORMAT(firstImage->TexFormat->MesaFormat)) {
+	const struct tx_table *table = tx_table;
+
+	 t->pp_txformat &= ~(RADEON_TXFORMAT_FORMAT_MASK |
+			     RADEON_TXFORMAT_ALPHA_IN_MAP);
+	 t->pp_txfilter &= ~RADEON_YUV_TO_RGB;	 
+	 
+	 t->pp_txformat |= table[ firstImage->TexFormat->MesaFormat ].format;
+	 t->pp_txfilter |= table[ firstImage->TexFormat->MesaFormat ].filter;
+      } else {
+	 _mesa_problem(NULL, "unexpected texture format in %s",
+		       __FUNCTION__);
+	 return GL_FALSE;
       }
    }
-      
-   if ( !t->base.memBlock ) {
-      /* texmem alloc failed, use s/w fallback */
-      return GL_FALSE;
+   
+   t->pp_txfilter &= ~RADEON_MAX_MIP_LEVEL_MASK;
+   t->pp_txfilter |= (t->mt->lastLevel - t->mt->firstLevel) << RADEON_MAX_MIP_LEVEL_SHIFT;
+	
+   t->pp_txformat &= ~(RADEON_TXFORMAT_WIDTH_MASK |
+		       RADEON_TXFORMAT_HEIGHT_MASK |
+		       RADEON_TXFORMAT_CUBIC_MAP_ENABLE |
+		       RADEON_TXFORMAT_F5_WIDTH_MASK |
+		       RADEON_TXFORMAT_F5_HEIGHT_MASK);
+   t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_WIDTH_SHIFT) |
+		      (log2Height << RADEON_TXFORMAT_HEIGHT_SHIFT));
+   
+   t->tile_bits = 0;
+   
+   if (t->base.Target == GL_TEXTURE_CUBE_MAP) {
+      ASSERT(log2Width == log2Height);
+      t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_F5_WIDTH_SHIFT) |
+			 (log2Height << RADEON_TXFORMAT_F5_HEIGHT_SHIFT) |
+			 /* don't think we need this bit, if it exists at all - fglrx does not set it */
+			 (RADEON_TXFORMAT_CUBIC_MAP_ENABLE));
+      t->pp_cubic_faces = ((log2Width << RADEON_FACE_WIDTH_1_SHIFT) |
+                           (log2Height << RADEON_FACE_HEIGHT_1_SHIFT) |
+                           (log2Width << RADEON_FACE_WIDTH_2_SHIFT) |
+                           (log2Height << RADEON_FACE_HEIGHT_2_SHIFT) |
+                           (log2Width << RADEON_FACE_WIDTH_3_SHIFT) |
+                           (log2Height << RADEON_FACE_HEIGHT_3_SHIFT) |
+                           (log2Width << RADEON_FACE_WIDTH_4_SHIFT) |
+                           (log2Height << RADEON_FACE_HEIGHT_4_SHIFT));
    }
 
-   return GL_TRUE;
-}
-
-static GLboolean enable_tex_rect( GLcontext *ctx, int unit )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
+   t->pp_txsize = (((firstImage->Width - 1) << RADEON_TEX_USIZE_SHIFT)
+		   | ((firstImage->Height - 1) << RADEON_TEX_VSIZE_SHIFT));
 
-   if (!(t->pp_txformat & RADEON_TXFORMAT_NON_POWER2)) {
-      t->pp_txformat |= RADEON_TXFORMAT_NON_POWER2;
-      t->base.dirty_images[0] = ~0;
+   if ( !t->image_override ) {
+      if (firstImage->IsCompressed)
+         t->pp_txpitch = (firstImage->Width + 63) & ~(63);
+      else
+         t->pp_txpitch = ((firstImage->Width * texelBytes) + 63) & ~(63);
+      t->pp_txpitch -= 32;
    }
 
-   ASSERT(tObj->Target == GL_TEXTURE_RECTANGLE_NV);
-
-   if ( t->base.dirty_images[0] ) {
-      RADEON_FIREVERTICES( rmesa );
-      radeonSetTexImages( rmesa, tObj );
-      radeonUploadTexImages( rmesa, (radeonTexObjPtr) tObj->DriverData, 0 );
-      if ( !t->base.memBlock &&
-           !t->image_override /* && !rmesa->prefer_gart_client_texturing  FIXME */ ) {
-	 fprintf(stderr, "%s: upload failed\n", __FUNCTION__);
-	 return GL_FALSE;
-      }
+   if (t->base.Target == GL_TEXTURE_RECTANGLE_NV) {
+      t->pp_txformat |= RADEON_TXFORMAT_NON_POWER2;
    }
 
    return GL_TRUE;
 }
 
-
-static GLboolean update_tex_common( GLcontext *ctx, int unit )
+static GLboolean radeon_validate_texture(GLcontext *ctx, struct gl_texture_object *texObj, int unit)
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
-   GLenum format;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   radeonTexObj *t = radeon_tex_obj(texObj);
+   int ret;
 
-   /* Fallback if there's a texture border */
-   if ( tObj->Image[0][tObj->BaseLevel]->Border > 0 ) {
-      fprintf(stderr, "%s: border\n", __FUNCTION__);
+   if (!radeon_validate_texture_miptree(ctx, texObj))
       return GL_FALSE;
-   }
+
+   ret = setup_hardware_state(rmesa, t, unit);
+   if (ret == GL_FALSE)
+     return GL_FALSE;
+
    /* yuv conversion only works in first unit */
    if (unit != 0 && (t->pp_txfilter & RADEON_YUV_TO_RGB))
       return GL_FALSE;
 
-   /* Update state if this is a different texture object to last
-    * time.
-    */
-   if ( rmesa->state.texture.unit[unit].texobj != t ) {
-      if ( rmesa->state.texture.unit[unit].texobj != NULL ) {
-	 /* The old texture is no longer bound to this texture unit.
-	  * Mark it as such.
-	  */
-
-	 rmesa->state.texture.unit[unit].texobj->base.bound &= 
-	     ~(1UL << unit);
-      }
-
-      rmesa->state.texture.unit[unit].texobj = t;
-      t->base.bound |= (1UL << unit);
-      t->dirty_state |= 1<<unit;
-      driUpdateTextureLRU( (driTextureObject *) t ); /* XXX: should be locked! */
-   }
+   RADEON_STATECHANGE( rmesa, ctx );
+   rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= 
+     (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE) << unit;
 
+   RADEON_STATECHANGE( rmesa, tcl );
+   rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_ST_BIT(unit);
 
-   /* Newly enabled?
-    */
-   if ( !(rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (RADEON_TEX_0_ENABLE<<unit))) {
-      RADEON_STATECHANGE( rmesa, ctx );
-      rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= 
-	  (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE) << unit;
+   rmesa->recheck_texgen[unit] = GL_TRUE;
 
-      RADEON_STATECHANGE( rmesa, tcl );
-
-      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_ST_BIT(unit);
-
-      rmesa->recheck_texgen[unit] = GL_TRUE;
-   }
-
-   if (t->dirty_state & (1<<unit)) {
-      import_tex_obj_state( rmesa, unit, t );
-      /* may need to update texture matrix (for texrect adjustments) */
-      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
-   }
+   import_tex_obj_state( rmesa, unit, t );
 
    if (rmesa->recheck_texgen[unit]) {
       GLboolean fallback = !radeon_validate_texgen( ctx, unit );
       TCL_FALLBACK( ctx, (RADEON_TCL_FALLBACK_TEXGEN_0<<unit), fallback);
       rmesa->recheck_texgen[unit] = 0;
-      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+      rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
    }
 
-   format = tObj->Image[0][tObj->BaseLevel]->_BaseFormat;
-   if ( rmesa->state.texture.unit[unit].format != format ||
-	rmesa->state.texture.unit[unit].envMode != texUnit->EnvMode ) {
-      rmesa->state.texture.unit[unit].format = format;
-      rmesa->state.texture.unit[unit].envMode = texUnit->EnvMode;
-      if ( ! radeonUpdateTextureEnv( ctx, unit ) ) {
-	 return GL_FALSE;
-      }
+   if ( ! radeonUpdateTextureEnv( ctx, unit ) ) {
+     return GL_FALSE;
    }
-
    FALLBACK( rmesa, RADEON_FALLBACK_BORDER_MODE, t->border_fallback );
+
+   t->validated = GL_TRUE;
    return !t->border_fallback;
 }
 
-
-
 static GLboolean radeonUpdateTextureUnit( GLcontext *ctx, int unit )
 {
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
-   if ( texUnit->_ReallyEnabled & (TEXTURE_RECT_BIT) ) {
-      return (enable_tex_rect( ctx, unit ) &&
-	      update_tex_common( ctx, unit ));
-   }
-   else if ( texUnit->_ReallyEnabled & (TEXTURE_1D_BIT | TEXTURE_2D_BIT) ) {
-      return (enable_tex_2d( ctx, unit ) &&
-	      update_tex_common( ctx, unit ));
-   }
-   else if ( texUnit->_ReallyEnabled & (TEXTURE_CUBE_BIT) ) {
-      return (enable_tex_cube( ctx, unit ) &&
-	      update_tex_common( ctx, unit ));
+   if (ctx->Texture.Unit[unit]._ReallyEnabled & TEXTURE_3D_BIT) {
+     rmesa->state.texture.unit[unit].texobj = NULL;
+     return GL_FALSE;
    }
-   else if ( texUnit->_ReallyEnabled ) {
-      return GL_FALSE;
+
+   if (!ctx->Texture.Unit[unit]._ReallyEnabled) {
+     /* disable the unit */
+     disable_tex_obj_state(rmesa, unit);
+     rmesa->state.texture.unit[unit].texobj = NULL;
+     return GL_TRUE;
    }
-   else {
-      disable_tex( ctx, unit );
-      return GL_TRUE;
+
+   if (!radeon_validate_texture(ctx, ctx->Texture.Unit[unit]._Current, unit)) {
+    _mesa_warning(ctx,
+		  "failed to validate texture for unit %d.\n",
+		  unit);
+     rmesa->state.texture.unit[unit].texobj = NULL;
+     return GL_FALSE;
    }
+   rmesa->state.texture.unit[unit].texobj = radeon_tex_obj(ctx->Texture.Unit[unit]._Current);
+   return GL_TRUE;
 }
 
 void radeonUpdateTextureState( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLboolean ok;
 
+   /* set the ctx all textures off */
+   RADEON_STATECHANGE( rmesa, ctx );
+   rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~((RADEON_TEX_ENABLE_MASK) | (RADEON_TEX_BLEND_ENABLE_MASK));
+
    ok = (radeonUpdateTextureUnit( ctx, 0 ) &&
 	 radeonUpdateTextureUnit( ctx, 1 ) &&
 	 radeonUpdateTextureUnit( ctx, 2 ));
 
    FALLBACK( rmesa, RADEON_FALLBACK_TEXTURE, !ok );
 
-   if (rmesa->TclFallback)
+   if (rmesa->radeon.TclFallback)
       radeonChooseVertexState( ctx );
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
new file mode 100644
index 0000000000..fad3d1ceda
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -0,0 +1,1052 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ * Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+ *
+ * The Weather Channel (TM) funded Tungsten Graphics to develop the
+ * initial release of the Radeon 8500 driver under the XFree86 license.
+ * This notice must be preserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "main/glheader.h"
+#include "main/imports.h"
+#include "main/context.h"
+#include "main/convolve.h"
+#include "main/mipmap.h"
+#include "main/texcompress.h"
+#include "main/texformat.h"
+#include "main/texstore.h"
+#include "main/teximage.h"
+#include "main/texobj.h"
+#include "main/texgetimage.h"
+
+#include "xmlpool.h"		/* for symbolic values of enum-type options */
+
+#include "radeon_common.h"
+
+#include "radeon_mipmap_tree.h"
+
+
+/* Copy numrows rows of rowsize bytes each; source and destination may use different strides. */
+static void copy_rows(void* dst, GLuint dststride, const void* src, GLuint srcstride,
+	GLuint numrows, GLuint rowsize)
+{
+	GLubyte *d = dst;	/* step through bytes: arithmetic on void* is only a GNU extension */
+	const GLubyte *s = src;
+	GLuint i;
+
+	assert(rowsize <= dststride);
+	assert(rowsize <= srcstride);
+	if (rowsize == srcstride && rowsize == dststride) {
+		memcpy(d, s, (size_t)numrows * rowsize);	/* no padding: one copy, sized without 32-bit wrap */
+		return;
+	}
+	for (i = 0; i < numrows; ++i, d += dststride, s += srcstride)
+		memcpy(d, s, rowsize);
+}
+
+/* textures */
+/**
+ * Allocate a radeon_texture_image with CALLOC and return it as the generic gl_texture_image; ctx is not used.
+ */
+struct gl_texture_image *radeonNewTextureImage(GLcontext *ctx)
+{
+	return CALLOC(sizeof(radeon_texture_image));
+}
+
+/**
+ * Release a texture image's miptree reference or Mesa-side data, its BO and any Data still attached.
+ */
+void radeonFreeTexImageData(GLcontext *ctx, struct gl_texture_image *timage)
+{
+	radeon_texture_image *rimg = get_radeon_texture_image(timage);
+
+	if (!rimg->mt) {
+		_mesa_free_texture_image_data(ctx, timage);
+	} else {
+		radeon_miptree_unreference(rimg->mt);
+		rimg->mt = NULL;
+		assert(!rimg->base.Data);
+	}
+	if (rimg->bo) {
+		radeon_bo_unref(rimg->bo);
+		rimg->bo = NULL;
+	}
+	if (timage->Data) {
+		_mesa_free_texmemory(timage->Data);
+		timage->Data = NULL;
+	}
+}
+
+/* Point a mapped image's Data and RowStride at its face/level slot inside the miptree BO. */
+static void teximage_set_map_data(radeon_texture_image *image)
+{
+	radeon_mipmap_tree *tree = image->mt;
+	radeon_mipmap_level *level = &tree->levels[image->mtlevel];
+	image->base.RowStride = level->rowstride / tree->bpp;
+	image->base.Data = tree->bo->ptr + level->faces[image->mtface].offset;
+}
+
+
+/**
+ * Map the miptree BO behind one texture image and set its Data; images without a miptree are untouched.
+ */
+void radeon_teximage_map(radeon_texture_image *image, GLboolean write_enable)
+{
+	if (!image->mt)
+		return;
+
+	assert(!image->base.Data);
+	radeon_bo_map(image->mt->bo, write_enable);
+	teximage_set_map_data(image);
+}
+
+
+/* Undo radeon_teximage_map(): clear Data, then unmap the miptree BO. */
+void radeon_teximage_unmap(radeon_texture_image *image)
+{
+	if (!image->mt)
+		return;	/* images without a miptree were never mapped */
+	assert(image->base.Data);
+	image->base.Data = NULL;
+	radeon_bo_unmap(image->mt->bo);
+}
+
+/* Map the override BO without write access and expose it as the face 0 / level 0 image data. */
+static void map_override(GLcontext *ctx, radeonTexObj *t)
+{
+	radeon_texture_image *override_img = get_radeon_texture_image(t->base.Image[0][0]);
+
+	radeon_bo_map(t->bo, GL_FALSE);
+	override_img->base.Data = t->bo->ptr;
+	_mesa_set_fetch_functions(&override_img->base, 2);	/* second argument is the dimension count */
+}
+
+/* Counterpart of map_override(): unmap the override BO and drop the image's Data pointer. */
+static void unmap_override(GLcontext *ctx, radeonTexObj *t)
+{
+	radeon_texture_image *override_img = get_radeon_texture_image(t->base.Image[0][0]);
+
+	radeon_bo_unmap(t->bo);
+	override_img->base.Data = NULL;
+}
+
+/**
+ * Validate the texture's miptree, then map it (and any override BO) so every face and level has Data set.
+ */
+void radeonMapTexture(GLcontext *ctx, struct gl_texture_object *texObj)
+{
+	radeonTexObj *tex = radeon_tex_obj(texObj);
+	radeon_mipmap_tree *tree;
+	int f, lvl;
+
+	if (!radeon_validate_texture_miptree(ctx, texObj))
+		return;
+	if (tex->image_override && tex->bo)
+		map_override(ctx, tex);
+
+	/* r100 3D software fallbacks have no miptree to map */
+	tree = tex->mt;
+	if (!tree)
+		return;
+
+	radeon_bo_map(tree->bo, GL_FALSE);
+	for (f = 0; f < tree->faces; ++f)
+		for (lvl = tree->firstLevel; lvl <= tree->lastLevel; ++lvl)
+			teximage_set_map_data(get_radeon_texture_image(texObj->Image[f][lvl]));
+}
+
+/* Reverse radeonMapTexture(): clear the Data pointer of every face/level and unmap the BOs. */
+void radeonUnmapTexture(GLcontext *ctx, struct gl_texture_object *texObj)
+{
+	radeonTexObj *tex = radeon_tex_obj(texObj);
+	radeon_mipmap_tree *tree = tex->mt;
+	int f, lvl;
+
+	if (tex->image_override && tex->bo)
+		unmap_override(ctx, tex);
+	if (!tree)	/* r100 3D software fallbacks have no miptree */
+		return;
+
+	for (f = 0; f < tree->faces; ++f)
+		for (lvl = tree->firstLevel; lvl <= tree->lastLevel; ++lvl)
+			texObj->Image[f][lvl]->Data = NULL;
+	radeon_bo_unmap(tree->bo);
+}
+
+/* Return the cube-map face index (0-5) named by target, or 0 for any other target. */
+GLuint radeon_face_for_target(GLenum target)
+{
+	const GLuint first = (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+	const GLuint last = (GLuint) GL_TEXTURE_CUBE_MAP_NEGATIVE_Z;
+
+	/*
+	 * The six cube face enums are consecutive, +X through -Z, so a range
+	 * check selects exactly the targets a per-face switch would.
+	 */
+	if ((GLuint) target < first || (GLuint) target > last)
+		return 0;
+	return (GLuint) target - first;
+}
+
+/**
+ * Run Mesa's mipmap generation, then detach every regenerated level from its miptree.
+ *
+ * This depends on an internal detail of _mesa_generate_mipmap: the memory of
+ * the texture images it recreates is always freed.
+ */
+static void radeon_generate_mipmap(GLcontext *ctx, GLenum target,
+				   struct gl_texture_object *texObj)
+{
+	const GLuint nr_faces =
+		(radeon_tex_obj(texObj)->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
+	int face;
+
+	_mesa_generate_mipmap(ctx, target, texObj);
+
+	for (face = 0; face < nr_faces; face++) {
+		int lvl = texObj->BaseLevel + 1;
+
+		while (lvl < texObj->MaxLevel) {
+			radeon_texture_image *img =
+				get_radeon_texture_image(texObj->Image[face][lvl]);
+
+			if (!img)
+				break;
+
+			/* Record which face/level this image is, then let go of its miptree. */
+			img->mtlevel = lvl;
+			img->mtface = face;
+			radeon_miptree_unreference(img->mt);
+			img->mt = NULL;
+			lvl++;
+		}
+	}
+}
+
+/* Public entry point: generate mipmaps while the base level image is mapped without write access. */
+void radeonGenerateMipmap(GLcontext* ctx, GLenum target, struct gl_texture_object *texObj)
+{
+	const GLuint base_face = radeon_face_for_target(target);
+	radeon_texture_image *base_img = get_radeon_texture_image(texObj->Image[base_face][texObj->BaseLevel]);
+	radeon_teximage_map(base_img, GL_FALSE);
+	radeon_generate_mipmap(ctx, target, texObj);
+	radeon_teximage_unmap(base_img);
+}
+
+
+/*
+ * Try to find a 32-bit format which will only need a memcopy, based on the
+ * source format/type pair and the endianness of the host.
+ */
+static const struct gl_texture_format *radeonChoose8888TexFormat(radeonContextPtr rmesa,
+								 GLenum srcFormat,
+								 GLenum srcType, GLboolean fbo)
+{
+	const GLuint probe = 1;
+	const GLboolean littleEndian = *((const GLubyte *)&probe) != 0;
+	/* GL_UNSIGNED_BYTE sources side with 8_8_8_8 or 8_8_8_8_REV depending on host endianness */
+	const GLboolean asPacked = srcType == GL_UNSIGNED_INT_8_8_8_8 ||
+				   (srcType == GL_UNSIGNED_BYTE && !littleEndian);
+	const GLboolean asPackedRev = srcType == GL_UNSIGNED_INT_8_8_8_8_REV ||
+				      (srcType == GL_UNSIGNED_BYTE && littleEndian);
+
+	/* r100 can only do this */
+	if (IS_R100_CLASS(rmesa->radeonScreen) || fbo)
+		return _dri_texformat_argb8888;
+
+	if ((srcFormat == GL_RGBA && asPacked) || (srcFormat == GL_ABGR_EXT && asPackedRev))
+		return &_mesa_texformat_rgba8888;
+	if ((srcFormat == GL_RGBA && asPackedRev) || (srcFormat == GL_ABGR_EXT && asPacked))
+		return &_mesa_texformat_rgba8888_rev;
+	/* r200 gets no further special cases */
+	if (IS_R200_CLASS(rmesa->radeonScreen))
+		return _dri_texformat_argb8888;
+	if (srcFormat == GL_BGRA && asPacked)
+		return &_mesa_texformat_argb8888_rev;
+	if (srcFormat == GL_BGRA && asPackedRev)
+		return &_mesa_texformat_argb8888;
+	return _dri_texformat_argb8888;
+}
+
+/* Four-argument variant of radeonChooseTextureFormat() with the fbo flag fixed to false. */
+const struct gl_texture_format *radeonChooseTextureFormat_mesa(GLcontext * ctx,
+							  GLint internalFormat,
+							  GLenum format, GLenum type)
+{
+	const GLboolean fbo = GL_FALSE;
+	return radeonChooseTextureFormat(ctx, internalFormat, format, type, fbo);
+}
+/* Map a GL internalFormat (refined by source format/type and the texture_depth setting) to a texture format. */
+const struct gl_texture_format *radeonChooseTextureFormat(GLcontext * ctx,
+							  GLint internalFormat,
+							  GLenum format,
+							  GLenum type, GLboolean fbo)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	const GLboolean do32bpt =
+	    (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_32);	/* texture_depth set to "32" */
+	const GLboolean force16bpt =
+	    (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FORCE_16);	/* texture_depth set to "force 16" */
+	(void)format;	/* NOTE(review): format is in fact used below, so this cast looks redundant */
+
+#if 0
+	fprintf(stderr, "InternalFormat=%s(%d) type=%s format=%s\n",
+		_mesa_lookup_enum_by_nr(internalFormat), internalFormat,
+		_mesa_lookup_enum_by_nr(type), _mesa_lookup_enum_by_nr(format));
+	fprintf(stderr, "do32bpt=%d force16bpt=%d\n", do32bpt, force16bpt);
+#endif
+
+	switch (internalFormat) {
+	case 4:
+	case GL_RGBA:
+	case GL_COMPRESSED_RGBA:
+		switch (type) {
+		case GL_UNSIGNED_INT_10_10_10_2:
+		case GL_UNSIGNED_INT_2_10_10_10_REV:
+			return do32bpt ? _dri_texformat_argb8888 :
+			    _dri_texformat_argb1555;
+		case GL_UNSIGNED_SHORT_4_4_4_4:
+		case GL_UNSIGNED_SHORT_4_4_4_4_REV:
+			return _dri_texformat_argb4444;
+		case GL_UNSIGNED_SHORT_5_5_5_1:
+		case GL_UNSIGNED_SHORT_1_5_5_5_REV:
+			return _dri_texformat_argb1555;
+		default:
+			return do32bpt ? radeonChoose8888TexFormat(rmesa, format, type, fbo) :
+			    _dri_texformat_argb4444;
+		}
+
+	case 3:
+	case GL_RGB:
+	case GL_COMPRESSED_RGB:
+		switch (type) {
+		case GL_UNSIGNED_SHORT_4_4_4_4:
+		case GL_UNSIGNED_SHORT_4_4_4_4_REV:
+			return _dri_texformat_argb4444;
+		case GL_UNSIGNED_SHORT_5_5_5_1:
+		case GL_UNSIGNED_SHORT_1_5_5_5_REV:
+			return _dri_texformat_argb1555;
+		case GL_UNSIGNED_SHORT_5_6_5:
+		case GL_UNSIGNED_SHORT_5_6_5_REV:
+			return _dri_texformat_rgb565;
+		default:
+			return do32bpt ? _dri_texformat_argb8888 :
+			    _dri_texformat_rgb565;
+		}
+
+	case GL_RGBA8:
+	case GL_RGB10_A2:
+	case GL_RGBA12:
+	case GL_RGBA16:
+		return !force16bpt ?
+			radeonChoose8888TexFormat(rmesa, format, type, fbo) :
+			_dri_texformat_argb4444;
+
+	case GL_RGBA4:
+	case GL_RGBA2:
+		return _dri_texformat_argb4444;
+
+	case GL_RGB5_A1:
+		return _dri_texformat_argb1555;
+
+	case GL_RGB8:
+	case GL_RGB10:
+	case GL_RGB12:
+	case GL_RGB16:
+		return !force16bpt ? _dri_texformat_argb8888 :
+		    _dri_texformat_rgb565;
+
+	case GL_RGB5:
+	case GL_RGB4:
+	case GL_R3_G3_B2:
+		return _dri_texformat_rgb565;
+
+	case GL_ALPHA:
+	case GL_ALPHA4:
+	case GL_ALPHA8:
+	case GL_ALPHA12:
+	case GL_ALPHA16:
+	case GL_COMPRESSED_ALPHA:
+		/* r200: can't use a8 format since interpreting hw I8 as a8 would result
+		   in wrong rgb values (same as alpha value instead of 0). */
+		if (IS_R200_CLASS(rmesa->radeonScreen))
+			return _dri_texformat_al88;
+		else
+			return _dri_texformat_a8;
+	case 1:
+	case GL_LUMINANCE:
+	case GL_LUMINANCE4:
+	case GL_LUMINANCE8:
+	case GL_LUMINANCE12:
+	case GL_LUMINANCE16:
+	case GL_COMPRESSED_LUMINANCE:
+		return _dri_texformat_l8;
+
+	case 2:
+	case GL_LUMINANCE_ALPHA:
+	case GL_LUMINANCE4_ALPHA4:
+	case GL_LUMINANCE6_ALPHA2:
+	case GL_LUMINANCE8_ALPHA8:
+	case GL_LUMINANCE12_ALPHA4:
+	case GL_LUMINANCE12_ALPHA12:
+	case GL_LUMINANCE16_ALPHA16:
+	case GL_COMPRESSED_LUMINANCE_ALPHA:
+		return _dri_texformat_al88;
+
+	case GL_INTENSITY:
+	case GL_INTENSITY4:
+	case GL_INTENSITY8:
+	case GL_INTENSITY12:
+	case GL_INTENSITY16:
+	case GL_COMPRESSED_INTENSITY:
+		return _dri_texformat_i8;
+
+	case GL_YCBCR_MESA:
+		if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
+		    type == GL_UNSIGNED_BYTE)
+			return &_mesa_texformat_ycbcr;
+		else
+			return &_mesa_texformat_ycbcr_rev;
+
+	case GL_RGB_S3TC:
+	case GL_RGB4_S3TC:
+	case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
+		return &_mesa_texformat_rgb_dxt1;
+
+	case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
+		return &_mesa_texformat_rgba_dxt1;
+
+	case GL_RGBA_S3TC:
+	case GL_RGBA4_S3TC:
+	case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
+		return &_mesa_texformat_rgba_dxt3;
+
+	case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
+		return &_mesa_texformat_rgba_dxt5;
+
+	case GL_ALPHA16F_ARB:
+		return &_mesa_texformat_alpha_float16;
+	case GL_ALPHA32F_ARB:
+		return &_mesa_texformat_alpha_float32;
+	case GL_LUMINANCE16F_ARB:
+		return &_mesa_texformat_luminance_float16;
+	case GL_LUMINANCE32F_ARB:
+		return &_mesa_texformat_luminance_float32;
+	case GL_LUMINANCE_ALPHA16F_ARB:
+		return &_mesa_texformat_luminance_alpha_float16;
+	case GL_LUMINANCE_ALPHA32F_ARB:
+		return &_mesa_texformat_luminance_alpha_float32;
+	case GL_INTENSITY16F_ARB:
+		return &_mesa_texformat_intensity_float16;
+	case GL_INTENSITY32F_ARB:
+		return &_mesa_texformat_intensity_float32;
+	case GL_RGB16F_ARB:	/* RGB float requests get the RGBA float formats */
+		return &_mesa_texformat_rgba_float16;
+	case GL_RGB32F_ARB:
+		return &_mesa_texformat_rgba_float32;
+	case GL_RGBA16F_ARB:
+		return &_mesa_texformat_rgba_float16;
+	case GL_RGBA32F_ARB:
+		return &_mesa_texformat_rgba_float32;
+
+	case GL_DEPTH_COMPONENT:
+	case GL_DEPTH_COMPONENT16:
+	case GL_DEPTH_COMPONENT24:
+	case GL_DEPTH_COMPONENT32:
+	case GL_DEPTH_STENCIL_EXT:
+	case GL_DEPTH24_STENCIL8_EXT:
+		return &_mesa_texformat_s8_z24;	/* one format for every depth and depth/stencil request */
+
+	/* EXT_texture_sRGB */
+	case GL_SRGB:
+	case GL_SRGB8:
+	case GL_SRGB_ALPHA:
+	case GL_SRGB8_ALPHA8:
+	case GL_COMPRESSED_SRGB:
+	case GL_COMPRESSED_SRGB_ALPHA:
+		return &_mesa_texformat_srgba8;
+
+	case GL_SLUMINANCE:
+	case GL_SLUMINANCE8:
+	case GL_COMPRESSED_SLUMINANCE:
+		return &_mesa_texformat_sl8;
+
+	case GL_SLUMINANCE_ALPHA:
+	case GL_SLUMINANCE8_ALPHA8:
+	case GL_COMPRESSED_SLUMINANCE_ALPHA:
+		return &_mesa_texformat_sla8;
+
+	default:
+		_mesa_problem(ctx,
+			      "unexpected internalFormat 0x%x in %s",
+			      (int)internalFormat, __func__);
+		return NULL;
+	}
+
+	return NULL;		/* not reached: every case above returns */
+}
+
+/**
+ * All glTexImage calls go through this function.
+ *
+ * It picks the storage format, releases the image's previous storage, places
+ * the image in the object's miptree when it matches (plain memory otherwise)
+ * and uploads the pixels, which the spec allows to be NULL.
+ */
+static void radeon_teximage(
+	GLcontext *ctx, int dims,
+	GLenum target, GLint level,
+	GLint internalFormat,
+	GLint width, GLint height, GLint depth,
+	GLsizei imageSize,
+	GLenum format, GLenum type, const GLvoid * pixels,
+	const struct gl_pixelstore_attrib *packing,
+	struct gl_texture_object *texObj,
+	struct gl_texture_image *texImage,
+	int compressed)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	radeonTexObj* t = radeon_tex_obj(texObj);
+	radeon_texture_image* image = get_radeon_texture_image(texImage);
+	GLint postConvWidth = width;
+	GLint postConvHeight = height;
+	GLuint texelBytes;
+	GLuint face = radeon_face_for_target(target);
+
+	radeon_firevertices(rmesa);
+
+	t->validated = GL_FALSE;
+
+	if (ctx->_ImageTransferState & IMAGE_CONVOLUTION_BIT) {
+	       _mesa_adjust_image_for_convolution(ctx, dims, &postConvWidth,
+						  &postConvHeight);
+	}
+
+	/* Choose and fill in the texture format for this image */
+	texImage->TexFormat = radeonChooseTextureFormat(ctx, internalFormat, format, type, 0);
+	_mesa_set_fetch_functions(texImage, dims);
+
+	if (texImage->TexFormat->TexelBytes == 0) {
+		texelBytes = 0;
+		texImage->IsCompressed = GL_TRUE;
+		texImage->CompressedSize =
+			ctx->Driver.CompressedTextureSize(ctx, texImage->Width,
+					   texImage->Height, texImage->Depth,
+					   texImage->TexFormat->MesaFormat);
+	} else {
+		texImage->IsCompressed = GL_FALSE;
+		texImage->CompressedSize = 0;
+
+		texelBytes = texImage->TexFormat->TexelBytes;
+		/* Minimum pitch of 32 bytes */
+		if (postConvWidth * texelBytes < 32) {
+		  postConvWidth = 32 / texelBytes;
+		  texImage->RowStride = postConvWidth;
+		}
+		if (!image->mt) {
+			assert(texImage->RowStride == postConvWidth);
+		}
+	}
+
+	/* Allocate memory for image */
+	radeonFreeTexImageData(ctx, texImage); /* Mesa core only clears texImage->Data but not image->mt */
+
+	/* Drop a single-level, non-cube miptree that no longer matches this image */
+	if (t->mt &&
+	    t->mt->firstLevel == level &&
+	    t->mt->lastLevel == level &&
+	    t->mt->target != GL_TEXTURE_CUBE_MAP_ARB &&
+	    !radeon_miptree_matches_image(t->mt, texImage, face, level)) {
+	  radeon_miptree_unreference(t->mt);
+	  t->mt = NULL;
+	}
+
+	if (!t->mt)
+		radeon_try_alloc_miptree(rmesa, t, image, face, level);
+	if (t->mt && radeon_miptree_matches_image(t->mt, texImage, face, level)) {
+		image->mt = t->mt;
+		image->mtlevel = level - t->mt->firstLevel;
+		image->mtface = face;
+		radeon_miptree_reference(t->mt);
+	} else {
+		int size;
+		if (texImage->IsCompressed) {
+			size = texImage->CompressedSize;
+		} else {
+			size = texImage->Width * texImage->Height * texImage->Depth * texImage->TexFormat->TexelBytes;
+		}
+		texImage->Data = _mesa_alloc_texmemory(size);
+	}
+
+	/* Upload texture image; note that the spec allows pixels to be NULL */
+	if (compressed) {
+		pixels = _mesa_validate_pbo_compressed_teximage(
+			ctx, imageSize, pixels, packing, "glCompressedTexImage");
+	} else {
+		pixels = _mesa_validate_pbo_teximage(
+			ctx, dims, width, height, depth,
+			format, type, pixels, packing, "glTexImage");
+	}
+
+	if (pixels) {
+		radeon_teximage_map(image, GL_TRUE);
+		if (compressed) {
+			if (image->mt) {
+				uint32_t srcRowStride, bytesPerRow, rows;
+				srcRowStride = _mesa_compressed_row_stride(texImage->TexFormat->MesaFormat, width);
+				bytesPerRow = srcRowStride;
+				rows = (height + 3) / 4;
+				/* levels[] is indexed relative to firstLevel, i.e. by mtlevel, not by the GL level */
+				copy_rows(texImage->Data, image->mt->levels[image->mtlevel].rowstride,
+					  pixels, srcRowStride, rows, bytesPerRow);
+			} else {
+				memcpy(texImage->Data, pixels, imageSize);
+			}
+		} else {
+			GLuint dstRowStride;
+			GLuint *dstImageOffsets;
+
+			if (image->mt) {
+				radeon_mipmap_level *lvl = &image->mt->levels[image->mtlevel];
+				dstRowStride = lvl->rowstride;
+			} else {
+				dstRowStride = texImage->Width * texImage->TexFormat->TexelBytes;
+			}
+
+			if (dims == 3) {
+				int i;
+
+				dstImageOffsets = _mesa_malloc(depth * sizeof(GLuint)) ;
+				/* If the allocation failed, skip the fill; the store below is skipped too. */
+				for (i = 0; dstImageOffsets && i < depth; ++i) {
+					dstImageOffsets[i] = dstRowStride/texImage->TexFormat->TexelBytes * height * i;
+				}
+			} else {
+				dstImageOffsets = texImage->ImageOffsets;
+			}
+
+			/* Never hand a failed 3D offset table to StoreImage: report the OOM instead. */
+			if ((dims == 3 && !dstImageOffsets) ||
+			    !texImage->TexFormat->StoreImage(ctx, dims,
+						texImage->_BaseFormat,
+						texImage->TexFormat,
+						texImage->Data, 0, 0, 0, /* dstX/Y/Zoffset */
+						dstRowStride,
+						dstImageOffsets,
+						width, height, depth,
+						format, type, pixels, packing))
+				_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage");
+
+			if (dims == 3)
+				_mesa_free(dstImageOffsets);
+		}
+
+		/* SGIS_generate_mipmap */
+		if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+			radeon_generate_mipmap(ctx, target, texObj);
+		}
+	}
+
+	_mesa_unmap_teximage_pbo(ctx, packing);
+
+	if (pixels)
+	  radeon_teximage_unmap(image);
+}
+
+/* glTexImage1D entry point: uploads the image as width x 1 x 1; border is not forwarded. */
+void radeonTexImage1D(GLcontext * ctx, GLenum target, GLint level,
+		      GLint internalFormat, GLint width, GLint border,
+		      GLenum format, GLenum type, const GLvoid * pixels,
+		      const struct gl_pixelstore_attrib *packing,
+		      struct gl_texture_object *texObj,
+		      struct gl_texture_image *texImage)
+{
+	radeon_teximage(ctx, 1, target, level, internalFormat, width, 1, 1, 0 /* imageSize */,
+			format, type, pixels, packing, texObj, texImage, 0 /* not compressed */);
+}
+
+/* glTexImage2D entry point: forwards to radeon_teximage() with depth 1; border is not forwarded. */
+void radeonTexImage2D(GLcontext * ctx, GLenum target, GLint level,
+			   GLint internalFormat,
+			   GLint width, GLint height, GLint border,
+			   GLenum format, GLenum type, const GLvoid * pixels,
+			   const struct gl_pixelstore_attrib *packing,
+			   struct gl_texture_object *texObj,
+			   struct gl_texture_image *texImage)
+{
+	radeon_teximage(ctx, 2, target, level, internalFormat, width, height, 1, 0 /* imageSize */,
+			format, type, pixels, packing, texObj, texImage, 0 /* not compressed */);
+}
+
+/* glCompressedTexImage2D entry point: no format/type, and unpacking state is taken from ctx->Unpack. */
+void radeonCompressedTexImage2D(GLcontext * ctx, GLenum target,
+				     GLint level, GLint internalFormat,
+				     GLint width, GLint height, GLint border,
+				     GLsizei imageSize, const GLvoid * data,
+				     struct gl_texture_object *texObj, struct gl_texture_image *texImage)
+{
+	radeon_teximage(ctx, 2, target, level, internalFormat, width, height, 1, imageSize,
+			0 /* format */, 0 /* type */, data, &ctx->Unpack, texObj, texImage, 1 /* compressed */);
+}
+
+/* glTexImage3D entry point: forwards all three dimensions to radeon_teximage(); border is not forwarded. */
+void radeonTexImage3D(GLcontext * ctx, GLenum target, GLint level,
+		      GLint internalFormat,
+		      GLint width, GLint height, GLint depth, GLint border,
+		      GLenum format, GLenum type, const GLvoid * pixels,
+		      const struct gl_pixelstore_attrib *packing,
+		      struct gl_texture_object *texObj,
+		      struct gl_texture_image *texImage)
+{
+	radeon_teximage(ctx, 3, target, level, internalFormat, width, height, depth, 0 /* imageSize */,
+			format, type, pixels, packing, texObj, texImage, 0 /* not compressed */);
+}
+
+/**
+ * Update a subregion of the given texture image; shared by the
+ * glTexSubImage and glCompressedTexSubImage entry points.
+ */
+static void radeon_texsubimage(GLcontext* ctx, int dims, GLenum target, int level,
+		GLint xoffset, GLint yoffset, GLint zoffset,
+		GLsizei width, GLsizei height, GLsizei depth,
+		GLsizei imageSize,
+		GLenum format, GLenum type,
+		const GLvoid * pixels,
+		const struct gl_pixelstore_attrib *packing,
+		struct gl_texture_object *texObj,
+		struct gl_texture_image *texImage,
+		int compressed)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	radeonTexObj* t = radeon_tex_obj(texObj);
+	radeon_texture_image* image = get_radeon_texture_image(texImage);
+
+	radeon_firevertices(rmesa);
+
+	t->validated = GL_FALSE;
+	if (compressed) {
+		pixels = _mesa_validate_pbo_compressed_teximage(
+			ctx, imageSize, pixels, packing, "glCompressedTexSubImage");
+	} else {
+		pixels = _mesa_validate_pbo_teximage(ctx, dims,
+			width, height, depth, format, type, pixels, packing, "glTexSubImage");
+	}
+
+	if (pixels) {
+		GLint dstRowStride;
+		radeon_teximage_map(image, GL_TRUE);
+
+		if (image->mt) {
+			radeon_mipmap_level *lvl = &image->mt->levels[image->mtlevel];
+			dstRowStride = lvl->rowstride;
+		} else {
+			dstRowStride = texImage->RowStride * texImage->TexFormat->TexelBytes;
+		}
+
+		if (compressed) {
+			uint32_t srcRowStride, bytesPerRow, rows;
+			GLubyte *img_start;
+			if (!image->mt) {
+				dstRowStride = _mesa_compressed_row_stride(texImage->TexFormat->MesaFormat, texImage->Width);
+				img_start = _mesa_compressed_image_address(xoffset, yoffset, 0,
+									   texImage->TexFormat->MesaFormat,
+									   texImage->Width, texImage->Data);
+			}
+			else {
+				uint32_t blocks_x = dstRowStride / (image->mt->bpp * 4);
+				/* Offset in bytes: cast first, arithmetic on void* is only a GNU extension */
+				img_start = (GLubyte *) texImage->Data + image->mt->bpp * 4 * (blocks_x * (yoffset / 4) + xoffset / 4);
+			}
+			srcRowStride = _mesa_compressed_row_stride(texImage->TexFormat->MesaFormat, width);
+			bytesPerRow = srcRowStride;
+			rows = (height + 3) / 4;
+
+			copy_rows(img_start, dstRowStride,  pixels, srcRowStride, rows,  bytesPerRow);
+		} else {
+			if (!texImage->TexFormat->StoreImage(ctx, dims, texImage->_BaseFormat,
+							     texImage->TexFormat, texImage->Data,
+							     xoffset, yoffset, zoffset,
+							     dstRowStride,
+							     texImage->ImageOffsets,
+							     width, height, depth,
+							     format, type, pixels, packing))
+				_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage");
+		}
+
+		/* GL_SGIS_generate_mipmap */
+		if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+			radeon_generate_mipmap(ctx, target, texObj);
+		}
+
+		/* Unmap only what was mapped above; without pixels nothing was mapped. */
+		radeon_teximage_unmap(image);
+	}
+
+	_mesa_unmap_teximage_pbo(ctx, packing);
+}
+
+/* glTexSubImage1D entry point: the region spans xoffset/width in x and a single texel in y and z. */
+void radeonTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
+			 GLint xoffset, GLsizei width,
+			 GLenum format, GLenum type,
+			 const GLvoid * pixels,
+			 const struct gl_pixelstore_attrib *packing,
+			 struct gl_texture_object *texObj,
+			 struct gl_texture_image *texImage)
+{
+	radeon_texsubimage(ctx, 1, target, level, xoffset, 0, 0, width, 1, 1, 0 /* imageSize */,
+			   format, type, pixels, packing, texObj, texImage, 0 /* not compressed */);
+}
+
+/* glTexSubImage2D entry point: forwards to radeon_texsubimage() with zoffset 0 and depth 1. */
+void radeonTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
+			 GLint xoffset, GLint yoffset,
+			 GLsizei width, GLsizei height,
+			 GLenum format, GLenum type,
+			 const GLvoid * pixels,
+			 const struct gl_pixelstore_attrib *packing,
+			 struct gl_texture_object *texObj,
+			 struct gl_texture_image *texImage)
+{
+	radeon_texsubimage(ctx, 2, target, level, xoffset, yoffset, 0, width, height, 1, 0 /* imageSize */,
+			   format, type, pixels, packing, texObj, texImage, 0 /* not compressed */);
+}
+
+void radeonCompressedTexSubImage2D(GLcontext * ctx, GLenum target,
+				   GLint level, GLint xoffset,
+				   GLint yoffset, GLsizei width,
+				   GLsizei height, GLenum format,
+				   GLsizei imageSize, const GLvoid * data,
+				   struct gl_texture_object *texObj,
+				   struct gl_texture_image *texImage)
+{	/* Compressed 2D sub-image update: forwards to radeon_texsubimage with dims = 2 and a final argument of 1. */
+	radeon_texsubimage(ctx, 2, target, level, xoffset, yoffset, 0, width, height, 1,	/* z offset 0, depth 1 */
+		imageSize, format, 0, data, &ctx->Unpack, texObj, texImage, 1);	/* type is passed as 0; unpack state comes from ctx->Unpack since this entry point has no packing argument */
+}
+
+
+void radeonTexSubImage3D(GLcontext * ctx, GLenum target, GLint level,
+			 GLint xoffset, GLint yoffset, GLint zoffset,
+			 GLsizei width, GLsizei height, GLsizei depth,
+			 GLenum format, GLenum type,
+			 const GLvoid * pixels,
+			 const struct gl_pixelstore_attrib *packing,
+			 struct gl_texture_object *texObj,
+			 struct gl_texture_image *texImage)
+{	/* 3D sub-image update: forwards everything to radeon_texsubimage with dims = 3. */
+	radeon_texsubimage(ctx, 3, target, level, xoffset, yoffset, zoffset, width, height, depth, 0,	/* trailing 0 sits where the compressed wrapper passes imageSize */
+		format, type, pixels, packing, texObj, texImage, 0);	/* final 0: presumably the "compressed" flag (the compressed wrapper passes 1) -- confirm */
+}
+
+
+
+/**
+ * Ensure that the given image is stored in the given miptree from now on: its pixels are copied into mt's (face, level) slot and its old storage is released.
+ */
+static void migrate_image_to_miptree(radeon_mipmap_tree *mt, radeon_texture_image *image, int face, int level)
+{
+	radeon_mipmap_level *dstlvl = &mt->levels[level - mt->firstLevel];	/* levels[] is indexed relative to mt->firstLevel */
+	unsigned char *dest;
+
+	assert(image->mt != mt);	/* callers must not pass an image that already lives in mt */
+	assert(dstlvl->width == image->base.Width);
+	assert(dstlvl->height == image->base.Height);
+	assert(dstlvl->depth == image->base.Depth);
+
+	/* Map the destination buffer; GL_TRUE is presumably the write-enable flag -- confirm against radeon_bo_map(). */
+	radeon_bo_map(mt->bo, GL_TRUE);
+	dest = mt->bo->ptr + dstlvl->faces[face].offset;
+
+	if (image->mt) {
+		/* Format etc. should match, so we really just need a memcpy().
+		 * In fact, that memcpy() could be done by the hardware in many
+		 * cases, provided that we have a proper memory manager.
+		 */
+		radeon_mipmap_level *srclvl = &image->mt->levels[image->mtlevel-image->mt->firstLevel];
+
+		assert(srclvl->size == dstlvl->size);
+		assert(srclvl->rowstride == dstlvl->rowstride);
+
+		radeon_bo_map(image->mt->bo, GL_FALSE);
+		/* NOTE(review): the source offset below is indexed by the destination 'face', not image->mtface -- confirm the two always match. */
+		memcpy(dest,
+			image->mt->bo->ptr + srclvl->faces[face].offset,
+			dstlvl->size);
+		radeon_bo_unmap(image->mt->bo);
+
+		radeon_miptree_unreference(image->mt);	/* the image stops using its previous tree */
+	} else {	/* image is not in any miptree: its pixels are read from image->base.Data */
+		uint32_t srcrowstride;
+		uint32_t height;
+		/* need to confirm this value is correct */
+		if (mt->compressed) {
+			height = (image->base.Height + 3) / 4;	/* rows counted in units of 4 texel rows, rounded up */
+			srcrowstride = _mesa_compressed_row_stride(image->base.TexFormat->MesaFormat, image->base.Width);
+		} else {
+			height = image->base.Height * image->base.Depth;	/* all depth slices are copied as one run of rows */
+			srcrowstride = image->base.Width * image->base.TexFormat->TexelBytes;
+		}
+
+//		if (mt->tilebits)
+//			WARN_ONCE("%s: tiling not supported yet", __FUNCTION__);
+		/* Source rows are passed as tightly packed (srcrowstride is used for both the source stride and the last argument); destination uses the level's rowstride. */
+		copy_rows(dest, dstlvl->rowstride, image->base.Data, srcrowstride,
+			  height, srcrowstride);
+
+		_mesa_free_texmemory(image->base.Data);
+		image->base.Data = 0;	/* the pointer was just freed; clear it */
+	}
+
+	radeon_bo_unmap(mt->bo);
+	/* Record the image's new location and take a reference on the tree for it. */
+	image->mt = mt;
+	image->mtface = face;
+	image->mtlevel = level;
+	radeon_miptree_reference(image->mt);
+}
+
+int radeon_validate_texture_miptree(GLcontext * ctx, struct gl_texture_object *texObj)
+{	/* Returns GL_TRUE when validation is skipped or every image ends up in t->mt; GL_FALSE for bordered base images or miptree allocation failure. */
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	radeonTexObj *t = radeon_tex_obj(texObj);
+	radeon_texture_image *baseimage = get_radeon_texture_image(texObj->Image[0][texObj->BaseLevel]);
+	int face, level;
+	/* Already validated, or image_override is set: report success without touching the miptree. */
+	if (t->validated || t->image_override)
+		return GL_TRUE;
+
+	if (RADEON_DEBUG & RADEON_TEXTURE)
+		fprintf(stderr, "%s: Validating texture %p now\n", __FUNCTION__, texObj);
+	/* A base image with a border is rejected. NOTE(review): baseimage is dereferenced unchecked -- presumably callers guarantee it exists. */
+	if (baseimage->base.Border > 0)
+		return GL_FALSE;
+
+	/* Ensure a matching miptree exists.
+	 *
+	 * Differing mipmap trees can result when the app uses TexImage to
+	 * change texture dimensions.
+	 *
+	 * Prefer to use base image's miptree if it
+	 * exists, since that most likely contains more valid data (remember
+	 * that the base level is usually significantly larger than the rest
+	 * of the miptree, so cubemaps are the only possible exception).
+	 */
+	if (baseimage->mt &&
+	    baseimage->mt != t->mt &&
+	    radeon_miptree_matches_texture(baseimage->mt, &t->base)) {
+		radeon_miptree_unreference(t->mt);	/* t->mt may be NULL here -- presumably tolerated by radeon_miptree_unreference; confirm */
+		t->mt = baseimage->mt;
+		radeon_miptree_reference(t->mt);
+	} else if (t->mt && !radeon_miptree_matches_texture(t->mt, &t->base)) {
+		radeon_miptree_unreference(t->mt);	/* current tree no longer matches the texture: drop it */
+		t->mt = 0;
+	}
+	/* Still no tree: try to allocate one, passing the base image, face 0 and the base level. */
+	if (!t->mt) {
+		if (RADEON_DEBUG & RADEON_TEXTURE)
+			fprintf(stderr, " Allocate new miptree\n");
+		radeon_try_alloc_miptree(rmesa, t, baseimage, 0, texObj->BaseLevel);
+		if (!t->mt) {
+			_mesa_problem(ctx, "radeon_validate_texture failed to alloc miptree");
+			return GL_FALSE;
+		}
+	}
+
+	/* Ensure all images are stored in the single main miptree */
+	for(face = 0; face < t->mt->faces; ++face) {
+		for(level = t->mt->firstLevel; level <= t->mt->lastLevel; ++level) {
+			radeon_texture_image *image = get_radeon_texture_image(texObj->Image[face][level]);
+			if (RADEON_DEBUG & RADEON_TEXTURE)
+				fprintf(stderr, " face %i, level %i... %p vs %p ", face, level, t->mt, image->mt);
+			if (t->mt == image->mt) {
+				if (RADEON_DEBUG & RADEON_TEXTURE)
+					fprintf(stderr, "OK\n");
+
+				continue;
+			}
+
+			if (RADEON_DEBUG & RADEON_TEXTURE)
+				fprintf(stderr, "migrating\n");
+			migrate_image_to_miptree(t->mt, image, face, level);
+		}
+	}
+	/* NOTE: t->validated is not set anywhere in this function. */
+	return GL_TRUE;
+}
+
+
+/**
+ * Read back a texture image: if the image lives in a miptree it is mapped
+ * read-only first and unmapped afterwards; core Mesa does the actual copy (a non-zero 'compressed' selects the compressed path).
+ */
+static void
+radeon_get_tex_image(GLcontext * ctx, GLenum target, GLint level,
+		     GLenum format, GLenum type, GLvoid * pixels,
+		     struct gl_texture_object *texObj,
+		     struct gl_texture_image *texImage, int compressed)
+{
+	radeon_texture_image *image = get_radeon_texture_image(texImage);
+
+	if (image->mt) {
+		/* Map the texture image read-only */
+		radeon_teximage_map(image, GL_FALSE);
+	} else {
+		/* Image hasn't been uploaded to a miptree yet */
+		assert(image->base.Data);
+	}
+
+	if (compressed) {
+		/* FIXME: this can't work for small textures (mip levels), which
+		         use a different hardware stride */
+		_mesa_get_compressed_teximage(ctx, target, level, pixels,
+					      texObj, texImage);
+	} else {
+		_mesa_get_teximage(ctx, target, level, format, type, pixels,
+				   texObj, texImage);
+	}
+	/* Undo the mapping made above; nothing was mapped when the image has no miptree. */
+	if (image->mt) {
+		radeon_teximage_unmap(image);
+	}
+}
+
+void
+radeonGetTexImage(GLcontext * ctx, GLenum target, GLint level,
+		  GLenum format, GLenum type, GLvoid * pixels,
+		  struct gl_texture_object *texObj,
+		  struct gl_texture_image *texImage)
+{	/* Uncompressed read-back: forwards to radeon_get_tex_image with compressed = 0. */
+	radeon_get_tex_image(ctx, target, level, format, type, pixels,
+			     texObj, texImage, 0);
+}
+
+void
+radeonGetCompressedTexImage(GLcontext *ctx, GLenum target, GLint level,
+			    GLvoid *pixels,
+			    struct gl_texture_object *texObj,
+			    struct gl_texture_image *texImage)
+{	/* Compressed read-back: forwards to radeon_get_tex_image with compressed = 1. */
+	radeon_get_tex_image(ctx, target, level, 0, 0, pixels,	/* format and type are passed as 0; the compressed path does not forward them */
+			     texObj, texImage, 1);
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.h b/src/mesa/drivers/dri/radeon/radeon_texture.h
new file mode 100644
index 0000000000..888a55ba91
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ * Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+ *
+ * The Weather Channel (TM) funded Tungsten Graphics to develop the
+ * initial release of the Radeon 8500 driver under the XFree86 license.
+ * This notice must be preserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef RADEON_TEXTURE_H
+#define RADEON_TEXTURE_H
+struct gl_texture_image *radeonNewTextureImage(GLcontext *ctx);
+void radeonFreeTexImageData(GLcontext *ctx, struct gl_texture_image *timage);
+/* NOTE(review): this header includes nothing itself; GLcontext, radeon_texture_image, GLenum etc. must already be declared by the includer. */
+void radeon_teximage_map(radeon_texture_image *image, GLboolean write_enable);
+void radeon_teximage_unmap(radeon_texture_image *image);
+void radeonMapTexture(GLcontext *ctx, struct gl_texture_object *texObj);
+void radeonUnmapTexture(GLcontext *ctx, struct gl_texture_object *texObj);
+void radeonGenerateMipmap(GLcontext* ctx, GLenum target, struct gl_texture_object *texObj);
+int radeon_validate_texture_miptree(GLcontext * ctx, struct gl_texture_object *texObj);
+GLuint radeon_face_for_target(GLenum target);
+const struct gl_texture_format *radeonChooseTextureFormat_mesa(GLcontext * ctx,
+							  GLint internalFormat,
+							  GLenum format,
+							  GLenum type);
+const struct gl_texture_format *radeonChooseTextureFormat(GLcontext * ctx,
+							  GLint internalFormat,
+							  GLenum format,
+							  GLenum type, GLboolean fbo);
+/* Whole-image specification: TexImage1D/2D/3D and CompressedTexImage2D, followed by the matching sub-image variants. */
+void radeonTexImage1D(GLcontext * ctx, GLenum target, GLint level,
+		      GLint internalFormat,
+		      GLint width, GLint border,
+		      GLenum format, GLenum type, const GLvoid * pixels,
+		      const struct gl_pixelstore_attrib *packing,
+		      struct gl_texture_object *texObj,
+		      struct gl_texture_image *texImage);
+void radeonTexImage2D(GLcontext * ctx, GLenum target, GLint level,
+		      GLint internalFormat,
+		      GLint width, GLint height, GLint border,
+		      GLenum format, GLenum type, const GLvoid * pixels,
+		      const struct gl_pixelstore_attrib *packing,
+		      struct gl_texture_object *texObj,
+		      struct gl_texture_image *texImage);
+void radeonCompressedTexImage2D(GLcontext * ctx, GLenum target,
+				GLint level, GLint internalFormat,
+				GLint width, GLint height, GLint border,
+				GLsizei imageSize, const GLvoid * data,
+				struct gl_texture_object *texObj,
+				struct gl_texture_image *texImage);
+void radeonTexImage3D(GLcontext * ctx, GLenum target, GLint level,
+		      GLint internalFormat,
+		      GLint width, GLint height, GLint depth,
+		      GLint border,
+		      GLenum format, GLenum type, const GLvoid * pixels,
+		      const struct gl_pixelstore_attrib *packing,
+		      struct gl_texture_object *texObj,
+		      struct gl_texture_image *texImage);
+void radeonTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
+			 GLint xoffset,
+			 GLsizei width,
+			 GLenum format, GLenum type,
+			 const GLvoid * pixels,
+			 const struct gl_pixelstore_attrib *packing,
+			 struct gl_texture_object *texObj,
+			 struct gl_texture_image *texImage);
+void radeonTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
+				GLint xoffset, GLint yoffset,
+				GLsizei width, GLsizei height,
+				GLenum format, GLenum type,
+				const GLvoid * pixels,
+				const struct gl_pixelstore_attrib *packing,
+				struct gl_texture_object *texObj,
+				struct gl_texture_image *texImage);
+void radeonCompressedTexSubImage2D(GLcontext * ctx, GLenum target,
+				   GLint level, GLint xoffset,
+				   GLint yoffset, GLsizei width,
+				   GLsizei height, GLenum format,
+				   GLsizei imageSize, const GLvoid * data,
+				   struct gl_texture_object *texObj,
+				   struct gl_texture_image *texImage);
+
+void radeonTexSubImage3D(GLcontext * ctx, GLenum target, GLint level,
+			 GLint xoffset, GLint yoffset, GLint zoffset,
+			 GLsizei width, GLsizei height, GLsizei depth,
+			 GLenum format, GLenum type,
+			 const GLvoid * pixels,
+			 const struct gl_pixelstore_attrib *packing,
+			 struct gl_texture_object *texObj,
+			 struct gl_texture_image *texImage);
+/* Image read-back, uncompressed and compressed. */
+void radeonGetTexImage(GLcontext * ctx, GLenum target, GLint level,
+		       GLenum format, GLenum type, GLvoid * pixels,
+		       struct gl_texture_object *texObj,
+		       struct gl_texture_image *texImage);
+void radeonGetCompressedTexImage(GLcontext *ctx, GLenum target, GLint level,
+				 GLvoid *pixels,
+				 struct gl_texture_object *texObj,
+				 struct gl_texture_image *texImage);
+
+#endif /* RADEON_TEXTURE_H */
diff --git a/src/mesa/drivers/dri/radeon/server/radeon_reg.h b/src/mesa/drivers/dri/radeon/server/radeon_reg.h
index 596a8aa715..e81d7fdcd0 100644
--- a/src/mesa/drivers/dri/radeon/server/radeon_reg.h
+++ b/src/mesa/drivers/dri/radeon/server/radeon_reg.h
@@ -1500,7 +1500,7 @@
 #       define RADEON_ALPHA_ARG_C_T1_ALPHA       (6   << 8)
 #       define RADEON_ALPHA_ARG_C_T2_ALPHA       (7   << 8)
 #       define RADEON_ALPHA_ARG_C_T3_ALPHA       (8   << 8)
-#       define RADEON_DOT_ALPHA_DONT_REPLICATE   (1   << 9)
+#       define RADEON_DOT_ALPHA_DONT_REPLICATE   (1   << 12)
 #       define RADEON_ALPHA_ARG_MASK             0xf
 
 #define RADEON_PP_TFACTOR_0                 0x1c68
@@ -1601,6 +1601,8 @@
 #       define RADEON_STENCIL_VALUE_MASK      (0xff << 16)
 #       define RADEON_STENCIL_WRITEMASK_SHIFT 24
 #       define RADEON_STENCIL_WRITE_MASK      (0xff << 24)
+#define RADEON_RB3D_ZPASS_DATA              0x3290
+#define RADEON_RB3D_ZPASS_ADDR              0x3294
 #define RADEON_RB3D_ZSTENCILCNTL            0x1c2c
 #       define RADEON_DEPTH_FORMAT_MASK          (0xf << 0)
 #       define RADEON_DEPTH_FORMAT_16BIT_INT_Z   (0  <<  0)
@@ -1661,6 +1663,9 @@
 #       define RADEON_FORCE_Z_DIRTY              (1  << 29)
 #       define RADEON_Z_WRITE_ENABLE             (1  << 30)
 #       define RADEON_Z_DECOMPRESSION_ENABLE     (1  << 31)
+
+#define RADEON_RE_STIPPLE_ADDR              0x1cc8
+#define RADEON_RE_STIPPLE_DATA              0x1ccc
 #define RADEON_RE_LINE_PATTERN              0x1cd0
 #       define RADEON_LINE_PATTERN_MASK             0x0000ffff
 #       define RADEON_LINE_REPEAT_COUNT_SHIFT       16
@@ -2031,6 +2036,9 @@
 #define RADEON_CP_PACKET3_3D_DRAW_INDX              0xC0002A00
 #define RADEON_CP_PACKET3_LOAD_PALETTE              0xC0002C00
 #define RADEON_CP_PACKET3_3D_LOAD_VBPNTR            0xC0002F00
+#define R200_CP_CMD_3D_DRAW_VBUF_2      0xC0003400
+#define R200_CP_CMD_3D_DRAW_IMMD_2      0xC0003500
+#define R200_CP_CMD_3D_DRAW_INDX_2      0xC0003600
 #define RADEON_CP_PACKET3_CNTL_PAINT                0xC0009100
 #define RADEON_CP_PACKET3_CNTL_BITBLT               0xC0009200
 #define RADEON_CP_PACKET3_CNTL_SMALLTEXT            0xC0009300
diff --git a/src/mesa/drivers/dri/s3v/s3v_context.c b/src/mesa/drivers/dri/s3v/s3v_context.c
index 14502f95ae..0a3bf7258d 100644
--- a/src/mesa/drivers/dri/s3v/s3v_context.c
+++ b/src/mesa/drivers/dri/s3v/s3v_context.c
@@ -108,6 +108,8 @@ GLboolean s3vCreateContext(const __GLcontextModes *glVisual,
 	ctx->Const.MaxLineWidthAA = 1.0;
 	ctx->Const.LineWidthGranularity = 1.0;
 
+	ctx->Const.MaxDrawBuffers = 1;
+
 	vmesa->texHeap = mmInit( 0, vmesa->s3vScreen->textureSize );
 	DEBUG(("vmesa->s3vScreen->textureSize = 0x%x\n",
 		vmesa->s3vScreen->textureSize));
diff --git a/src/mesa/drivers/dri/s3v/s3v_state.c b/src/mesa/drivers/dri/s3v/s3v_state.c
index c71c89a3e1..561f42c705 100644
--- a/src/mesa/drivers/dri/s3v/s3v_state.c
+++ b/src/mesa/drivers/dri/s3v/s3v_state.c
@@ -2,7 +2,6 @@
  * Author: Max Lingua <sunmax@libero.it>
  */
 
-#include <X11/Xlibint.h>
 #include "s3v_context.h"
 #include "s3v_macros.h"
 #include "s3v_dri.h"
@@ -24,7 +23,7 @@
 static void s3vUpdateAlphaMode( GLcontext *ctx )
 {
 	s3vContextPtr vmesa = S3V_CONTEXT(ctx);
-	CARD32 cmd = vmesa->CMD;
+	uint32_t cmd = vmesa->CMD;
 	cmd &= ~ALPHA_BLEND_MASK;
 
 	if ( ctx->Color.BlendEnabled ) {
@@ -173,7 +172,7 @@ static void s3vDDClear( GLcontext *ctx, GLbitfield mask )
 static void s3vUpdateZMode( GLcontext *ctx )
 {
 	s3vContextPtr vmesa = S3V_CONTEXT(ctx);
-	CARD32 cmd = vmesa->CMD;
+	uint32_t cmd = vmesa->CMD;
 
 	DEBUG(("Depth.Test = %i\n", ctx->Depth.Test));
 	DEBUG(("CMD was = 0x%x ", cmd));
diff --git a/src/mesa/drivers/dri/s3v/s3v_tex.c b/src/mesa/drivers/dri/s3v/s3v_tex.c
index 8bf2ea9878..9b92519862 100644
--- a/src/mesa/drivers/dri/s3v/s3v_tex.c
+++ b/src/mesa/drivers/dri/s3v/s3v_tex.c
@@ -132,8 +132,14 @@ static void s3vSetTexFilter(s3vContextPtr vmesa,
 
 static void s3vSetTexBorderColor(s3vContextPtr vmesa,
 				  s3vTextureObjectPtr t, 
-				  GLubyte color[4])
+				  const GLfloat color[4])
 {
+	GLubyte c[4];
+	CLAMPED_FLOAT_TO_UBYTE(c[0], color[0]);
+	CLAMPED_FLOAT_TO_UBYTE(c[1], color[1]);
+	CLAMPED_FLOAT_TO_UBYTE(c[2], color[2]);
+	CLAMPED_FLOAT_TO_UBYTE(c[3], color[3]);
+
 #if TEX_DEBUG_ON
 	static unsigned int times=0;
 	DEBUG_TEX(("*** s3vSetTexBorderColor: #%i ***\n", ++times));
@@ -143,8 +149,7 @@ static void s3vSetTexBorderColor(s3vContextPtr vmesa,
 	/* switch(t0 ... t->TextureColorMode) */
 
 	/* case TEX_COL_ARGB1555: */
-	t->TextureBorderColor =
-		S3VIRGEPACKCOLOR555(color[0], color[1], color[2], color[3]);
+	t->TextureBorderColor =	S3VIRGEPACKCOLOR555(c[0], c[1], c[2], c[3]);
 
 	DEBUG(("TextureBorderColor = 0x%x\n", t->TextureBorderColor));
 
@@ -182,7 +187,7 @@ static void s3vTexParameter( GLcontext *ctx, GLenum target,
 		break;
   
 	case GL_TEXTURE_BORDER_COLOR:
-		s3vSetTexBorderColor( vmesa, t, tObj->_BorderChan );
+		s3vSetTexBorderColor( vmesa, t, tObj->BorderColor );
 		break;
 
 	case GL_TEXTURE_BASE_LEVEL:
@@ -502,20 +507,20 @@ static void s3vInitTextureObjects( GLcontext *ctx )
 #if 1
 	ctx->Texture.CurrentUnit = 0;
 
-	texObj = ctx->Texture.Unit[0].Current1D;
+	texObj = ctx->Texture.Unit[0].CurrentTex[TEXTURE_1D_INDEX];
 	s3vBindTexture( ctx, GL_TEXTURE_1D, texObj );
 
-	texObj = ctx->Texture.Unit[0].Current2D;
+	texObj = ctx->Texture.Unit[0].CurrentTex[TEXTURE_2D_INDEX];
 	s3vBindTexture( ctx, GL_TEXTURE_2D, texObj );
 #endif
 
 #if 0
 	ctx->Texture.CurrentUnit = 1;
 
-	texObj = ctx->Texture.Unit[1].Current1D;
+	texObj = ctx->Texture.Unit[1].CurrentTex[TEXTURE_1D_INDEX];
 	s3vBindTexture( ctx, GL_TEXTURE_1D, texObj );
 
-	texObj = ctx->Texture.Unit[1].Current2D;
+	texObj = ctx->Texture.Unit[1].CurrentTex[TEXTURE_2D_INDEX];
 	s3vBindTexture( ctx, GL_TEXTURE_2D, texObj );
 #endif
 
diff --git a/src/mesa/drivers/dri/s3v/s3v_xmesa.c b/src/mesa/drivers/dri/s3v/s3v_xmesa.c
index b18c8763c3..85f1481769 100644
--- a/src/mesa/drivers/dri/s3v/s3v_xmesa.c
+++ b/src/mesa/drivers/dri/s3v/s3v_xmesa.c
@@ -4,11 +4,12 @@
 
 #include "s3v_context.h"
 #include "s3v_vb.h"
+#include "s3v_dri.h"
 #include "main/context.h"
 #include "main/matrix.h"
-#include "s3v_dri.h"
 #include "main/framebuffer.h"
 #include "main/renderbuffer.h"
+#include "main/viewport.h"
 
 #include "swrast/swrast.h"
 #include "swrast_setup/swrast_setup.h"
@@ -131,7 +132,7 @@ s3vCreateBuffer( __DRIscreenPrivate *driScrnPriv,
 static void
 s3vDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 {
-   _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
+   _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/savage/savage_xmesa.c b/src/mesa/drivers/dri/savage/savage_xmesa.c
index a344aab71b..931ceff0a8 100644
--- a/src/mesa/drivers/dri/savage/savage_xmesa.c
+++ b/src/mesa/drivers/dri/savage/savage_xmesa.c
@@ -23,7 +23,6 @@
  */
 
 
-#include <X11/Xlibint.h>
 #include <stdio.h>
 
 #include "main/context.h"
@@ -59,9 +58,6 @@
 #include "drirenderbuffer.h"
 #include "texmem.h"
 
-#define need_GL_ARB_multisample
-#define need_GL_ARB_texture_compression
-#define need_GL_ARB_vertex_buffer_object
 #define need_GL_EXT_secondary_color
 #include "extension_helper.h"
 
@@ -133,10 +129,7 @@ struct timeval tv_s1,tv_f1;
 
 static const struct dri_extension card_extensions[] =
 {
-    { "GL_ARB_multisample",                GL_ARB_multisample_functions },
     { "GL_ARB_multitexture",               NULL },
-    { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
-    { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
     { "GL_EXT_stencil_wrap",               NULL },
     { "GL_EXT_texture_lod_bias",           NULL },
     { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
@@ -186,7 +179,7 @@ savageInitDriver(__DRIscreenPrivate *sPriv)
    }
 
    /* Allocate the private area */
-   savageScreen = (savageScreenPrivate *)Xmalloc(sizeof(savageScreenPrivate));
+   savageScreen = (savageScreenPrivate *)_mesa_malloc(sizeof(savageScreenPrivate));
    if (!savageScreen)
       return GL_FALSE;
 
@@ -233,7 +226,7 @@ savageInitDriver(__DRIscreenPrivate *sPriv)
 		  savageScreen->agpTextures.handle,
 		  savageScreen->agpTextures.size,
 		  (drmAddress *)&(savageScreen->agpTextures.map)) != 0) {
-	   Xfree(savageScreen);
+	   _mesa_free(savageScreen);
 	   sPriv->private = NULL;
 	   return GL_FALSE;
        }
@@ -253,7 +246,7 @@ savageInitDriver(__DRIscreenPrivate *sPriv)
 	      savageScreen->aperture.size, 
 	      (drmAddress *)&savageScreen->aperture.map) != 0) 
    {
-      Xfree(savageScreen);
+      _mesa_free(savageScreen);
       sPriv->private = NULL;
       return GL_FALSE;
    }
@@ -289,7 +282,7 @@ savageDestroyScreen(__DRIscreenPrivate *sPriv)
    /* free all option information */
    driDestroyOptionInfo (&savageScreen->optionCache);
 
-   Xfree(savageScreen);
+   _mesa_free(savageScreen);
    sPriv->private = NULL;
 }
 
@@ -307,7 +300,7 @@ savageCreateContext( const __GLcontextModes *mesaVis,
 						 savageScreen->sarea_priv_offset);
    int textureSize[SAVAGE_NR_TEX_HEAPS];
    int i;
-   imesa = (savageContextPtr)Xcalloc(sizeof(savageContext), 1);
+   imesa = (savageContextPtr)_mesa_calloc(sizeof(savageContext));
    if (!imesa) {
       return GL_FALSE;
    }
@@ -324,7 +317,7 @@ savageCreateContext( const __GLcontextModes *mesaVis,
       shareCtx = NULL;
    ctx = _mesa_create_context(mesaVis, shareCtx, &functions, imesa);
    if (!ctx) {
-      Xfree(imesa);
+      _mesa_free(imesa);
       return GL_FALSE;
    }
    driContextPriv->driverPrivate = imesa;
@@ -350,7 +343,9 @@ savageCreateContext( const __GLcontextModes *mesaVis,
    ctx->Const.MaxLineWidthAA = 3.0;
    ctx->Const.LineWidthGranularity = 1.0;
 #endif
-   
+
+   ctx->Const.MaxDrawBuffers = 1;
+
    /* Dri stuff
     */
    imesa->hHWContext = driContextPriv->hHWContext;
@@ -682,7 +677,7 @@ savageCreateBuffer( __DRIscreenPrivate *driScrnPriv,
 static void
 savageDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 {
-   _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
+   _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 #if 0
@@ -914,7 +909,7 @@ savageFillInModes( __DRIscreenPrivate *psp,
 
     uint8_t depth_bits_array[2];
     uint8_t stencil_bits_array[2];
-
+    uint8_t msaa_samples_array[1];
 
     depth_bits_array[0] = depth_bits;
     depth_bits_array[1] = depth_bits;
@@ -926,6 +921,8 @@ savageFillInModes( __DRIscreenPrivate *psp,
     stencil_bits_array[0] = 0;
     stencil_bits_array[1] = (stencil_bits == 0) ? 8 : stencil_bits;
 
+    msaa_samples_array[0] = 0;
+
     depth_buffer_factor = ((depth_bits != 0) || (stencil_bits != 0)) ? 2 : 1;
     back_buffer_factor  = (have_back_buffer) ? 2 : 1;
 
@@ -941,7 +938,8 @@ savageFillInModes( __DRIscreenPrivate *psp,
     configs = driCreateConfigs(fb_format, fb_type,
 			       depth_bits_array, stencil_bits_array,
 			       depth_buffer_factor,
-			       back_buffer_modes, back_buffer_factor);
+			       back_buffer_modes, back_buffer_factor,
+                               msaa_samples_array, 1);
     if (configs == NULL) {
 	fprintf( stderr, "[%s:%u] Error creating FBConfig!\n",
 		 __func__, __LINE__ );
diff --git a/src/mesa/drivers/dri/savage/savagecontext.h b/src/mesa/drivers/dri/savage/savagecontext.h
index fd6399d6a6..53a37db1cb 100644
--- a/src/mesa/drivers/dri/savage/savagecontext.h
+++ b/src/mesa/drivers/dri/savage/savagecontext.h
@@ -31,7 +31,6 @@ typedef struct savage_context_t savageContext;
 typedef struct savage_context_t *savageContextPtr;
 typedef struct savage_texture_object_t *savageTextureObjectPtr;
 
-#include <X11/Xlibint.h>
 #include "dri_util.h"
 #include "main/mtypes.h"
 #include "xf86drm.h"
diff --git a/src/mesa/drivers/dri/savage/savagestate.c b/src/mesa/drivers/dri/savage/savagestate.c
index 73d85ed57b..84e1b52585 100644
--- a/src/mesa/drivers/dri/savage/savagestate.c
+++ b/src/mesa/drivers/dri/savage/savagestate.c
@@ -514,7 +514,7 @@ static void savageDDDepthFunc_s4(GLcontext *ctx, GLenum func)
 	imesa->regs.s4.drawLocalCtrl.ni.flushPdZbufWrites = GL_TRUE;
 	imesa->regs.s4.zBufCtrl.ni.zBufEn = GL_TRUE;
     }
-    else if (imesa->glCtx->Stencil.Enabled && imesa->hw_stencil)
+    else if (imesa->glCtx->Stencil._Enabled && imesa->hw_stencil)
     {
         /* Need to keep Z on for Stencil. */
 	imesa->regs.s4.zBufCtrl.ni.zCmpFunc = CF_Always;
@@ -1092,7 +1092,7 @@ static void savageDDEnable_s4(GLcontext *ctx, GLenum cap, GLboolean state)
 		FALLBACK (ctx, SAVAGE_FALLBACK_STENCIL, state);
 	    else {
 		imesa->regs.s4.stencilCtrl.ni.stencilEn = state;
-		if (ctx->Stencil.Enabled &&
+		if (ctx->Stencil._Enabled &&
 		    imesa->regs.s4.zBufCtrl.ni.zBufEn != GL_TRUE)
 		{
 		    /* Stencil buffer requires Z enabled. */
diff --git a/src/mesa/drivers/dri/savage/savagetex.c b/src/mesa/drivers/dri/savage/savagetex.c
index a3bebfa8cf..fe239e1b05 100644
--- a/src/mesa/drivers/dri/savage/savagetex.c
+++ b/src/mesa/drivers/dri/savage/savagetex.c
@@ -474,7 +474,7 @@ static void savageSetTexFilter(savageTexObjPtr t, GLenum minf, GLenum magf)
 
 /* Need a fallback ?
  */
-static void savageSetTexBorderColor(savageTexObjPtr t, GLubyte color[4])
+static void savageSetTexBorderColor(savageTexObjPtr t, const GLfloat color[4])
 {
 /*    t->Setup[SAVAGE_TEXREG_TEXBORDERCOL] =  */
     /*t->setup.borderColor = SAVAGEPACKCOLOR8888(color[0],color[1],color[2],color[3]); */
@@ -512,7 +512,7 @@ savageAllocTexObj( struct gl_texture_object *texObj )
 
       savageSetTexWrapping(t,texObj->WrapS,texObj->WrapT);
       savageSetTexFilter(t,texObj->MinFilter,texObj->MagFilter);
-      savageSetTexBorderColor(t,texObj->_BorderChan);
+      savageSetTexBorderColor(t,texObj->BorderColor);
    }
 
    return t;
@@ -2018,7 +2018,7 @@ static void savageTexParameter( GLcontext *ctx, GLenum target,
       break;
   
    case GL_TEXTURE_BORDER_COLOR:
-      savageSetTexBorderColor(t,tObj->_BorderChan);
+      savageSetTexBorderColor(t,tObj->BorderColor);
       break;
 
    default:
diff --git a/src/mesa/drivers/dri/sis/sis_alloc.c b/src/mesa/drivers/dri/sis/sis_alloc.c
index 4ca4052803..ce34e44da2 100644
--- a/src/mesa/drivers/dri/sis/sis_alloc.c
+++ b/src/mesa/drivers/dri/sis/sis_alloc.c
@@ -137,7 +137,7 @@ sisAllocZStencilBuffer( sisContextPtr smesa )
 {
    int cpp = ( smesa->glCtx->Visual.depthBits +
                smesa->glCtx->Visual.stencilBits ) / 8;
-   unsigned char *addr;
+   char *addr;
 
    smesa->depth.bpp = cpp * 8;
    smesa->depth.pitch = ALIGNMENT(smesa->driDrawable->w * cpp, 4);
@@ -150,7 +150,7 @@ sisAllocZStencilBuffer( sisContextPtr smesa )
    addr = (char *)ALIGNMENT((unsigned long)addr, Z_BUFFER_HW_ALIGNMENT);
 
    smesa->depth.map = addr;
-   smesa->depth.offset = addr - smesa->FbBase;
+   smesa->depth.offset = addr - (char *)smesa->FbBase;
 
    /* stencil buffer is same as depth buffer */
    smesa->stencil.size = smesa->depth.size;
@@ -173,7 +173,7 @@ void
 sisAllocBackbuffer( sisContextPtr smesa )
 {
    int cpp = smesa->bytesPerPixel;
-   unsigned char *addr;
+   char *addr;
 
    smesa->back.bpp = smesa->bytesPerPixel * 8;
    smesa->back.pitch = ALIGNMENT(smesa->driDrawable->w * cpp, 4);
@@ -186,7 +186,7 @@ sisAllocBackbuffer( sisContextPtr smesa )
    addr = (char *)ALIGNMENT((unsigned long)addr, DRAW_BUFFER_HW_ALIGNMENT);
 
    smesa->back.map = addr;
-   smesa->back.offset = addr - smesa->FbBase;
+   smesa->back.offset = addr - (char *)smesa->FbBase;
 }
 
 void
diff --git a/src/mesa/drivers/dri/sis/sis_context.c b/src/mesa/drivers/dri/sis/sis_context.c
index 00d17da3ba..a070fe3d79 100644
--- a/src/mesa/drivers/dri/sis/sis_context.c
+++ b/src/mesa/drivers/dri/sis/sis_context.c
@@ -57,9 +57,6 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/tnl.h"
 #include "tnl/t_pipeline.h"
 
-#define need_GL_ARB_multisample
-#define need_GL_ARB_texture_compression
-#define need_GL_ARB_vertex_buffer_object
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_secondary_color
 #include "extension_helper.h"
@@ -74,12 +71,9 @@ int GlobalCmdQueueLen = 0;
 
 struct dri_extension card_extensions[] =
 {
-    { "GL_ARB_multisample",                GL_ARB_multisample_functions },
     { "GL_ARB_multitexture",               NULL },
     { "GL_ARB_texture_border_clamp",       NULL },
-    { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
     { "GL_ARB_texture_mirrored_repeat",    NULL },
-    { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
     /*{ "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },*/
     { "GL_EXT_texture_lod_bias",           NULL },
     { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
@@ -91,9 +85,7 @@ struct dri_extension card_extensions[] =
 
 struct dri_extension card_extensions_6326[] =
 {
-    { "GL_ARB_multisample",                GL_ARB_multisample_functions },
     /*{ "GL_ARB_texture_border_clamp",       NULL },*/
-    { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
     /*{ "GL_ARB_texture_mirrored_repeat",    NULL },*/
     /*{ "GL_MESA_ycbcr_texture",             NULL },*/
     { NULL,                                NULL }
diff --git a/src/mesa/drivers/dri/sis/sis_screen.c b/src/mesa/drivers/dri/sis/sis_screen.c
index b1a5d15236..b5f04ae28d 100644
--- a/src/mesa/drivers/dri/sis/sis_screen.c
+++ b/src/mesa/drivers/dri/sis/sis_screen.c
@@ -77,6 +77,7 @@ sisFillInModes(__DRIscreenPrivate *psp, int bpp)
    };
    uint8_t depth_bits_array[4];
    uint8_t stencil_bits_array[4];
+   uint8_t msaa_samples_array[1];
 
    depth_bits_array[0] = 0;
    stencil_bits_array[0] = 0;
@@ -87,6 +88,8 @@ sisFillInModes(__DRIscreenPrivate *psp, int bpp)
    depth_bits_array[3] = 32;
    stencil_bits_array[3] = 0;
 
+   msaa_samples_array[0] = 0;
+
    depth_buffer_factor = 4;
    back_buffer_factor = 2;
 
@@ -100,7 +103,8 @@ sisFillInModes(__DRIscreenPrivate *psp, int bpp)
 
    configs = driCreateConfigs(fb_format, fb_type, depth_bits_array,
 			      stencil_bits_array, depth_buffer_factor,
-			      back_buffer_modes, back_buffer_factor);
+			      back_buffer_modes, back_buffer_factor,
+                              msaa_samples_array, 1);
    if (configs == NULL) {
       fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__, __LINE__);
       return NULL;
@@ -217,7 +221,7 @@ sisCreateBuffer( __DRIscreenPrivate *driScrnPriv,
 static void
 sisDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 {
-   _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
+   _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 static void sisCopyBuffer( __DRIdrawablePrivate *dPriv )
diff --git a/src/mesa/drivers/dri/sis/sis_texstate.c b/src/mesa/drivers/dri/sis/sis_texstate.c
index 63f23fc014..46417ce414 100644
--- a/src/mesa/drivers/dri/sis/sis_texstate.c
+++ b/src/mesa/drivers/dri/sis/sis_texstate.c
@@ -456,11 +456,16 @@ sis_set_texobj_parm( GLcontext *ctx, struct gl_texture_object *texObj,
       break;
    }
 
-   current->texture[hw_unit].hwTextureBorderColor = 
-      ((GLuint) texObj->_BorderChan[3] << 24) + 
-      ((GLuint) texObj->_BorderChan[0] << 16) + 
-      ((GLuint) texObj->_BorderChan[1] << 8) + 
-      ((GLuint) texObj->_BorderChan[2]);
+   {
+      GLubyte c[4];
+      CLAMPED_FLOAT_TO_UBYTE(c[0], texObj->BorderColor[0]);
+      CLAMPED_FLOAT_TO_UBYTE(c[1], texObj->BorderColor[1]);
+      CLAMPED_FLOAT_TO_UBYTE(c[2], texObj->BorderColor[2]);
+      CLAMPED_FLOAT_TO_UBYTE(c[3], texObj->BorderColor[3]);
+
+      current->texture[hw_unit].hwTextureBorderColor = 
+         PACK_COLOR_8888(c[3], c[0], c[1], c[2]);
+   }
 
    if (current->texture[hw_unit].hwTextureBorderColor !=
        prev->texture[hw_unit].hwTextureBorderColor) 
diff --git a/src/mesa/drivers/dri/sis/sis_tris.c b/src/mesa/drivers/dri/sis/sis_tris.c
index 095941aea2..76d12d07b3 100644
--- a/src/mesa/drivers/dri/sis/sis_tris.c
+++ b/src/mesa/drivers/dri/sis/sis_tris.c
@@ -994,7 +994,7 @@ sisFlushPrimsLocked(sisContextPtr smesa)
 	 MMIO(REG_3D_PrimitiveSet, smesa->dwPrimitiveSet);
       }
       while (smesa->vb_last < smesa->vb_cur) {
-	 sis_emit_func(smesa, smesa->vb_last);
+	 sis_emit_func(smesa, (char *)smesa->vb_last);
 	 smesa->vb_last += incr;
       }
       mWait3DCmdQueue(1);
diff --git a/src/mesa/drivers/dri/swrast/swrast.c b/src/mesa/drivers/dri/swrast/swrast.c
index 15b57244dc..3016987d56 100644
--- a/src/mesa/drivers/dri/swrast/swrast.c
+++ b/src/mesa/drivers/dri/swrast/swrast.c
@@ -62,8 +62,11 @@
 #define need_GL_SGI_color_table
 
 /* sw extensions not associated with some GL version */
+#define need_GL_ARB_draw_elements_base_vertex
 #define need_GL_ARB_shader_objects
+#define need_GL_ARB_vertex_array_object
 #define need_GL_ARB_vertex_program
+#define need_GL_ARB_sync
 #define need_GL_APPLE_vertex_array_object
 #define need_GL_ATI_fragment_shader
 #define need_GL_ATI_separate_stencil
@@ -72,7 +75,7 @@
 #define need_GL_EXT_framebuffer_blit
 #define need_GL_EXT_gpu_program_parameters
 #define need_GL_EXT_paletted_texture
-#define need_GL_IBM_multimode_draw_arrays
+#define need_GL_EXT_stencil_two_side
 #define need_GL_MESA_resize_buffers
 #define need_GL_NV_vertex_program
 #define need_GL_NV_fragment_program
@@ -93,8 +96,12 @@ const struct dri_extension card_extensions[] =
     { "GL_EXT_histogram",		GL_EXT_histogram_functions },
     { "GL_SGI_color_table",		GL_SGI_color_table_functions },
 
+    { "GL_ARB_depth_clamp",		NULL },
+    { "GL_ARB_draw_elements_base_vertex", GL_ARB_draw_elements_base_vertex_functions },
     { "GL_ARB_shader_objects",		GL_ARB_shader_objects_functions },
+    { "GL_ARB_vertex_array_object",	GL_ARB_vertex_array_object_functions },
     { "GL_ARB_vertex_program",		GL_ARB_vertex_program_functions },
+    { "GL_ARB_sync",			GL_ARB_sync_functions },
     { "GL_APPLE_vertex_array_object",	GL_APPLE_vertex_array_object_functions },
     { "GL_ATI_fragment_shader",		GL_ATI_fragment_shader_functions },
     { "GL_ATI_separate_stencil",	GL_ATI_separate_stencil_functions },
@@ -103,8 +110,9 @@ const struct dri_extension card_extensions[] =
     { "GL_EXT_framebuffer_blit",	GL_EXT_framebuffer_blit_functions },
     { "GL_EXT_gpu_program_parameters",	GL_EXT_gpu_program_parameters_functions },
     { "GL_EXT_paletted_texture",	GL_EXT_paletted_texture_functions },
-    { "GL_IBM_multimode_draw_arrays",	GL_IBM_multimode_draw_arrays_functions },
+    { "GL_EXT_stencil_two_side",	GL_EXT_stencil_two_side_functions },
     { "GL_MESA_resize_buffers",		GL_MESA_resize_buffers_functions },
+    { "GL_NV_depth_clamp",		NULL },
     { "GL_NV_vertex_program",		GL_NV_vertex_program_functions },
     { "GL_NV_fragment_program",		GL_NV_fragment_program_functions },
     { NULL,				NULL }
@@ -147,6 +155,7 @@ swrastFillInModes(__DRIscreen *psp,
 
     uint8_t depth_bits_array[4];
     uint8_t stencil_bits_array[4];
+    uint8_t msaa_samples_array[1];
 
     depth_bits_array[0] = 0;
     depth_bits_array[1] = 0;
@@ -161,26 +170,38 @@ swrastFillInModes(__DRIscreen *psp,
     stencil_bits_array[2] = 0;
     stencil_bits_array[3] = (stencil_bits == 0) ? 8 : stencil_bits;
 
+    msaa_samples_array[0] = 0;
+
     depth_buffer_factor = 4;
     back_buffer_factor = 2;
 
-    if (pixel_bits == 8) {
+    switch (pixel_bits) {
+    case 8:
 	fb_format = GL_RGB;
 	fb_type = GL_UNSIGNED_BYTE_2_3_3_REV;
-    }
-    else if (pixel_bits == 16) {
+	break;
+    case 16:
 	fb_format = GL_RGB;
 	fb_type = GL_UNSIGNED_SHORT_5_6_5;
-    }
-    else {
+	break;
+    case 24:
+	fb_format = GL_BGR;
+	fb_type = GL_UNSIGNED_INT_8_8_8_8_REV;
+	break;
+    case 32:
 	fb_format = GL_BGRA;
 	fb_type = GL_UNSIGNED_INT_8_8_8_8_REV;
+	break;
+    default:
+	fprintf(stderr, "[%s:%u] bad depth %d\n", __func__, __LINE__,
+		pixel_bits);
+	return NULL;
     }
 
     configs = driCreateConfigs(fb_format, fb_type,
 			       depth_bits_array, stencil_bits_array,
 			       depth_buffer_factor, back_buffer_modes,
-			       back_buffer_factor);
+			       back_buffer_factor, msaa_samples_array, 1);
     if (configs == NULL) {
 	fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
 		__LINE__);
@@ -196,7 +217,7 @@ driCreateNewScreen(int scrn, const __DRIextension **extensions,
 {
     static const __DRIextension *emptyExtensionList[] = { NULL };
     __DRIscreen *psp;
-    __DRIconfig **configs8, **configs16, **configs32;
+    __DRIconfig **configs8, **configs16, **configs24, **configs32;
 
     (void) data;
 
@@ -213,11 +234,13 @@ driCreateNewScreen(int scrn, const __DRIextension **extensions,
 
     configs8  = swrastFillInModes(psp,  8,  8, 0, 1);
     configs16 = swrastFillInModes(psp, 16, 16, 0, 1);
+    configs24 = swrastFillInModes(psp, 24, 24, 8, 1);
     configs32 = swrastFillInModes(psp, 32, 24, 8, 1);
 
-    configs16 = (__DRIconfig **)driConcatConfigs(configs8, configs16);
-
-    *driver_configs = driConcatConfigs(configs16, configs32);
+    configs16 = driConcatConfigs(configs8, configs16);
+    configs24 = driConcatConfigs(configs16, configs24);
+    *driver_configs = (const __DRIconfig **)
+       driConcatConfigs(configs24, configs32);
 
     driInitExtensions( NULL, card_extensions, GL_FALSE );
 
@@ -249,19 +272,24 @@ static GLuint
 choose_pixel_format(const GLvisual *v)
 {
     if (v->rgbMode) {
-	int bpp = v->rgbBits;
+	int depth = v->rgbBits;
 
-	if (bpp == 32
+	if (depth == 32
 	    && v->redMask   == 0xff0000
 	    && v->greenMask == 0x00ff00
 	    && v->blueMask  == 0x0000ff)
 	    return PF_A8R8G8B8;
-	else if (bpp == 16
+	else if (depth == 24
+	    && v->redMask   == 0xff0000
+	    && v->greenMask == 0x00ff00
+	    && v->blueMask  == 0x0000ff)
+	    return PF_X8R8G8B8;
+	else if (depth == 16
 	    && v->redMask   == 0xf800
 	    && v->greenMask == 0x07e0
 	    && v->blueMask  == 0x001f)
 	    return PF_R5G6B5;
-	else if (bpp == 8
+	else if (depth == 8
 	    && v->redMask   == 0x07
 	    && v->greenMask == 0x38
 	    && v->blueMask  == 0xc0)
@@ -290,7 +318,6 @@ swrast_alloc_front_storage(GLcontext *ctx, struct gl_renderbuffer *rb,
 			   GLenum internalFormat, GLuint width, GLuint height)
 {
     struct swrast_renderbuffer *xrb = swrast_renderbuffer(rb);
-    int bpp;
     unsigned mask = PITCH_ALIGN_BITS - 1;
 
     TRACE;
@@ -299,23 +326,8 @@ swrast_alloc_front_storage(GLcontext *ctx, struct gl_renderbuffer *rb,
     rb->Width = width;
     rb->Height = height;
 
-    switch (internalFormat) {
-    case GL_RGB:
-	bpp = rb->RedBits + rb->GreenBits + rb->BlueBits;
-	break;
-    case GL_RGBA:
-	bpp = rb->RedBits + rb->GreenBits + rb->BlueBits + rb->AlphaBits;
-	break;
-    case GL_COLOR_INDEX8_EXT:
-	bpp = rb->IndexBits;
-	break;
-    default:
-	_mesa_problem( NULL, "unexpected format in %s", __FUNCTION__ );
-	return GL_FALSE;
-    }
-
     /* always pad to PITCH_ALIGN_BITS */
-    xrb->pitch = ((width * bpp + mask) & ~mask) / 8;
+    xrb->pitch = ((width * xrb->bpp + mask) & ~mask) / 8;
 
     return GL_TRUE;
 }
@@ -371,6 +383,17 @@ swrast_new_renderbuffer(const GLvisual *visual, GLboolean front)
 	xrb->Base.GreenBits = 8 * sizeof(GLubyte);
 	xrb->Base.BlueBits  = 8 * sizeof(GLubyte);
 	xrb->Base.AlphaBits = 8 * sizeof(GLubyte);
+	xrb->bpp = 32;
+	break;
+    case PF_X8R8G8B8:
+	xrb->Base.InternalFormat = GL_RGB;
+	xrb->Base._BaseFormat = GL_RGB;
+	xrb->Base.DataType = GL_UNSIGNED_BYTE;
+	xrb->Base.RedBits   = 8 * sizeof(GLubyte);
+	xrb->Base.GreenBits = 8 * sizeof(GLubyte);
+	xrb->Base.BlueBits  = 8 * sizeof(GLubyte);
+	xrb->Base.AlphaBits = 0;
+	xrb->bpp = 32;
 	break;
     case PF_R5G6B5:
 	xrb->Base.InternalFormat = GL_RGB;
@@ -380,6 +403,7 @@ swrast_new_renderbuffer(const GLvisual *visual, GLboolean front)
 	xrb->Base.GreenBits = 6 * sizeof(GLubyte);
 	xrb->Base.BlueBits  = 5 * sizeof(GLubyte);
 	xrb->Base.AlphaBits = 0;
+	xrb->bpp = 16;
 	break;
     case PF_R3G3B2:
 	xrb->Base.InternalFormat = GL_RGB;
@@ -389,12 +413,14 @@ swrast_new_renderbuffer(const GLvisual *visual, GLboolean front)
 	xrb->Base.GreenBits = 3 * sizeof(GLubyte);
 	xrb->Base.BlueBits  = 2 * sizeof(GLubyte);
 	xrb->Base.AlphaBits = 0;
+	xrb->bpp = 8;
 	break;
     case PF_CI8:
 	xrb->Base.InternalFormat = GL_COLOR_INDEX8_EXT;
 	xrb->Base._BaseFormat = GL_COLOR_INDEX;
 	xrb->Base.DataType = GL_UNSIGNED_BYTE;
 	xrb->Base.IndexBits = 8 * sizeof(GLubyte);
+	xrb->bpp = 8;
 	break;
     default:
 	return NULL;
@@ -458,7 +484,7 @@ driDestroyDrawable(__DRIdrawable *buf)
 	_mesa_free(buf->row);
 
 	fb->DeletePending = GL_TRUE;
-	_mesa_unreference_framebuffer(&fb);
+	_mesa_reference_framebuffer(&fb, NULL);
     }
 }
 
diff --git a/src/mesa/drivers/dri/swrast/swrast_priv.h b/src/mesa/drivers/dri/swrast/swrast_priv.h
index a707ffc40a..1a5fb31d5a 100644
--- a/src/mesa/drivers/dri/swrast/swrast_priv.h
+++ b/src/mesa/drivers/dri/swrast/swrast_priv.h
@@ -90,6 +90,8 @@ struct swrast_renderbuffer {
 
     /* renderbuffer pitch (in bytes) */
     GLuint pitch;
+    /* bits per pixel of storage (used to compute the pitch) */
+    GLuint bpp;
 };
 
 static INLINE __DRIcontext *
@@ -115,10 +117,10 @@ swrast_renderbuffer(struct gl_renderbuffer *rb)
  * Pixel formats we support
  */
 #define PF_CI8        1		/**< Color Index mode */
-#define PF_A8R8G8B8   2		/**< 32-bit TrueColor:  8-A, 8-R, 8-G, 8-B bits */
-#define PF_R5G6B5     3		/**< 16-bit TrueColor:  5-R, 6-G, 5-B bits */
-#define PF_R3G3B2     4		/**<  8-bit TrueColor:  3-R, 3-G, 2-B bits */
-
+#define PF_A8R8G8B8   2		/**< 32bpp TrueColor:  8-A, 8-R, 8-G, 8-B bits */
+#define PF_R5G6B5     3		/**< 16bpp TrueColor:  5-R, 6-G, 5-B bits */
+#define PF_R3G3B2     4		/**<  8bpp TrueColor:  3-R, 3-G, 2-B bits */
+#define PF_X8R8G8B8   5		/**< 32bpp TrueColor:  8-X (padding), 8-R, 8-G, 8-B bits */
 
 /**
  * Renderbuffer pitch alignment (in bits).
diff --git a/src/mesa/drivers/dri/swrast/swrast_span.c b/src/mesa/drivers/dri/swrast/swrast_span.c
index 5e990368b2..2d3c25dcbe 100644
--- a/src/mesa/drivers/dri/swrast/swrast_span.c
+++ b/src/mesa/drivers/dri/swrast/swrast_span.c
@@ -79,6 +79,24 @@ static const GLubyte kernel[16] = {
    DST[BCOMP] = SRC[0]
 
 
+/* 32-bit BGRX */
+#define STORE_PIXEL_X8R8G8B8(DST, X, Y, VALUE) \
+   DST[3] = 0xff; \
+   DST[2] = VALUE[RCOMP]; \
+   DST[1] = VALUE[GCOMP]; \
+   DST[0] = VALUE[BCOMP]
+#define STORE_PIXEL_RGB_X8R8G8B8(DST, X, Y, VALUE) \
+   DST[3] = 0xff; \
+   DST[2] = VALUE[RCOMP]; \
+   DST[1] = VALUE[GCOMP]; \
+   DST[0] = VALUE[BCOMP]
+#define FETCH_PIXEL_X8R8G8B8(DST, SRC) \
+   DST[ACOMP] = 0xff; \
+   DST[RCOMP] = SRC[2]; \
+   DST[GCOMP] = SRC[1]; \
+   DST[BCOMP] = SRC[0]
+
+
 /* 16-bit BGR */
 #define STORE_PIXEL_R5G6B5(DST, X, Y, VALUE) \
    do { \
@@ -139,6 +157,24 @@ static const GLubyte kernel[16] = {
 #include "swrast/s_spantemp.h"
 
 
+/* 32-bit BGRX */
+#define NAME(FUNC) FUNC##_X8R8G8B8
+#define RB_TYPE GLubyte
+#define SPAN_VARS \
+   struct swrast_renderbuffer *xrb = swrast_renderbuffer(rb);
+#define INIT_PIXEL_PTR(P, X, Y) \
+   GLubyte *P = (GLubyte *)xrb->Base.Data + YFLIP(xrb, Y) * xrb->pitch + (X) * 4;
+#define INC_PIXEL_PTR(P) P += 4
+#define STORE_PIXEL(DST, X, Y, VALUE) \
+   STORE_PIXEL_X8R8G8B8(DST, X, Y, VALUE)
+#define STORE_PIXEL_RGB(DST, X, Y, VALUE) \
+   STORE_PIXEL_RGB_X8R8G8B8(DST, X, Y, VALUE)
+#define FETCH_PIXEL(DST, SRC) \
+   FETCH_PIXEL_X8R8G8B8(DST, SRC)
+
+#include "swrast/s_spantemp.h"
+
+
 /* 16-bit BGR */
 #define NAME(FUNC) FUNC##_R5G6B5
 #define RB_TYPE GLubyte
@@ -210,6 +246,24 @@ static const GLubyte kernel[16] = {
 #include "swrast_spantemp.h"
 
 
+/* 32-bit BGRX */
+#define NAME(FUNC) FUNC##_X8R8G8B8_front
+#define RB_TYPE GLubyte
+#define SPAN_VARS \
+   struct swrast_renderbuffer *xrb = swrast_renderbuffer(rb);
+#define INIT_PIXEL_PTR(P, X, Y) \
+   GLubyte *P = (GLubyte *)row;
+#define INC_PIXEL_PTR(P) P += 4
+#define STORE_PIXEL(DST, X, Y, VALUE) \
+   STORE_PIXEL_X8R8G8B8(DST, X, Y, VALUE)
+#define STORE_PIXEL_RGB(DST, X, Y, VALUE) \
+   STORE_PIXEL_RGB_X8R8G8B8(DST, X, Y, VALUE)
+#define FETCH_PIXEL(DST, SRC) \
+   FETCH_PIXEL_X8R8G8B8(DST, SRC)
+
+#include "swrast_spantemp.h"
+
+
 /* 16-bit BGR */
 #define NAME(FUNC) FUNC##_R5G6B5_front
 #define RB_TYPE GLubyte
@@ -279,6 +333,15 @@ swrast_set_span_funcs_back(struct swrast_renderbuffer *xrb,
 	xrb->Base.PutValues = put_values_A8R8G8B8;
 	xrb->Base.PutMonoValues = put_mono_values_A8R8G8B8;
 	break;
+    case PF_X8R8G8B8:
+	xrb->Base.GetRow = get_row_X8R8G8B8;
+	xrb->Base.GetValues = get_values_X8R8G8B8;
+	xrb->Base.PutRow = put_row_X8R8G8B8;
+	xrb->Base.PutRowRGB = put_row_rgb_X8R8G8B8;
+	xrb->Base.PutMonoRow = put_mono_row_X8R8G8B8;
+	xrb->Base.PutValues = put_values_X8R8G8B8;
+	xrb->Base.PutMonoValues = put_mono_values_X8R8G8B8;
+	break;
     case PF_R5G6B5:
 	xrb->Base.GetRow = get_row_R5G6B5;
 	xrb->Base.GetValues = get_values_R5G6B5;
@@ -334,6 +397,15 @@ swrast_set_span_funcs_front(struct swrast_renderbuffer *xrb,
 	xrb->Base.PutValues = put_values_A8R8G8B8_front;
 	xrb->Base.PutMonoValues = put_mono_values_A8R8G8B8_front;
 	break;
+    case PF_X8R8G8B8:
+	xrb->Base.GetRow = get_row_X8R8G8B8_front;
+	xrb->Base.GetValues = get_values_X8R8G8B8_front;
+	xrb->Base.PutRow = put_row_X8R8G8B8_front;
+	xrb->Base.PutRowRGB = put_row_rgb_X8R8G8B8_front;
+	xrb->Base.PutMonoRow = put_mono_row_X8R8G8B8_front;
+	xrb->Base.PutValues = put_values_X8R8G8B8_front;
+	xrb->Base.PutMonoValues = put_mono_values_X8R8G8B8_front;
+	break;
     case PF_R5G6B5:
 	xrb->Base.GetRow = get_row_R5G6B5_front;
 	xrb->Base.GetValues = get_values_R5G6B5_front;
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_context.c b/src/mesa/drivers/dri/tdfx/tdfx_context.c
index ef688d103d..68b5027561 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_context.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_context.c
@@ -58,21 +58,15 @@
 
 #include "utils.h"
 
-#define need_GL_ARB_multisample
 /* #define need_GL_ARB_point_parameters */
 #define need_GL_ARB_occlusion_query
-#define need_GL_ARB_texture_compression
-#define need_GL_ARB_vertex_buffer_object
 /* #define need_GL_ARB_vertex_program */
 #define need_GL_EXT_blend_equation_separate
 #define need_GL_EXT_blend_func_separate
 #define need_GL_EXT_blend_minmax
 #define need_GL_EXT_fog_coord
-#define need_GL_EXT_multi_draw_arrays
 #define need_GL_EXT_paletted_texture
 /* #define need_GL_EXT_secondary_color */
-#define need_GL_IBM_multimode_draw_arrays
-/* #define need_GL_MESA_program_debug */
 /* #define need_GL_NV_vertex_program */
 #include "extension_helper.h"
 
@@ -82,20 +76,16 @@
  */
 const struct dri_extension card_extensions[] =
 {
-    { "GL_ARB_multisample",                GL_ARB_multisample_functions },
     { "GL_ARB_occlusion_query",            GL_ARB_occlusion_query_functions },
     { "GL_ARB_texture_mirrored_repeat",    NULL },
-    { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
 
     { "GL_EXT_blend_func_separate",        GL_EXT_blend_func_separate_functions },
     { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
-    { "GL_EXT_multi_draw_arrays",          GL_EXT_multi_draw_arrays_functions },
     { "GL_EXT_paletted_texture",           GL_EXT_paletted_texture_functions },
     { "GL_EXT_shared_texture_palette",     NULL },
     { "GL_EXT_stencil_wrap",               NULL },
     { "GL_EXT_texture_env_add",            NULL },
     { "GL_EXT_texture_lod_bias",           NULL },
-    { "GL_IBM_multimode_draw_arrays",      GL_IBM_multimode_draw_arrays_functions },
 
 #ifdef need_GL_ARB_point_parameters
     { "GL_ARB_point_parameters",           GL_ARB_point_parameters_functions },
@@ -111,9 +101,6 @@ const struct dri_extension card_extensions[] =
     { "GL_NV_vertex_program",              GL_NV_vertex_program_functions }
     { "GL_NV_vertex_program1_1",           NULL },
 #endif
-#ifdef need_GL_MESA_program_debug
-    { "GL_MESA_program_debug",             GL_MESA_program_debug_functions },
-#endif
     { NULL,                                NULL }
 };
 
@@ -122,7 +109,6 @@ const struct dri_extension card_extensions[] =
  */
 const struct dri_extension napalm_extensions[] =
 {
-    { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
     { "GL_ARB_texture_env_combine",        NULL },
     { "GL_EXT_blend_equation_separate",    GL_EXT_blend_equation_separate_functions },
     { "GL_EXT_blend_subtract",             GL_EXT_blend_minmax_functions },
@@ -319,6 +305,8 @@ GLboolean tdfxCreateContext( const __GLcontextModes *mesaVis,
    ctx->Const.MaxLineWidthAA = 1.0;
    ctx->Const.LineWidthGranularity = 1.0;
 
+   ctx->Const.MaxDrawBuffers = 1;
+
    /* Initialize the software rasterizer and helper modules.
     */
    _swrast_CreateContext( ctx );
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_dd.c b/src/mesa/drivers/dri/tdfx/tdfx_dd.c
index 2cef079515..8472df607a 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_dd.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_dd.c
@@ -110,13 +110,13 @@ static const GLubyte *tdfxDDGetString( GLcontext *ctx, GLenum name )
 
 
 static void
-tdfxBeginQuery(GLcontext *ctx, GLenum target, struct gl_query_object *q)
+tdfxBeginQuery(GLcontext *ctx, struct gl_query_object *q)
 {
    tdfxContextPtr fxMesa = TDFX_CONTEXT(ctx);
 
    (void) q;
 
-   if (target == GL_SAMPLES_PASSED_ARB) {
+   if (q->Target == GL_SAMPLES_PASSED_ARB) {
       LOCK_HARDWARE(fxMesa);
       fxMesa->Glide.grFinish();
       fxMesa->Glide.grReset(GR_STATS_PIXELS);
@@ -126,14 +126,14 @@ tdfxBeginQuery(GLcontext *ctx, GLenum target, struct gl_query_object *q)
 
 
 static void
-tdfxEndQuery(GLcontext *ctx, GLenum target, struct gl_query_object *q)
+tdfxEndQuery(GLcontext *ctx, struct gl_query_object *q)
 {
    tdfxContextPtr fxMesa = TDFX_CONTEXT(ctx);
    FxI32 total_pixels;
    FxI32 z_fail_pixels;
 
 
-   if (target == GL_SAMPLES_PASSED_ARB) {
+   if (q->Target == GL_SAMPLES_PASSED_ARB) {
       LOCK_HARDWARE(fxMesa);
       fxMesa->Glide.grFinish();
 
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_pixels.c b/src/mesa/drivers/dri/tdfx/tdfx_pixels.c
index 9ab9c05f2b..18729d5ae0 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_pixels.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_pixels.c
@@ -610,7 +610,7 @@ tdfx_drawpixels_R8G8B8A8(GLcontext * ctx, GLint x, GLint y,
        ctx->Depth.Test ||
        ctx->Fog.Enabled ||
        ctx->Scissor.Enabled ||
-       ctx->Stencil.Enabled ||
+       ctx->Stencil._Enabled ||
        !ctx->Color.ColorMask[0] ||
        !ctx->Color.ColorMask[1] ||
        !ctx->Color.ColorMask[2] ||
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_render.c b/src/mesa/drivers/dri/tdfx/tdfx_render.c
index cf840c57a7..2cd8e12d95 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_render.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_render.c
@@ -740,7 +740,7 @@ void tdfxEmitHwStateLocked( tdfxContextPtr fxMesa )
    }
 
    if ( fxMesa->dirty & TDFX_UPLOAD_STENCIL ) {
-      if (fxMesa->glCtx->Stencil.Enabled) {
+      if (fxMesa->glCtx->Stencil._Enabled) {
          fxMesa->Glide.grEnable(GR_STENCIL_MODE_EXT);
          fxMesa->Glide.grStencilOp(fxMesa->Stencil.FailFunc,
                                    fxMesa->Stencil.ZFailFunc,
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_screen.c b/src/mesa/drivers/dri/tdfx/tdfx_screen.c
index cd22b84951..58bd48b294 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_screen.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_screen.c
@@ -232,7 +232,7 @@ tdfxCreateBuffer( __DRIscreenPrivate *driScrnPriv,
 static void
 tdfxDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 {
-   _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
+   _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 
@@ -361,6 +361,7 @@ tdfxFillInModes(__DRIscreenPrivate *psp,
 	static const GLenum db_modes[2] = { GLX_NONE, GLX_SWAP_UNDEFINED_OML };
 	uint8_t depth_bits_array[4];
 	uint8_t stencil_bits_array[4];
+        uint8_t msaa_samples_array[1];
 	if(deep) {
 		depth_bits_array[0] = 0;
 		depth_bits_array[1] = 24;
@@ -377,13 +378,17 @@ tdfxFillInModes(__DRIscreenPrivate *psp,
 		stencil_bits_array[3] = 8;
 	}
 
-	return driCreateConfigs(
-		deep ? GL_RGBA : GL_RGB,
-		deep ? GL_UNSIGNED_INT_8_8_8_8 : GL_UNSIGNED_SHORT_5_6_5,
-		depth_bits_array,
-		stencil_bits_array,
-		deep ? 2 : 4,
-		db_modes, 2);
+	msaa_samples_array[0] = 0;
+
+	return (const __DRIconfig **)
+	   driCreateConfigs(deep ? GL_RGBA : GL_RGB,
+			    deep ? GL_UNSIGNED_INT_8_8_8_8 :
+				   GL_UNSIGNED_SHORT_5_6_5,
+			    depth_bits_array,
+			    stencil_bits_array,
+			    deep ? 2 : 4,
+			    db_modes, 2,
+			    msaa_samples_array, 1);
 }
 
 /**
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_state.c b/src/mesa/drivers/dri/tdfx/tdfx_state.c
index a2d7bcd97d..591df8a905 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_state.c
+++ b/src/mesa/drivers/dri/tdfx/tdfx_state.c
@@ -459,7 +459,7 @@ static void tdfxUpdateStencil( GLcontext *ctx )
    }
 
    if (fxMesa->haveHwStencil) {
-      if (ctx->Stencil.Enabled) {
+      if (ctx->Stencil._Enabled) {
          fxMesa->Stencil.Function = ctx->Stencil.Function[0] - GL_NEVER + GR_CMP_NEVER;
          fxMesa->Stencil.RefValue = ctx->Stencil.Ref[0] & 0xff;
          fxMesa->Stencil.ValueMask = ctx->Stencil.ValueMask[0] & 0xff;
diff --git a/src/mesa/drivers/dri/trident/trident_context.c b/src/mesa/drivers/dri/trident/trident_context.c
index e134cfcf8e..b5126b07ea 100644
--- a/src/mesa/drivers/dri/trident/trident_context.c
+++ b/src/mesa/drivers/dri/trident/trident_context.c
@@ -41,6 +41,7 @@
 #include "main/extensions.h"
 #include "main/framebuffer.h"
 #include "main/renderbuffer.h"
+#include "main/viewport.h"
 #if defined(USE_X86_ASM)
 #include "x86/common_x86_asm.h"
 #endif
@@ -128,6 +129,8 @@ tridentCreateContext( const __GLcontextModes *glVisual,
    ctx->Const.MaxPointSizeAA = 16.0; 
    ctx->Const.PointSizeGranularity = 0.25;
 
+   ctx->Const.MaxDrawBuffers = 1;
+
 #if 0
    tmesa->texHeap = mmInit( 0, tmesa->tridentScreen->textureSize );
 
@@ -279,7 +282,7 @@ tridentCreateBuffer( __DRIscreenPrivate *driScrnPriv,
 static void
 tridentDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 {
-   _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
+   _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/trident/trident_vb.c b/src/mesa/drivers/dri/trident/trident_vb.c
index b231f5ef15..055a914595 100644
--- a/src/mesa/drivers/dri/trident/trident_vb.c
+++ b/src/mesa/drivers/dri/trident/trident_vb.c
@@ -402,7 +402,7 @@ void tridentInitVB( GLcontext *ctx )
    tridentContextPtr tmesa = TRIDENT_CONTEXT(ctx);
    GLuint size = TNL_CONTEXT(ctx)->vb.Size;
 
-   tmesa->verts = (char *)ALIGN_MALLOC( size * 16 * 4, 32 );
+   tmesa->verts = (GLubyte *)ALIGN_MALLOC( size * 16 * 4, 32 );
 
    {
       static int firsttime = 1;
diff --git a/src/mesa/drivers/dri/unichrome/via_context.c b/src/mesa/drivers/dri/unichrome/via_context.c
index f5bdb65eb0..6eb19ac079 100644
--- a/src/mesa/drivers/dri/unichrome/via_context.c
+++ b/src/mesa/drivers/dri/unichrome/via_context.c
@@ -62,9 +62,7 @@
 #include "main/macros.h"
 #include "drirenderbuffer.h"
 
-#define need_GL_ARB_multisample
 #define need_GL_ARB_point_parameters
-#define need_GL_ARB_vertex_buffer_object
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_secondary_color
 #include "extension_helper.h"
@@ -366,14 +364,12 @@ void viaReAllocateBuffers(GLcontext *ctx, GLframebuffer *drawbuffer,
  */
 const struct dri_extension card_extensions[] =
 {
-    { "GL_ARB_multisample",                GL_ARB_multisample_functions },
     { "GL_ARB_multitexture",               NULL },
     { "GL_ARB_point_parameters",           GL_ARB_point_parameters_functions },
     { "GL_ARB_texture_env_add",            NULL },
     { "GL_ARB_texture_env_combine",        NULL },
 /*    { "GL_ARB_texture_env_dot3",           NULL }, */
     { "GL_ARB_texture_mirrored_repeat",    NULL },
-    { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
     { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
     { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
     { "GL_EXT_stencil_wrap",               NULL },
@@ -577,6 +573,8 @@ viaCreateContext(const __GLcontextModes *visual,
     ctx->Const.MaxPointSizeAA = 1.0;
     ctx->Const.PointSizeGranularity = 1.0;
 
+    ctx->Const.MaxDrawBuffers = 1;
+
     ctx->Driver.GetString = viaGetString;
 
     ctx->DriverCtx = (void *)vmesa;
diff --git a/src/mesa/drivers/dri/unichrome/via_screen.c b/src/mesa/drivers/dri/unichrome/via_screen.c
index 988f9935ac..3dbb570571 100644
--- a/src/mesa/drivers/dri/unichrome/via_screen.c
+++ b/src/mesa/drivers/dri/unichrome/via_screen.c
@@ -316,7 +316,7 @@ viaCreateBuffer(__DRIscreenPrivate *driScrnPriv,
 static void
 viaDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 {
-   _mesa_unreference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)));
+   _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
 static const __DRIconfig **
@@ -342,6 +342,7 @@ viaFillInModes( __DRIscreenPrivate *psp,
      */
     static const uint8_t depth_bits_array[4]   = { 0, 16, 24, 32 };
     static const uint8_t stencil_bits_array[4] = { 0,  0,  8,  0 };
+    uint8_t msaa_samples_array[1] = { 0 };
     const unsigned depth_buffer_factor = 3;
 
     if ( pixel_bits == 16 ) {
@@ -356,7 +357,8 @@ viaFillInModes( __DRIscreenPrivate *psp,
     configs = driCreateConfigs(fb_format, fb_type,
 			       depth_bits_array, stencil_bits_array,
 			       depth_buffer_factor, back_buffer_modes,
-			       back_buffer_factor);
+			       back_buffer_factor,
+                               msaa_samples_array, 1);
     if (configs == NULL) {
 	fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
 		__LINE__);
diff --git a/src/mesa/drivers/dri/unichrome/via_state.c b/src/mesa/drivers/dri/unichrome/via_state.c
index 1cef01ab03..840e4e42da 100644
--- a/src/mesa/drivers/dri/unichrome/via_state.c
+++ b/src/mesa/drivers/dri/unichrome/via_state.c
@@ -1342,7 +1342,7 @@ static void viaChooseStencilState(GLcontext *ctx)
 {
     struct via_context *vmesa = VIA_CONTEXT(ctx);
     
-    if (ctx->Stencil.Enabled) {
+    if (ctx->Stencil._Enabled) {
         GLuint temp;
 
         vmesa->regEnable |= HC_HenST_MASK;
diff --git a/src/mesa/drivers/fbdev/glfbdev.c b/src/mesa/drivers/fbdev/glfbdev.c
index 3c874ba57a..531558dc4d 100644
--- a/src/mesa/drivers/fbdev/glfbdev.c
+++ b/src/mesa/drivers/fbdev/glfbdev.c
@@ -685,7 +685,7 @@ glFBDevDestroyBuffer( GLFBDevBufferPtr buffer )
       }
       {
          struct gl_framebuffer *fb = &buffer->glframebuffer;
-         _mesa_unreference_framebuffer(&fb);
+         _mesa_reference_framebuffer(&fb, NULL);
       }
    }
 }
diff --git a/src/mesa/drivers/ggi/default/stubs.c b/src/mesa/drivers/ggi/default/stubs.c
index 7b442b6d20..62722972b2 100644
--- a/src/mesa/drivers/ggi/default/stubs.c
+++ b/src/mesa/drivers/ggi/default/stubs.c
@@ -472,7 +472,7 @@ static void GGItriangle_flat_depth(GLcontext *ctx, const SWvertex *v0, const SWv
 
 static swrast_tri_func ggimesa_stubs_get_triangle_func(GLcontext *ctx)
 {
-	if (ctx->Stencil.Enabled) return NULL;
+	if (ctx->Stencil._Enabled) return NULL;
 	if (ctx->Polygon.SmoothFlag) return NULL;
 	if (ctx->Polygon.StippleFlag) return NULL;
 	if (ctx->Texture._ReallyEnabled) return NULL;  
diff --git a/src/mesa/drivers/glide/fxapi.c b/src/mesa/drivers/glide/fxapi.c
index b7708fd636..238f491599 100644
--- a/src/mesa/drivers/glide/fxapi.c
+++ b/src/mesa/drivers/glide/fxapi.c
@@ -728,7 +728,7 @@ errorhandler:
        FREE(fxMesa->fogTable);
     }
     if (fxMesa->glBuffer) {
-       _mesa_unreference_framebuffer(&fxMesa->glBuffer);
+       _mesa_reference_framebuffer(&fxMesa->glBuffer, NULL);
     }
     if (fxMesa->glVis) {
        _mesa_destroy_visual(fxMesa->glVis);
@@ -828,7 +828,7 @@ fxMesaDestroyContext(fxMesaContext fxMesa)
    fxDDDestroyFxMesaContext(fxMesa); /* must be before _mesa_destroy_context */
    _mesa_destroy_visual(fxMesa->glVis);
    _mesa_destroy_context(fxMesa->glCtx);
-   _mesa_unreference_framebuffer(&fxMesa->glBuffer);
+   _mesa_reference_framebuffer(&fxMesa->glBuffer, NULL);
    fxTMClose(fxMesa); /* must be after _mesa_destroy_context */
 
    FREE(fxMesa);
diff --git a/src/mesa/drivers/glide/fxdd.c b/src/mesa/drivers/glide/fxdd.c
index 213ef2382f..2bc60399ea 100644
--- a/src/mesa/drivers/glide/fxdd.c
+++ b/src/mesa/drivers/glide/fxdd.c
@@ -1773,6 +1773,8 @@ fxDDInitFxMesaContext(fxMesaContext fxMesa)
    ctx->Const.MaxTextureImageUnits = fxMesa->haveTwoTMUs ? 2 : 1;
    ctx->Const.MaxTextureUnits = MAX2(ctx->Const.MaxTextureImageUnits, ctx->Const.MaxTextureCoordUnits);
 
+   ctx->Const.MaxDrawBuffers = 1;
+
    fxMesa->new_state = _NEW_ALL;
    if (!fxMesa->haveHwStencil) {
       /* don't touch stencil if there is none */
@@ -1862,7 +1864,6 @@ fxDDInitExtensions(GLcontext * ctx)
    }
 
    if (fxMesa->type >= GR_SSTTYPE_Voodoo4) {
-      _mesa_enable_extension(ctx, "GL_ARB_texture_compression");
       _mesa_enable_extension(ctx, "GL_3DFX_texture_compression_FXT1");
       _mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
       _mesa_enable_extension(ctx, "GL_S3_s3tc");
@@ -1894,7 +1895,6 @@ fxDDInitExtensions(GLcontext * ctx)
        * fxSetupSingleTMU_NoLock/fxSetupDoubleTMU_NoLock:
        *    grTexDownloadTable(GR_TEXTABLE_NCC0, &(ti->palette));
        */
-      /*_mesa_enable_extension(ctx, "GL_ARB_texture_compression");*/
       _mesa_enable_extension(ctx, "GL_SGIS_generate_mipmap");
    }
 
@@ -1917,15 +1917,11 @@ fxDDInitExtensions(GLcontext * ctx)
    }
 
    /* core-level extensions */
-   _mesa_enable_extension(ctx, "GL_EXT_multi_draw_arrays");
-   _mesa_enable_extension(ctx, "GL_IBM_multimode_draw_arrays");
-   _mesa_enable_extension(ctx, "GL_ARB_vertex_buffer_object");
    /* dangerous */
    if (getenv("MESA_FX_ALLOW_VP")) {
       _mesa_enable_extension(ctx, "GL_ARB_vertex_program");
       _mesa_enable_extension(ctx, "GL_NV_vertex_program");
       _mesa_enable_extension(ctx, "GL_NV_vertex_program1_1");
-      _mesa_enable_extension(ctx, "GL_MESA_program_debug");
    }
 #if 0
    /* this requires _tnl_vertex_cull_stage in the pipeline */
@@ -1951,7 +1947,7 @@ fx_check_IsInHardware(GLcontext * ctx)
       return FX_FALLBACK_RENDER_MODE;
    }
 
-   if (ctx->Stencil.Enabled && !fxMesa->haveHwStencil) {
+   if (ctx->Stencil._Enabled && !fxMesa->haveHwStencil) {
       return FX_FALLBACK_STENCIL;
    }
 
diff --git a/src/mesa/drivers/glslcompiler/glslcompiler.c b/src/mesa/drivers/glslcompiler/glslcompiler.c
index 34cce977c8..e4527abdec 100644
--- a/src/mesa/drivers/glslcompiler/glslcompiler.c
+++ b/src/mesa/drivers/glslcompiler/glslcompiler.c
@@ -72,6 +72,8 @@ struct options {
    const char *VertFile;
    const char *FragFile;
    const char *OutputFile;
+   GLboolean Params;
+   struct gl_sl_pragmas Pragmas;
 };
 
 static struct options Options;
@@ -147,6 +149,9 @@ CreateContext(void)
    TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline;
    _swsetup_Wakeup( ctx );
 
+   /* Override the context's default pragma settings */
+   ctx->Shader.DefaultPragmas = Options.Pragmas;
+
    _mesa_make_current(ctx, buf, buf);
 
    return GL_TRUE;
@@ -227,7 +232,9 @@ PrintShaderInstructions(GLuint shader, FILE *f)
    GET_CURRENT_CONTEXT(ctx);
    struct gl_shader *sh = _mesa_lookup_shader(ctx, shader);
    struct gl_program *prog = sh->Program;
-   _mesa_print_program_opt(prog, Options.Mode, Options.LineNumbers);
+   _mesa_fprint_program_opt(stdout, prog, Options.Mode, Options.LineNumbers);
+   if (Options.Params)
+      _mesa_print_program_parameters(ctx, prog);
 }
 
 
@@ -253,11 +260,15 @@ Usage(void)
    printf("Usage:\n");
    printf("  --vs FILE          vertex shader input filename\n");
    printf("  --fs FILE          fragment shader input filename\n");
-   printf("  --arb              emit ARB-style instructions (the default)\n");
+   printf("  --arb              emit ARB-style instructions\n");
    printf("  --nv               emit NV-style instructions\n");
-   printf("  --debug            emit debug-style instructions\n");
-   printf("  --number, -n       emit line numbers\n");
+   printf("  --debug            force #pragma debug(on)\n");
+   printf("  --nodebug          force #pragma debug(off)\n");
+   printf("  --opt              force #pragma optimize(on)\n");
+   printf("  --noopt            force #pragma optimize(off)\n");
+   printf("  --number, -n       emit line numbers (if --arb or --nv)\n");
    printf("  --output, -o FILE  output filename\n");
+   printf("  --params           also emit program parameter info\n");
    printf("  --help             display this information\n");
 }
 
@@ -268,10 +279,15 @@ ParseOptions(int argc, char *argv[])
    int i;
 
    Options.LineNumbers = GL_FALSE;
-   Options.Mode = PROG_PRINT_ARB;
+   Options.Mode = PROG_PRINT_DEBUG;
    Options.VertFile = NULL;
    Options.FragFile = NULL;
    Options.OutputFile = NULL;
+   Options.Params = GL_FALSE;
+   Options.Pragmas.IgnoreOptimize = GL_FALSE;
+   Options.Pragmas.IgnoreDebug = GL_FALSE;
+   Options.Pragmas.Debug = GL_FALSE;
+   Options.Pragmas.Optimize = GL_TRUE;
 
    if (argc == 1) {
       Usage();
@@ -294,7 +310,20 @@ ParseOptions(int argc, char *argv[])
          Options.Mode = PROG_PRINT_NV;
       }
       else if (strcmp(argv[i], "--debug") == 0) {
-         Options.Mode = PROG_PRINT_DEBUG;
+         Options.Pragmas.IgnoreDebug = GL_TRUE;
+         Options.Pragmas.Debug = GL_TRUE;
+      }
+      else if (strcmp(argv[i], "--nodebug") == 0) {
+         Options.Pragmas.IgnoreDebug = GL_TRUE;
+         Options.Pragmas.Debug = GL_FALSE;
+      }
+      else if (strcmp(argv[i], "--opt") == 0) {
+         Options.Pragmas.IgnoreOptimize = GL_TRUE;
+         Options.Pragmas.Optimize = GL_TRUE;
+      }
+      else if (strcmp(argv[i], "--noopt") == 0) {
+         Options.Pragmas.IgnoreOptimize = GL_TRUE;
+         Options.Pragmas.Optimize = GL_FALSE;
       }
       else if (strcmp(argv[i], "--number") == 0 ||
                strcmp(argv[i], "-n") == 0) {
@@ -305,6 +334,9 @@ ParseOptions(int argc, char *argv[])
          Options.OutputFile = argv[i + 1];
          i++;
       }
+      else if (strcmp(argv[i], "--params") == 0) {
+         Options.Params = GL_TRUE;
+      }
       else if (strcmp(argv[i], "--help") == 0) {
          Usage();
          exit(0);
@@ -315,6 +347,11 @@ ParseOptions(int argc, char *argv[])
          exit(1);
       }
    }
+
+   if (Options.Mode == PROG_PRINT_DEBUG) {
+      /* always print line numbers when emitting debug-style output */
+      Options.LineNumbers = GL_TRUE;
+   }
 }
 
 
@@ -323,13 +360,13 @@ main(int argc, char *argv[])
 {
    GLuint shader = 0;
 
+   ParseOptions(argc, argv);
+
    if (!CreateContext()) {
       fprintf(stderr, "%s: Failed to create compiler context\n", Prog);
       exit(1);
    }
 
-   ParseOptions(argc, argv);
-
    if (Options.VertFile) {
       shader = CompileShader(Options.VertFile, GL_VERTEX_SHADER);
    }
diff --git a/src/mesa/drivers/osmesa/Makefile b/src/mesa/drivers/osmesa/Makefile
index 3b3984200a..92d4149466 100644
--- a/src/mesa/drivers/osmesa/Makefile
+++ b/src/mesa/drivers/osmesa/Makefile
@@ -19,11 +19,12 @@ INCLUDE_DIRS = \
 	-I$(TOP)/src/mesa \
 	-I$(TOP)/src/mesa/main
 
+# Standalone osmesa needs to be linked with core Mesa APIs
+ifeq ($(DRIVER_DIRS), osmesa)
 CORE_MESA = $(TOP)/src/mesa/libmesa.a $(TOP)/src/mesa/libglapi.a
-
-
-.PHONY: osmesa8
-.PHONY: osmesa16
+else
+CORE_MESA =
+endif
 
 
 .c.o:
@@ -31,31 +32,12 @@ CORE_MESA = $(TOP)/src/mesa/libmesa.a $(TOP)/src/mesa/libglapi.a
 
 
 default: $(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME)
-	@ if [ "${DRIVER_DIRS}" = "osmesa" ] ; then \
-		$(MAKE) osmesa16 ; \
-	else \
-		$(MAKE) osmesa8 ; \
-	fi
-
-
-
-
-# The normal libOSMesa is used in conjuction with libGL
-osmesa8: $(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME)
-
-$(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME): $(OBJECTS)
-	$(MKLIB) -o $(OSMESA_LIB) -linker '$(CC)' -ldflags '$(LDFLAGS)' \
-		-major $(MESA_MAJOR) -minor $(MESA_MINOR) -patch $(MESA_TINY) \
-		-install $(TOP)/$(LIB_DIR) $(MKLIB_OPTIONS) \
-		-id $(INSTALL_LIB_DIR)/lib$(OSMESA_LIB).$(MESA_MAJOR).dylib \
-		$(OSMESA_LIB_DEPS) $(OBJECTS)
-
-
 
 
-# The libOSMesa16/libOSMesa32 libraries do not use libGL but rather are built
-# with all the other Mesa sources (compiled with -DCHAN_BITS=16/32
-osmesa16: $(OBJECTS) $(CORE_MESA)
+# libOSMesa can be used in conjunction with libGL or with all other Mesa
+# sources. We can also build libOSMesa16/libOSMesa32 by setting
+# -DCHAN_BITS=16/32.
+$(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME): $(OBJECTS) $(CORE_MESA)
 	$(MKLIB) -o $(OSMESA_LIB) -linker '$(CC)' -ldflags '$(LDFLAGS)' \
 		-major $(MESA_MAJOR) -minor $(MESA_MINOR) -patch $(MESA_TINY) \
 		-install $(TOP)/$(LIB_DIR) $(MKLIB_OPTIONS) \
diff --git a/src/mesa/drivers/osmesa/osmesa.c b/src/mesa/drivers/osmesa/osmesa.c
index f2367bbbb7..904659e345 100644
--- a/src/mesa/drivers/osmesa/osmesa.c
+++ b/src/mesa/drivers/osmesa/osmesa.c
@@ -1310,7 +1310,7 @@ OSMesaDestroyContext( OSMesaContext osmesa )
       _swrast_DestroyContext( &osmesa->mesa );
 
       _mesa_destroy_visual( osmesa->gl_visual );
-      _mesa_unreference_framebuffer( &osmesa->gl_buffer );
+      _mesa_reference_framebuffer( &osmesa->gl_buffer, NULL );
 
       _mesa_free_context_data( &osmesa->mesa );
       _mesa_free( osmesa );
diff --git a/src/mesa/drivers/windows/gdi/mesa.def b/src/mesa/drivers/windows/gdi/mesa.def
index b386e34aad..bd3e5b2137 100644
--- a/src/mesa/drivers/windows/gdi/mesa.def
+++ b/src/mesa/drivers/windows/gdi/mesa.def
@@ -902,7 +902,6 @@ EXPORTS
 	_mesa_generate_mipmap
 	_mesa_get_compressed_teximage
 	_mesa_get_current_context
-	_mesa_get_program_register
 	_mesa_get_teximage
 	_mesa_init_driver_functions
 	_mesa_init_glsl_driver_functions
@@ -940,7 +939,7 @@ EXPORTS
 	_mesa_store_texsubimage3d
 	_mesa_strcmp
 	_mesa_test_proxy_teximage
-	_mesa_unreference_framebuffer
+	_mesa_reference_framebuffer
 	_mesa_update_framebuffer_visual
 	_mesa_use_program
 	_mesa_Viewport
diff --git a/src/mesa/drivers/windows/gdi/wmesa.c b/src/mesa/drivers/windows/gdi/wmesa.c
index 80746950c7..e1971db693 100644
--- a/src/mesa/drivers/windows/gdi/wmesa.c
+++ b/src/mesa/drivers/windows/gdi/wmesa.c
@@ -62,7 +62,7 @@ wmesa_free_framebuffer(HDC hdc)
 	else
 	    prev->next = pwfb->next;
         fb = &pwfb->Base;
-        _mesa_unreference_framebuffer(&fb); 
+        _mesa_reference_framebuffer(&fb, NULL); 
     }
 }
 
@@ -1679,80 +1679,3 @@ void WMesaShareLists(WMesaContext ctx_to_share, WMesaContext ctx)
 	_mesa_share_state(&ctx->gl_ctx, &ctx_to_share->gl_ctx);	
 }
 
-/* This is hopefully a temporary hack to define some needed dispatch
- * table entries.  Hopefully, I'll find a better solution.  The
- * dispatch table generation scripts ought to be making these dummy
- * stubs as well. */
-#if !defined(__MINGW32__) || !defined(GL_NO_STDCALL)
-void gl_dispatch_stub_543(void){}
-void gl_dispatch_stub_544(void){}
-void gl_dispatch_stub_545(void){}
-void gl_dispatch_stub_546(void){}
-void gl_dispatch_stub_547(void){}
-void gl_dispatch_stub_548(void){}
-void gl_dispatch_stub_549(void){}
-void gl_dispatch_stub_550(void){}
-void gl_dispatch_stub_551(void){}
-void gl_dispatch_stub_552(void){}
-void gl_dispatch_stub_553(void){}
-void gl_dispatch_stub_554(void){}
-void gl_dispatch_stub_555(void){}
-void gl_dispatch_stub_556(void){}
-void gl_dispatch_stub_557(void){}
-void gl_dispatch_stub_558(void){}
-void gl_dispatch_stub_559(void){}
-void gl_dispatch_stub_560(void){}
-void gl_dispatch_stub_561(void){}
-void gl_dispatch_stub_565(void){}
-void gl_dispatch_stub_566(void){}
-void gl_dispatch_stub_577(void){}
-void gl_dispatch_stub_578(void){}
-void gl_dispatch_stub_603(void){}
-void gl_dispatch_stub_645(void){}
-void gl_dispatch_stub_646(void){}
-void gl_dispatch_stub_647(void){}
-void gl_dispatch_stub_648(void){}
-void gl_dispatch_stub_649(void){}
-void gl_dispatch_stub_650(void){}
-void gl_dispatch_stub_651(void){}
-void gl_dispatch_stub_652(void){}
-void gl_dispatch_stub_653(void){}
-void gl_dispatch_stub_733(void){}
-void gl_dispatch_stub_734(void){}
-void gl_dispatch_stub_735(void){}
-void gl_dispatch_stub_736(void){}
-void gl_dispatch_stub_737(void){}
-void gl_dispatch_stub_738(void){}
-void gl_dispatch_stub_744(void){}
-void gl_dispatch_stub_745(void){}
-void gl_dispatch_stub_746(void){}
-void gl_dispatch_stub_760(void){}
-void gl_dispatch_stub_761(void){}
-void gl_dispatch_stub_763(void){}
-void gl_dispatch_stub_765(void){}
-void gl_dispatch_stub_766(void){}
-void gl_dispatch_stub_767(void){}
-void gl_dispatch_stub_768(void){}
-
-void gl_dispatch_stub_562(void){}
-void gl_dispatch_stub_563(void){}
-void gl_dispatch_stub_564(void){}
-void gl_dispatch_stub_567(void){}
-void gl_dispatch_stub_568(void){}
-void gl_dispatch_stub_569(void){}
-void gl_dispatch_stub_580(void){}
-void gl_dispatch_stub_581(void){}
-void gl_dispatch_stub_606(void){}
-void gl_dispatch_stub_654(void){}
-void gl_dispatch_stub_655(void){}
-void gl_dispatch_stub_656(void){}
-void gl_dispatch_stub_739(void){}
-void gl_dispatch_stub_740(void){}
-void gl_dispatch_stub_741(void){}
-void gl_dispatch_stub_748(void){}
-void gl_dispatch_stub_749(void){}
-void gl_dispatch_stub_769(void){}
-void gl_dispatch_stub_770(void){}
-void gl_dispatch_stub_771(void){}
-
-#endif
diff --git a/src/mesa/drivers/windows/gldirect/dglcontext.c b/src/mesa/drivers/windows/gldirect/dglcontext.c
index 4ad7a76e67..e9c23d1ccb 100644
--- a/src/mesa/drivers/windows/gldirect/dglcontext.c
+++ b/src/mesa/drivers/windows/gldirect/dglcontext.c
@@ -1482,6 +1482,7 @@ SkipPrimaryCreate:
 #else
 	lpCtx->glCtx->Const.MaxTextureSize = 1024;
 #endif
+	lpCtx->glCtx->Const.MaxDrawBuffers = 1;
 
 	// Setup the Display Driver pointers
 	dglSetupDDPointers(lpCtx->glCtx);
diff --git a/src/mesa/drivers/windows/gldirect/dx7/gld_wgl_dx7.c b/src/mesa/drivers/windows/gldirect/dx7/gld_wgl_dx7.c
index 0f8fe33eb1..fa44a952a0 100644
--- a/src/mesa/drivers/windows/gldirect/dx7/gld_wgl_dx7.c
+++ b/src/mesa/drivers/windows/gldirect/dx7/gld_wgl_dx7.c
@@ -1422,6 +1422,8 @@ BOOL gldInitialiseMesa_DX(
 		lpCtx->glCtx->Const.MaxTextureUnits = 1;
 	}
 
+	lpCtx->glCtx->Const.MaxDrawBuffers = 1;
+
 	// max texture size
 //	MaxTextureSize = min(gld->d3dCaps8.MaxTextureHeight, gld->d3dCaps8.MaxTextureWidth);
 	MaxTextureSize = min(gld->d3dCaps.dwMaxTextureHeight, gld->d3dCaps.dwMaxTextureWidth);
diff --git a/src/mesa/drivers/windows/gldirect/dx8/gld_wgl_dx8.c b/src/mesa/drivers/windows/gldirect/dx8/gld_wgl_dx8.c
index 690f68b68f..011d810e97 100644
--- a/src/mesa/drivers/windows/gldirect/dx8/gld_wgl_dx8.c
+++ b/src/mesa/drivers/windows/gldirect/dx8/gld_wgl_dx8.c
@@ -1204,6 +1204,7 @@ BOOL gldInitialiseMesa_DX(
 		MaxTextureSize >>= 1;
 	}
 	lpCtx->glCtx->Const.MaxTextureLevels = (TextureLevels) ? TextureLevels : 8;
+	lpCtx->glCtx->Const.MaxDrawBuffers = 1;
 
 	IDirect3DDevice8_SetRenderState(gld->pDev, D3DRS_LIGHTING, FALSE);
 	IDirect3DDevice8_SetRenderState(gld->pDev, D3DRS_CULLMODE, D3DCULL_NONE);
diff --git a/src/mesa/drivers/windows/gldirect/dx9/gld_wgl_dx9.c b/src/mesa/drivers/windows/gldirect/dx9/gld_wgl_dx9.c
index dc465c5418..a03b865bb4 100644
--- a/src/mesa/drivers/windows/gldirect/dx9/gld_wgl_dx9.c
+++ b/src/mesa/drivers/windows/gldirect/dx9/gld_wgl_dx9.c
@@ -1206,6 +1206,7 @@ BOOL gldInitialiseMesa_DX(
 		MaxTextureSize >>= 1;
 	}
 	lpCtx->glCtx->Const.MaxTextureLevels = (TextureLevels) ? TextureLevels : 8;
+	lpCtx->glCtx->Const.MaxDrawBuffers = 1;
 
 	IDirect3DDevice9_SetRenderState(gld->pDev, D3DRS_LIGHTING, FALSE);
 	IDirect3DDevice9_SetRenderState(gld->pDev, D3DRS_CULLMODE, D3DCULL_NONE);
diff --git a/src/mesa/drivers/windows/gldirect/mesasw/gld_wgl_mesasw.c b/src/mesa/drivers/windows/gldirect/mesasw/gld_wgl_mesasw.c
index b590dc795a..342a742867 100644
--- a/src/mesa/drivers/windows/gldirect/mesasw/gld_wgl_mesasw.c
+++ b/src/mesa/drivers/windows/gldirect/mesasw/gld_wgl_mesasw.c
@@ -1625,6 +1625,7 @@ BOOL gldInitialiseMesa_MesaSW(
 
 	// Added this to force max texture diminsion to 256. KeithH
 	ctx->Const.MaxTextureLevels = 8;
+	ctx->Const.MaxDrawBuffers = 1;
 
 	_mesa_enable_sw_extensions(ctx);
 	_mesa_enable_imaging_extensions(ctx);
diff --git a/src/mesa/drivers/x11/Makefile b/src/mesa/drivers/x11/Makefile
index d2780e62c9..5e427d2d5c 100644
--- a/src/mesa/drivers/x11/Makefile
+++ b/src/mesa/drivers/x11/Makefile
@@ -41,7 +41,8 @@ OBJECTS = $(SOURCES:.c=.o)
 INCLUDE_DIRS = \
 	-I$(TOP)/include \
 	-I$(TOP)/src/mesa \
-	-I$(TOP)/src/mesa/main
+	-I$(TOP)/src/mesa/main \
+	$(X11_INCLUDES)
 
 CORE_MESA = $(TOP)/src/mesa/libmesa.a $(TOP)/src/mesa/libglapi.a
 
diff --git a/src/mesa/drivers/x11/fakeglx.c b/src/mesa/drivers/x11/fakeglx.c
index ea3585258d..34e0b8bc8d 100644
--- a/src/mesa/drivers/x11/fakeglx.c
+++ b/src/mesa/drivers/x11/fakeglx.c
@@ -1,8 +1,9 @@
 /*
  * Mesa 3-D graphics library
- * Version:  7.1
+ * Version:  7.5
  *
- * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
+ * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
+ * Copyright (C) 2009  VMware, Inc.   All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -1392,6 +1393,25 @@ Fake_glXChooseVisual( Display *dpy, int screen, int *list )
 }
 
 
+/**
+ * Init basic fields of a new fake_glx_context.
+ */
+static void
+init_glx_context(struct fake_glx_context *glxCtx, Display *dpy)
+{
+   /* Always return True.  See if anyone's confused... */
+   GLboolean direct = GL_TRUE;
+
+   glxCtx->xmesaContext->direct = direct;
+   glxCtx->glxContext.isDirect = direct;
+   glxCtx->glxContext.currentDpy = dpy;
+   glxCtx->glxContext.xid = (XID) glxCtx;  /* self pointer */
+
+   assert((void *) glxCtx == (void *) &(glxCtx->glxContext));
+}
+
+
+
 static GLXContext
 Fake_glXCreateContext( Display *dpy, XVisualInfo *visinfo,
                        GLXContext share_list, Bool direct )
@@ -1430,12 +1450,7 @@ Fake_glXCreateContext( Display *dpy, XVisualInfo *visinfo,
       return NULL;
    }
 
-   glxCtx->xmesaContext->direct = GL_FALSE;
-   glxCtx->glxContext.isDirect = GL_FALSE;
-   glxCtx->glxContext.currentDpy = dpy;
-   glxCtx->glxContext.xid = (XID) glxCtx;  /* self pointer */
-
-   assert((void *) glxCtx == (void *) &(glxCtx->glxContext));
+   init_glx_context(glxCtx, dpy);
 
    return (GLXContext) glxCtx;
 }
@@ -2203,7 +2218,7 @@ Fake_glXCreatePixmap( Display *dpy, GLXFBConfig config, Pixmap pixmap,
    if (!dpy || !config || !pixmap)
       return 0;
 
-   for (attr = attribList; *attr; attr++) {
+   for (attr = attribList; attr && *attr; attr++) {
       switch (*attr) {
       case GLX_TEXTURE_FORMAT_EXT:
          attr++;
@@ -2441,12 +2456,7 @@ Fake_glXCreateNewContext( Display *dpy, GLXFBConfig config,
       return NULL;
    }
 
-   glxCtx->xmesaContext->direct = GL_FALSE;
-   glxCtx->glxContext.isDirect = GL_FALSE;
-   glxCtx->glxContext.currentDpy = dpy;
-   glxCtx->glxContext.xid = (XID) glxCtx;  /* self pointer */
-
-   assert((void *) glxCtx == (void *) &(glxCtx->glxContext));
+   init_glx_context(glxCtx, dpy);
 
    return (GLXContext) glxCtx;
 }
@@ -2664,12 +2674,7 @@ Fake_glXCreateContextWithConfigSGIX(Display *dpy, GLXFBConfigSGIX config, int re
       return NULL;
    }
 
-   glxCtx->xmesaContext->direct = GL_FALSE;
-   glxCtx->glxContext.isDirect = GL_FALSE;
-   glxCtx->glxContext.currentDpy = dpy;
-   glxCtx->glxContext.xid = (XID) glxCtx;  /* self pointer */
-
-   assert((void *) glxCtx == (void *) &(glxCtx->glxContext));
+   init_glx_context(glxCtx, dpy);
 
    return (GLXContext) glxCtx;
 }
diff --git a/src/mesa/drivers/x11/glxapi.c b/src/mesa/drivers/x11/glxapi.c
index c2ccce6f52..02eea25a71 100644
--- a/src/mesa/drivers/x11/glxapi.c
+++ b/src/mesa/drivers/x11/glxapi.c
@@ -35,6 +35,7 @@
 #include <stdio.h>
 #include <string.h>
 #include "main/glheader.h"
+#include "main/compiler.h"
 #include "glapi/glapi.h"
 #include "glxapi.h"
 
@@ -1374,7 +1375,12 @@ _glxapi_get_proc_address(const char *funcName)
 {
    GLuint i;
    for (i = 0; GLX_functions[i].Name; i++) {
+#ifdef MANGLE
+      /* skip the "m" prefix on the name */
+      if (strcmp(GLX_functions[i].Name, funcName+1) == 0)
+#else
       if (strcmp(GLX_functions[i].Name, funcName) == 0)
+#endif
          return GLX_functions[i].Address;
    }
    return NULL;
diff --git a/src/mesa/drivers/x11/xm_api.c b/src/mesa/drivers/x11/xm_api.c
index 18aa8bcc09..662c61ae7e 100644
--- a/src/mesa/drivers/x11/xm_api.c
+++ b/src/mesa/drivers/x11/xm_api.c
@@ -79,6 +79,7 @@
 #include "tnl/t_context.h"
 #include "tnl/t_pipeline.h"
 #include "drivers/common/driverfuncs.h"
+#include "drivers/common/meta.h"
 
 /**
  * Global X driver lock
@@ -492,7 +493,7 @@ xmesa_free_buffer(XMesaBuffer buffer)
          b->frontxrb->drawable = 0;
 
          /* Unreference.  If count = zero we'll really delete the buffer */
-         _mesa_unreference_framebuffer(&fb);
+         _mesa_reference_framebuffer(&fb, NULL);
 
          return;
       }
@@ -1316,7 +1317,9 @@ xmesa_convert_from_x_visual_type( int visualType )
 #define need_GL_SGI_color_table
 
 /* sw extensions not associated with some GL version */
+#define need_GL_ARB_draw_elements_base_vertex
 #define need_GL_ARB_shader_objects
+#define need_GL_ARB_sync
 #define need_GL_ARB_vertex_program
 #define need_GL_APPLE_vertex_array_object
 #define need_GL_ATI_fragment_shader
@@ -1325,7 +1328,6 @@ xmesa_convert_from_x_visual_type( int visualType )
 #define need_GL_EXT_framebuffer_blit
 #define need_GL_EXT_gpu_program_parameters
 #define need_GL_EXT_paletted_texture
-#define need_GL_IBM_multimode_draw_arrays
 #define need_GL_MESA_resize_buffers
 #define need_GL_NV_vertex_program
 #define need_GL_NV_fragment_program
@@ -1346,7 +1348,10 @@ const struct dri_extension card_extensions[] =
    { "GL_EXT_histogram",		GL_EXT_histogram_functions },
    { "GL_SGI_color_table",		GL_SGI_color_table_functions },
 
+   { "GL_ARB_depth_clamp",		NULL },
+   { "GL_ARB_draw_elements_base_vertex", GL_ARB_draw_elements_base_vertex_functions },
    { "GL_ARB_shader_objects",		GL_ARB_shader_objects_functions },
+   { "GL_ARB_sync",			GL_ARB_sync_functions },
    { "GL_ARB_vertex_program",		GL_ARB_vertex_program_functions },
    { "GL_APPLE_vertex_array_object",	GL_APPLE_vertex_array_object_functions },
    { "GL_ATI_fragment_shader",		GL_ATI_fragment_shader_functions },
@@ -1355,8 +1360,8 @@ const struct dri_extension card_extensions[] =
    { "GL_EXT_framebuffer_blit",		GL_EXT_framebuffer_blit_functions },
    { "GL_EXT_gpu_program_parameters",	GL_EXT_gpu_program_parameters_functions },
    { "GL_EXT_paletted_texture",		GL_EXT_paletted_texture_functions },
-   { "GL_IBM_multimode_draw_arrays",	GL_IBM_multimode_draw_arrays_functions },
    { "GL_MESA_resize_buffers",		GL_MESA_resize_buffers_functions },
+   { "GL_NV_depth_clamp",		NULL },
    { "GL_NV_vertex_program",		GL_NV_vertex_program_functions },
    { "GL_NV_fragment_program",		GL_NV_fragment_program_functions },
    { NULL,				NULL }
@@ -1586,6 +1591,14 @@ XMesaContext XMesaCreateContext( XMesaVisual v, XMesaContext share_list )
       return NULL;
    }
 
+   /* Enable this to exercise fixed function -> shader translation
+    * with software rendering.
+    */
+   if (0) {
+      mesaCtx->VertexProgram._MaintainTnlProgram = GL_TRUE;
+      mesaCtx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
+   }
+
    _mesa_enable_sw_extensions(mesaCtx);
    _mesa_enable_1_3_extensions(mesaCtx);
    _mesa_enable_1_4_extensions(mesaCtx);
@@ -1635,6 +1648,9 @@ XMesaContext XMesaCreateContext( XMesaVisual v, XMesaContext share_list )
    xmesa_register_swrast_functions( mesaCtx );
    _swsetup_Wakeup(mesaCtx);
 
+   if (TEST_META_FUNCS)
+      _mesa_meta_init(mesaCtx);
+
    return c;
 }
 
@@ -1649,6 +1665,9 @@ void XMesaDestroyContext( XMesaContext c )
    FXdestroyContext( XMESA_BUFFER(mesaCtx->DrawBuffer) );
 #endif
 
+   if (TEST_META_FUNCS)
+      _mesa_meta_free( mesaCtx );
+
    _swsetup_DestroyContext( mesaCtx );
    _swrast_DestroyContext( mesaCtx );
    _tnl_DestroyContext( mesaCtx );
@@ -2412,11 +2431,8 @@ xbuffer_to_renderbuffer(int buffer)
    case GLX_AUX0_EXT:
       return BUFFER_AUX0;
    case GLX_AUX1_EXT:
-      return BUFFER_AUX1;
    case GLX_AUX2_EXT:
-      return BUFFER_AUX2;
    case GLX_AUX3_EXT:
-      return BUFFER_AUX3;
    case GLX_AUX4_EXT:
    case GLX_AUX5_EXT:
    case GLX_AUX6_EXT:
@@ -2463,13 +2479,13 @@ XMesaBindTexImage(XMesaDisplay *dpy, XMesaBuffer drawable, int buffer,
 #if 0
    switch (drawable->TextureTarget) {
    case GLX_TEXTURE_1D_EXT:
-      texObj = texUnit->Current1D;
+      texObj = texUnit->CurrentTex[TEXTURE_1D_INDEX];
       break;
    case GLX_TEXTURE_2D_EXT:
-      texObj = texUnit->Current2D;
+      texObj = texUnit->CurrentTex[TEXTURE_2D_INDEX];
       break;
    case GLX_TEXTURE_RECTANGLE_EXT:
-      texObj = texUnit->CurrentRect;
+      texObj = texUnit->CurrentTex[TEXTURE_RECT_INDEX];
       break;
    default:
       return; /* BadMatch error */
diff --git a/src/mesa/drivers/x11/xm_buffer.c b/src/mesa/drivers/x11/xm_buffer.c
index 7ad67bc34d..821e2a8e08 100644
--- a/src/mesa/drivers/x11/xm_buffer.c
+++ b/src/mesa/drivers/x11/xm_buffer.c
@@ -229,6 +229,7 @@ alloc_back_buffer(XMesaBuffer b, GLuint width, GLuint height)
       }
 
       b->backxrb->ximage = NULL;
+      b->backxrb->drawable = b->backxrb->pixmap;
    }
 }
 
diff --git a/src/mesa/drivers/x11/xm_dd.c b/src/mesa/drivers/x11/xm_dd.c
index 305df548fa..4e9c001cc7 100644
--- a/src/mesa/drivers/x11/xm_dd.c
+++ b/src/mesa/drivers/x11/xm_dd.c
@@ -51,6 +51,7 @@
 #include "swrast_setup/swrast_setup.h"
 #include "tnl/tnl.h"
 #include "tnl/t_context.h"
+#include "drivers/common/meta.h"
 #include "xmesaP.h"
 
 
@@ -912,8 +913,9 @@ xmesa_update_state( GLcontext *ctx, GLbitfield new_state )
    /*
     * GL_DITHER, GL_READ/DRAW_BUFFER, buffer binding state, etc. effect
     * renderbuffer span/clear funcs.
+    * Check _NEW_COLOR to detect dither enable/disable.
     */
-   if (new_state & (_NEW_COLOR | _NEW_PIXEL | _NEW_BUFFERS)) {
+   if (new_state & (_NEW_COLOR | _NEW_BUFFERS)) {
       XMesaBuffer xmbuf = XMESA_BUFFER(ctx->DrawBuffer);
       struct xmesa_renderbuffer *front_xrb, *back_xrb;
 
@@ -1146,19 +1148,28 @@ xmesa_init_driver_functions( XMesaVisual xmvisual,
    driver->IndexMask = index_mask;
    driver->ColorMask = color_mask;
    driver->Enable = enable;
-   driver->Clear = clear_buffers;
    driver->Viewport = xmesa_viewport;
-#ifndef XFree86Server
-   driver->CopyPixels = xmesa_CopyPixels;
-   if (xmvisual->undithered_pf == PF_8R8G8B &&
-       xmvisual->dithered_pf == PF_8R8G8B &&
-       xmvisual->BitsPerPixel == 32) {
-      driver->DrawPixels = xmesa_DrawPixels_8R8G8B;
-   }
-   else if (xmvisual->undithered_pf == PF_5R6G5B) {
-      driver->DrawPixels = xmesa_DrawPixels_5R6G5B;
+   if (TEST_META_FUNCS) {
+      driver->Clear = _mesa_meta_clear;
+      driver->CopyPixels = _mesa_meta_copy_pixels;
+      driver->BlitFramebuffer = _mesa_meta_blit_framebuffer;
+      driver->DrawPixels = _mesa_meta_draw_pixels;
+      driver->Bitmap = _mesa_meta_bitmap;
    }
+   else {
+      driver->Clear = clear_buffers;
+#ifndef XFree86Server
+      driver->CopyPixels = xmesa_CopyPixels;
+      if (xmvisual->undithered_pf == PF_8R8G8B &&
+          xmvisual->dithered_pf == PF_8R8G8B &&
+          xmvisual->BitsPerPixel == 32) {
+         driver->DrawPixels = xmesa_DrawPixels_8R8G8B;
+      }
+      else if (xmvisual->undithered_pf == PF_5R6G5B) {
+         driver->DrawPixels = xmesa_DrawPixels_5R6G5B;
+      }
 #endif
+   }
    driver->TestProxyTexImage = test_proxy_teximage;
 #if ENABLE_EXT_texure_compression_s3tc
    driver->ChooseTextureFormat = choose_tex_format;
diff --git a/src/mesa/drivers/x11/xm_span.c b/src/mesa/drivers/x11/xm_span.c
index 57b5749448..309cefcb8e 100644
--- a/src/mesa/drivers/x11/xm_span.c
+++ b/src/mesa/drivers/x11/xm_span.c
@@ -471,8 +471,26 @@ static void put_row_8R8G8B_pixmap( PUT_ROW_ARGS )
    if (mask) {
       for (i=0;i<n;i++,x++) {
          if (mask[i]) {
+#if 1
+            /*
+             * XXX Something funny is going on here.
+             * If we're drawing into a window that uses a depth 32 TrueColor
+             * visual, we see the right pixels on screen, but when we read
+             * them back with XGetImage() we get random colors.
+             * The alternative code below which uses XPutImage() instead
+             * seems to mostly fix the problem, but not always.
+             * We don't normally create windows with this visual, but glean
+             * does and we're seeing some failures there.
+             */
             XMesaSetForeground( dpy, gc, PACK_8R8G8B( rgba[i][RCOMP], rgba[i][GCOMP], rgba[i][BCOMP] ));
             XMesaDrawPoint( dpy, buffer, gc, (int) x, (int) y );
+#else
+            /* This code works more often, but not always */
+            XMesaImage *rowimg = XMESA_BUFFER(ctx->DrawBuffer)->rowimage;
+            GLuint *ptr4 = (GLuint *) rowimg->data;
+            *ptr4 = PACK_8R8G8B( rgba[i][RCOMP], rgba[i][GCOMP], rgba[i][BCOMP] );
+            XMesaPutImage( dpy, buffer, gc, rowimg, 0, 0, x, y, 1, 1 );
+#endif
          }
       }
    }
diff --git a/src/mesa/drivers/x11/xmesaP.h b/src/mesa/drivers/x11/xmesaP.h
index 65e747d7b9..25db55862e 100644
--- a/src/mesa/drivers/x11/xmesaP.h
+++ b/src/mesa/drivers/x11/xmesaP.h
@@ -581,4 +581,8 @@ extern void xmesa_register_swrast_functions( GLcontext *ctx );
 #define ENABLE_EXT_timer_query 0 /* may not have 64-bit GLuint64EXT */
 #endif
 
+
+#define TEST_META_FUNCS 0
+
+
 #endif
author	Chia-I Wu <olvaffe@gmail.com>	2009-09-15 14:16:22 +0800
committer	Chia-I Wu <olvaffe@gmail.com>	2009-09-15 14:16:22 +0800
commit	e2ba90a9cc762cf00a168f0a59d31e7dc52fc42e (patch)
tree	fe3206d7602ad935296884742980f3c4d30bd867 /src/mesa/drivers
parent	11a4292d4eb515813b82b8d688a318adef66b3e6 (diff)
parent	b4b8800315637d9218a81c76f09df7d601710d29 (diff)