44 files changed, 3909 insertions, 8427 deletions
diff --git a/src/mesa/drivers/dri/r300/Makefile b/src/mesa/drivers/dri/r300/Makefile
index 6ca934204f..bdb09624be 100644
--- a/src/mesa/drivers/dri/r300/Makefile
+++ b/src/mesa/drivers/dri/r300/Makefile
@@ -3,6 +3,8 @@
 TOP = ../../../../..
 include $(TOP)/configs/current
 
+CFLAGS += $(RADEON_CFLAGS)
+
 LIBNAME = r300_dri.so
 
 MINIGLX_SOURCES = server/radeon_dri.c
@@ -20,20 +22,26 @@ COMMON_SOURCES = \
 	../common/xmlconfig.c \
 	../common/dri_util.c
 
+RADEON_COMMON_SOURCES = \
+	radeon_texture.c \
+	radeon_common_context.c \
+	radeon_common.c \
+	radeon_dma.c \
+	radeon_lock.c \
+	radeon_bo_legacy.c \
+	radeon_cs_legacy.c \
+	radeon_mipmap_tree.c \
+	radeon_span.c \
+	radeon_fbo.c
+
 DRIVER_SOURCES = \
 		 radeon_screen.c \
-		 radeon_context.c \
-		 radeon_ioctl.c \
-		 radeon_lock.c \
-		 radeon_span.c \
-		 radeon_state.c \
-		 r300_mem.c \
 		 r300_context.c \
+		 r300_draw.c \
 		 r300_ioctl.c \
 		 r300_cmdbuf.c \
 		 r300_state.c \
 		 r300_render.c \
-		 r300_texmem.c \
 		 r300_tex.c \
 		 r300_texstate.c \
 		 radeon_program.c \
@@ -41,6 +49,7 @@ DRIVER_SOURCES = \
 		 radeon_program_pair.c \
 		 radeon_nqssadce.c \
 		 r300_vertprog.c \
+		 r300_fragprog_common.c \
 		 r300_fragprog.c \
 		 r300_fragprog_swizzle.c \
 		 r300_fragprog_emit.c \
@@ -49,12 +58,15 @@ DRIVER_SOURCES = \
 		 r300_shader.c \
 		 r300_emit.c \
 		 r300_swtcl.c \
+		 $(RADEON_COMMON_SOURCES) \
 		 $(EGL_SOURCES)
 
 C_SOURCES = $(COMMON_SOURCES) $(DRIVER_SOURCES)
 
 DRIVER_DEFINES = -DCOMPILE_R300 -DR200_MERGED=0 \
-	-DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R300
+	-DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R300 \
+	-Wall
+# -DRADEON_BO_TRACK is left disabled; it is noted after the assignment because a continued '#' line would comment out -Wall
 
 SYMLINKS = \
 	server/radeon_dri.c \
@@ -68,7 +80,29 @@ COMMON_SYMLINKS = \
 	radeon_chipset.h \
 	radeon_screen.c \
 	radeon_screen.h \
-	radeon_span.h
+	radeon_span.h \
+	radeon_span.c \
+	radeon_bo_legacy.c \
+	radeon_cs_legacy.c \
+	radeon_bo_legacy.h \
+	radeon_cs_legacy.h \
+	radeon_bocs_wrapper.h \
+	radeon_lock.c \
+	radeon_lock.h \
+	radeon_common.c \
+	radeon_common.h \
+	radeon_common_context.c \
+	radeon_common_context.h \
+	radeon_cmdbuf.h \
+	radeon_dma.c \
+	radeon_dma.h \
+	radeon_mipmap_tree.c \
+	radeon_mipmap_tree.h \
+	radeon_texture.c \
+	radeon_texture.h \
+	radeon_fbo.c
+
+DRI_LIB_DEPS += $(RADEON_LDFLAGS)
 
 ##### TARGETS #####
 
diff --git a/src/mesa/drivers/dri/r300/r300_cmdbuf.c b/src/mesa/drivers/dri/r300/r300_cmdbuf.c
index f4472756f1..b5c6bd1835 100644
--- a/src/mesa/drivers/dri/r300/r300_cmdbuf.c
+++ b/src/mesa/drivers/dri/r300/r300_cmdbuf.c
@@ -44,245 +44,417 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "drm.h"
 #include "radeon_drm.h"
 
-#include "radeon_ioctl.h"
 #include "r300_context.h"
 #include "r300_ioctl.h"
 #include "radeon_reg.h"
 #include "r300_reg.h"
 #include "r300_cmdbuf.h"
 #include "r300_emit.h"
+#include "radeon_bocs_wrapper.h"
+#include "radeon_mipmap_tree.h"
 #include "r300_state.h"
+#include "radeon_reg.h"
 
-// Set this to 1 for extremely verbose debugging of command buffers
-#define DEBUG_CMDBUF		0
-
-/**
- * Send the current command buffer via ioctl to the hardware.
+/** # of dwords reserved for additional instructions that may need to be written
+ * during flushing.
  */
-int r300FlushCmdBufLocked(r300ContextPtr r300, const char *caller)
+#define SPACE_FOR_FLUSHING	4
+
+static unsigned packet0_count(r300ContextPtr r300, uint32_t *pkt)
 {
-	int ret;
-	int i;
-	drm_radeon_cmd_buffer_t cmd;
-	int start;
-
-	if (r300->radeon.lost_context) {
-		start = 0;
-		r300->radeon.lost_context = GL_FALSE;
-	} else
-		start = r300->cmdbuf.count_reemit;
-
-	if (RADEON_DEBUG & DEBUG_IOCTL) {
-		fprintf(stderr, "%s from %s - %i cliprects\n",
-			__FUNCTION__, caller, r300->radeon.numClipRects);
-
-		if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_VERBOSE)
-			for (i = start; i < r300->cmdbuf.count_used; ++i)
-				fprintf(stderr, "%d: %08x\n", i,
-					r300->cmdbuf.cmd_buf[i]);
-	}
+    if (r300->radeon.radeonScreen->kernel_mm) {
+        return ((((*pkt) >> 16) & 0x3FFF) + 1);
+    } else {
+        drm_r300_cmd_header_t *t = (drm_r300_cmd_header_t*)pkt;
+        return t->packet0.count;
+    }
+}
 
-	cmd.buf = (char *)(r300->cmdbuf.cmd_buf + start);
-	cmd.bufsz = (r300->cmdbuf.count_used - start) * 4;
+#define vpu_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->vpu.count)
+#define r500fp_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->r500fp.count)
 
-	if (r300->radeon.state.scissor.enabled) {
-		cmd.nbox = r300->radeon.state.scissor.numClipRects;
-		cmd.boxes =
-		    (drm_clip_rect_t *) r300->radeon.state.scissor.pClipRects;
-	} else {
-		cmd.nbox = r300->radeon.numClipRects;
-		cmd.boxes = (drm_clip_rect_t *) r300->radeon.pClipRects;
+void emit_vpu(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	drm_r300_cmd_header_t cmd;
+	uint32_t addr, ndw, i;
+
+	if (!r300->radeon.radeonScreen->kernel_mm) {
+		uint32_t dwords;
+		dwords = (*atom->check) (ctx, atom);
+		BEGIN_BATCH_NO_AUTOSTATE(dwords);
+		OUT_BATCH_TABLE(atom->cmd, dwords);
+		END_BATCH();
+		return;
 	}
 
-	ret = drmCommandWrite(r300->radeon.dri.fd,
-			      DRM_RADEON_CMDBUF, &cmd, sizeof(cmd));
-
-	if (RADEON_DEBUG & DEBUG_SYNC) {
-		fprintf(stderr, "Syncing in %s (from %s)\n\n",
-			__FUNCTION__, caller);
-		radeonWaitForIdleLocked(&r300->radeon);
+	cmd.u = atom->cmd[0];
+	addr = (cmd.vpu.adrhi << 8) | cmd.vpu.adrlo;
+	ndw = cmd.vpu.count * 4;
+	if (ndw) {
+
+		if (r300->vap_flush_needed) {
+			BEGIN_BATCH_NO_AUTOSTATE(15 + ndw);
+
+			/* flush processing vertices */
+			OUT_BATCH_REGVAL(R300_SC_SCREENDOOR, 0);
+			OUT_BATCH_REGVAL(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
+			OUT_BATCH_REGVAL(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
+			OUT_BATCH_REGVAL(R300_SC_SCREENDOOR, 0xffffff);
+			OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
+			r300->vap_flush_needed = GL_FALSE;
+		} else {
+			BEGIN_BATCH_NO_AUTOSTATE(5 + ndw);
+		}
+		OUT_BATCH_REGVAL(R300_VAP_PVS_VECTOR_INDX_REG, addr);
+		OUT_BATCH(CP_PACKET0(R300_VAP_PVS_UPLOAD_DATA, ndw-1) | RADEON_ONE_REG_WR);
+		for (i = 0; i < ndw; i++) {
+			OUT_BATCH(atom->cmd[i+1]);
+		}
+		OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
+		END_BATCH();
 	}
-
-	r300->dma.nr_released_bufs = 0;
-	r300->cmdbuf.count_used = 0;
-	r300->cmdbuf.count_reemit = 0;
-
-	return ret;
 }
 
-int r300FlushCmdBuf(r300ContextPtr r300, const char *caller)
+void emit_r500fp(GLcontext *ctx, struct radeon_state_atom * atom)
 {
-	int ret;
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	drm_r300_cmd_header_t cmd;
+	uint32_t addr, ndw, i, sz;
+	int type, clamp, stride;
+
+	if (!r300->radeon.radeonScreen->kernel_mm) {
+		uint32_t dwords;
+		dwords = (*atom->check) (ctx, atom);
+		BEGIN_BATCH_NO_AUTOSTATE(dwords);
+		OUT_BATCH_TABLE(atom->cmd, dwords);
+		END_BATCH();
+		return;
+	}
 
-	LOCK_HARDWARE(&r300->radeon);
+	cmd.u = atom->cmd[0];
+	sz = cmd.r500fp.count;
+	addr = ((cmd.r500fp.adrhi_flags & 1) << 8) | cmd.r500fp.adrlo;
+	type = !!(cmd.r500fp.adrhi_flags & R500FP_CONSTANT_TYPE);
+	clamp = !!(cmd.r500fp.adrhi_flags & R500FP_CONSTANT_CLAMP);
 
-	ret = r300FlushCmdBufLocked(r300, caller);
+	addr |= (type << 16);
+	addr |= (clamp << 17);
 
-	UNLOCK_HARDWARE(&r300->radeon);
+	stride = type ? 4 : 6;
 
-	if (ret) {
-		fprintf(stderr, "drmRadeonCmdBuffer: %d\n", ret);
-		_mesa_exit(ret);
-	}
+	ndw = sz * stride;
+	if (ndw) {
 
-	return ret;
+		BEGIN_BATCH_NO_AUTOSTATE(3 + ndw);
+		OUT_BATCH(CP_PACKET0(R500_GA_US_VECTOR_INDEX, 0));
+		OUT_BATCH(addr);
+		OUT_BATCH(CP_PACKET0(R500_GA_US_VECTOR_DATA, ndw-1) | RADEON_ONE_REG_WR);
+		for (i = 0; i < ndw; i++) {
+			OUT_BATCH(atom->cmd[i+1]);
+		}
+		END_BATCH();
+	}
 }
 
-static void r300PrintStateAtom(r300ContextPtr r300, struct r300_state_atom *state)
+static void emit_tex_offsets(GLcontext *ctx, struct radeon_state_atom * atom)
 {
-	int i, j, reg;
-	int dwords = (*state->check) (r300, state);
-	drm_r300_cmd_header_t cmd;
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	int numtmus = packet0_count(r300, r300->hw.tex.offset.cmd);
+	int notexture = 0;
 
-	fprintf(stderr, "  emit %s %d/%d\n", state->name, dwords,
-		state->cmd_size);
-
-	if (RADEON_DEBUG & DEBUG_VERBOSE) {
-		for (i = 0; i < dwords;) {
-			cmd = *((drm_r300_cmd_header_t *) &state->cmd[i]);
-			reg = (cmd.packet0.reghi << 8) | cmd.packet0.reglo;
-			fprintf(stderr, "      %s[%d]: cmdpacket0 (first reg=0x%04x, count=%d)\n",
-					state->name, i, reg, cmd.packet0.count);
-			++i;
-			for (j = 0; j < cmd.packet0.count; j++) {
-				fprintf(stderr, "      %s[%d]: 0x%04x = %08x\n",
-					state->name, i, reg, state->cmd[i]);
-				reg += 4;
-				++i;
-			}
+	if (numtmus) {
+		int i;
+
+		for(i = 0; i < numtmus; ++i) {
+		    radeonTexObj *t = r300->hw.textures[i];
+
+		    if (!t)
+			notexture = 1;
+		}
+
+		if (r300->radeon.radeonScreen->kernel_mm && notexture) {
+			return;
+		}
+		for(i = 0; i < numtmus; ++i) {
+		    radeonTexObj *t = r300->hw.textures[i];
+		    if (t && !t->image_override) {
+                BEGIN_BATCH_NO_AUTOSTATE(4);
+                OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
+			    OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+					    RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+                END_BATCH();
+		    } else if (!t) {
+                /* Texture unit hasn't a texture bound nothings to do */
+		    } else { /* override cases */
+			    if (t->bo) {
+                    BEGIN_BATCH_NO_AUTOSTATE(4);
+                    OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
+				    OUT_BATCH_RELOC(t->tile_bits, t->bo, 0,
+						    RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+                    END_BATCH();
+			    } else if (!r300->radeon.radeonScreen->kernel_mm) {
+                    BEGIN_BATCH_NO_AUTOSTATE(2);
+                    OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
+				    OUT_BATCH(t->override_offset);
+                    END_BATCH();
+			    } else {
+                    /* Texture unit hasn't a texture bound nothings to do */
+                }
+		    }
 		}
 	}
 }
 
-/**
- * Emit all atoms with a dirty field equal to dirty.
- *
- * The caller must have ensured that there is enough space in the command
- * buffer.
- */
-static INLINE void r300EmitAtoms(r300ContextPtr r300, GLboolean dirty)
+void r300_emit_scissor(GLcontext *ctx)
 {
-	struct r300_state_atom *atom;
-	uint32_t *dest;
-	int dwords;
-
-	dest = r300->cmdbuf.cmd_buf + r300->cmdbuf.count_used;
-
-	/* Emit WAIT */
-	*dest = cmdwait(R300_WAIT_3D | R300_WAIT_3D_CLEAN);
-	dest++;
-	r300->cmdbuf.count_used++;
-
-	/* Emit cache flush */
-	*dest = cmdpacket0(R300_TX_INVALTAGS, 1);
-	dest++;
-	r300->cmdbuf.count_used++;
-
-	*dest = R300_TX_FLUSH;
-	dest++;
-	r300->cmdbuf.count_used++;
-
-	/* Emit END3D */
-	*dest = cmdpacify();
-	dest++;
-	r300->cmdbuf.count_used++;
-
-	/* Emit actual atoms */
-
-	foreach(atom, &r300->hw.atomlist) {
-		if ((atom->dirty || r300->hw.all_dirty) == dirty) {
-			dwords = (*atom->check) (r300, atom);
-			if (dwords) {
-				if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
-					r300PrintStateAtom(r300, atom);
-				}
-				memcpy(dest, atom->cmd, dwords * 4);
-				dest += dwords;
-				r300->cmdbuf.count_used += dwords;
-				atom->dirty = GL_FALSE;
-			} else {
-				if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
-					fprintf(stderr, "  skip state %s\n",
-						atom->name);
-				}
-			}
-		}
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+    unsigned x1, y1, x2, y2;
+	struct radeon_renderbuffer *rrb;
+
+    if (!r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+        return;
+    }
+	rrb = radeon_get_colorbuffer(&r300->radeon);
+	if (!rrb || !rrb->bo) {
+		fprintf(stderr, "no rrb\n");
+		return;
 	}
+    if (r300->radeon.state.scissor.enabled) {
+        x1 = r300->radeon.state.scissor.rect.x1;
+        y1 = r300->radeon.state.scissor.rect.y1;
+        x2 = r300->radeon.state.scissor.rect.x2 - 1;
+        y2 = r300->radeon.state.scissor.rect.y2 - 1;
+    } else {
+        x1 = 0;
+        y1 = 0;
+        x2 = rrb->width - 1;
+        y2 = rrb->height - 1;
+    }
+    if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
+        x1 += R300_SCISSORS_OFFSET;
+        y1 += R300_SCISSORS_OFFSET;
+        x2 += R300_SCISSORS_OFFSET;
+        y2 += R300_SCISSORS_OFFSET;
+    }
+    BEGIN_BATCH_NO_AUTOSTATE(3);
+    OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
+    OUT_BATCH((x1 << R300_SCISSORS_X_SHIFT)|(y1 << R300_SCISSORS_Y_SHIFT));
+    OUT_BATCH((x2 << R300_SCISSORS_X_SHIFT)|(y2 << R300_SCISSORS_Y_SHIFT));
+    END_BATCH();
 }
 
-/**
- * Copy dirty hardware state atoms into the command buffer.
- *
- * We also copy out clean state if we're at the start of a buffer. That makes
- * it easy to recover from lost contexts.
- */
-void r300EmitState(r300ContextPtr r300)
+static void emit_cb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
 {
-	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_PRIMS))
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	if (r300->cmdbuf.count_used && !r300->hw.is_dirty
-	    && !r300->hw.all_dirty)
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	struct radeon_renderbuffer *rrb;
+	uint32_t cbpitch;
+	uint32_t offset = r300->radeon.state.color.draw_offset;
+	uint32_t dw = 6;
+    int i;
+
+	rrb = radeon_get_colorbuffer(&r300->radeon);
+	if (!rrb || !rrb->bo) {
+		fprintf(stderr, "no rrb\n");
 		return;
+	}
+
+	cbpitch = (rrb->pitch / rrb->cpp);
+	if (rrb->cpp == 4)
+		cbpitch |= R300_COLOR_FORMAT_ARGB8888;
+	else
+		cbpitch |= R300_COLOR_FORMAT_RGB565;
+
+	if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
+		cbpitch |= R300_COLOR_TILE_ENABLE;
+
+    	if (r300->radeon.radeonScreen->kernel_mm)
+		dw += 2;
+	BEGIN_BATCH_NO_AUTOSTATE(dw);
+	OUT_BATCH_REGSEQ(R300_RB3D_COLOROFFSET0, 1);
+	OUT_BATCH_RELOC(offset, rrb->bo, offset, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+	OUT_BATCH_REGSEQ(R300_RB3D_COLORPITCH0, 1);
+    	if (!r300->radeon.radeonScreen->kernel_mm)
+		OUT_BATCH(cbpitch);
+	else
+		OUT_BATCH_RELOC(cbpitch, rrb->bo, cbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+	END_BATCH();
+    if (r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+        if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+            BEGIN_BATCH_NO_AUTOSTATE(3);
+            OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
+            OUT_BATCH(0);
+            OUT_BATCH(((rrb->width - 1) << R300_SCISSORS_X_SHIFT) |
+                    ((rrb->height - 1) << R300_SCISSORS_Y_SHIFT));
+            END_BATCH();
+            BEGIN_BATCH_NO_AUTOSTATE(16);
+            for (i = 0; i < 4; i++) {
+                OUT_BATCH_REGSEQ(R300_SC_CLIPRECT_TL_0 + (i * 8), 2);
+                OUT_BATCH((0 << R300_CLIPRECT_X_SHIFT) | (0 << R300_CLIPRECT_Y_SHIFT));
+                OUT_BATCH(((rrb->width - 1) << R300_CLIPRECT_X_SHIFT) | ((rrb->height - 1) << R300_CLIPRECT_Y_SHIFT));
+            }
+            OUT_BATCH_REGSEQ(R300_SC_CLIP_RULE, 1);
+            OUT_BATCH(0xAAAA);
+            OUT_BATCH_REGSEQ(R300_SC_SCREENDOOR, 1);
+            OUT_BATCH(0xffffff);
+            END_BATCH();
+        } else {
+            BEGIN_BATCH_NO_AUTOSTATE(3);
+            OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
+            OUT_BATCH((R300_SCISSORS_OFFSET << R300_SCISSORS_X_SHIFT) |
+                    (R300_SCISSORS_OFFSET << R300_SCISSORS_Y_SHIFT));
+            OUT_BATCH(((rrb->width + R300_SCISSORS_OFFSET - 1) << R300_SCISSORS_X_SHIFT) |
+                    ((rrb->height + R300_SCISSORS_OFFSET - 1) << R300_SCISSORS_Y_SHIFT));
+            END_BATCH();
+            BEGIN_BATCH_NO_AUTOSTATE(16);
+            for (i = 0; i < 4; i++) {
+                OUT_BATCH_REGSEQ(R300_SC_CLIPRECT_TL_0 + (i * 8), 2);
+                OUT_BATCH((R300_SCISSORS_OFFSET << R300_CLIPRECT_X_SHIFT) | (R300_SCISSORS_OFFSET << R300_CLIPRECT_Y_SHIFT));
+                OUT_BATCH(((R300_SCISSORS_OFFSET + rrb->width - 1) << R300_CLIPRECT_X_SHIFT) |
+                          ((R300_SCISSORS_OFFSET + rrb->height - 1) << R300_CLIPRECT_Y_SHIFT));
+            }
+            OUT_BATCH_REGSEQ(R300_SC_CLIP_RULE, 1);
+            OUT_BATCH(0xAAAA);
+            OUT_BATCH_REGSEQ(R300_SC_SCREENDOOR, 1);
+            OUT_BATCH(0xffffff);
+            END_BATCH();
+        }
+    }
+}
 
-	/* To avoid going across the entire set of states multiple times, just check
-	 * for enough space for the case of emitting all state, and inline the
-	 * r300AllocCmdBuf code here without all the checks.
-	 */
-	r300EnsureCmdBufSpace(r300, r300->hw.max_state_size, __FUNCTION__);
+/* Emit the depth buffer offset and pitch; nothing is emitted without a depth renderbuffer that has a BO. */
+static void emit_zb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	struct radeon_renderbuffer *rrb = radeon_get_depthbuffer(&r300->radeon);
+	uint32_t zbpitch;
 
-	if (!r300->cmdbuf.count_used) {
-		if (RADEON_DEBUG & DEBUG_STATE)
-			fprintf(stderr, "Begin reemit state\n");
+	if (!rrb || !rrb->bo)	/* rrb->bo is dereferenced below, same guard as emit_cb_offset */
+		return;
 
-		r300EmitAtoms(r300, GL_FALSE);
-		r300->cmdbuf.count_reemit = r300->cmdbuf.count_used;
+	zbpitch = (rrb->pitch / rrb->cpp);
+	if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
+		zbpitch |= R300_DEPTHMACROTILE_ENABLE;
+	}
+	if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE){
+		zbpitch |= R300_DEPTHMICROTILE_TILED;
 	}
 
-	if (RADEON_DEBUG & DEBUG_STATE)
-		fprintf(stderr, "Begin dirty state\n");
+	BEGIN_BATCH_NO_AUTOSTATE(6);
+	OUT_BATCH_REGSEQ(R300_ZB_DEPTHOFFSET, 1);
+	OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+	OUT_BATCH_REGVAL(R300_ZB_DEPTHPITCH, zbpitch);
+	END_BATCH();
+}
 
-	r300EmitAtoms(r300, GL_TRUE);
+/* Emit the atom's first four command dwords; nothing is emitted when DRI2 is enabled. */
+static void emit_gb_misc(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	int i;
+	if (r300->radeon.radeonScreen->driScreen->dri2.enabled)
+		return;
+	BEGIN_BATCH_NO_AUTOSTATE(4);
+	for (i = 0; i < 4; i++)
+		OUT_BATCH(atom->cmd[i]);
+	END_BATCH();
+}
 
-	assert(r300->cmdbuf.count_used < r300->cmdbuf.size);
+static void emit_threshold_misc(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515)
+		return;	/* nothing is emitted for chip families below RV515 */
+	BEGIN_BATCH_NO_AUTOSTATE(3);	/* the atom's first three command dwords */
+	OUT_BATCH(atom->cmd[0]);
+	OUT_BATCH(atom->cmd[1]);
+	OUT_BATCH(atom->cmd[2]);
+	END_BATCH();
+}
 
-	r300->hw.is_dirty = GL_FALSE;
-	r300->hw.all_dirty = GL_FALSE;
+/* Emit the atom's first two command dwords; nothing is emitted when DRI2 is enabled. */
+static void emit_shade_misc(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	if (r300->radeon.radeonScreen->driScreen->dri2.enabled)
+		return;
+	BEGIN_BATCH_NO_AUTOSTATE(2);
+	OUT_BATCH(atom->cmd[0]);
+	OUT_BATCH(atom->cmd[1]);
+	END_BATCH();
 }
 
-#define packet0_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->packet0.count)
-#define vpu_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->vpu.count)
-#define r500fp_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->r500fp.count)
+/*
+ * Emit the five-dword Z/stencil format atom. The low four bits of cmd[1] are
+ * replaced (in the atom itself) by the depth format picked from the depth
+ * buffer's cpp; the format stays 0 without a depth buffer or for another cpp.
+ */
+static void emit_zstencil_format(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	struct radeon_renderbuffer *rrb = radeon_get_depthbuffer(&r300->radeon);
+	uint32_t format = 0;
+	int i;
+
+	if (rrb && rrb->cpp == 2)
+		format = R300_DEPTHFORMAT_16BIT_INT_Z;
+	else if (rrb && rrb->cpp == 4)
+		format = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
+	atom->cmd[1] = (atom->cmd[1] & ~0xf) | format;
+
+	/* Bracket the writes with BEGIN/END like the other emitters in this file. */
+	BEGIN_BATCH_NO_AUTOSTATE(5);
+	for (i = 0; i < 5; i++)
+		OUT_BATCH(atom->cmd[i]);
+	END_BATCH();
+}
 
-static int check_always(r300ContextPtr r300, struct r300_state_atom *atom)
+static int check_always(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	return atom->cmd_size;
 }
 
-static int check_variable(r300ContextPtr r300, struct r300_state_atom *atom)
+static int check_variable(GLcontext *ctx, struct radeon_state_atom *atom)
 {
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	int cnt;
-	cnt = packet0_count(atom->cmd);
+	if (atom->cmd[0] == CP_PACKET2) {
+		return 0;
+	}
+	cnt = packet0_count(r300, atom->cmd);
 	return cnt ? cnt + 1 : 0;
 }
 
-static int check_vpu(r300ContextPtr r300, struct r300_state_atom *atom)
+int check_vpu(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	int cnt;
+
 	cnt = vpu_count(atom->cmd);
 	return cnt ? (cnt * 4) + 1 : 0;
 }
 
-static int check_r500fp(r300ContextPtr r300, struct r300_state_atom *atom)
+int check_r500fp(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	int cnt;
+
 	cnt = r500fp_count(atom->cmd);
 	return cnt ? (cnt * 6) + 1 : 0;
 }
 
-static int check_r500fp_const(r300ContextPtr r300, struct r300_state_atom *atom)
+int check_r500fp_const(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	int cnt;
+
 	cnt = r500fp_count(atom->cmd);
 	return cnt ? (cnt * 4) + 1 : 0;
 }
@@ -295,8 +467,8 @@ static int check_r500fp_const(r300ContextPtr r300, struct r300_state_atom *atom)
       r300->hw.ATOM.idx = (IDX);					\
       r300->hw.ATOM.check = check_##CHK;				\
       r300->hw.ATOM.dirty = GL_FALSE;					\
-      r300->hw.max_state_size += (SZ);					\
-      insert_at_tail(&r300->hw.atomlist, &r300->hw.ATOM);		\
+      r300->radeon.hw.max_state_size += (SZ);					\
+      insert_at_tail(&r300->radeon.hw.atomlist, &r300->hw.ATOM);		\
    } while (0)
 /**
  * Allocate memory for the command buffer and initialize the state atom
@@ -304,18 +476,16 @@ static int check_r500fp_const(r300ContextPtr r300, struct r300_state_atom *atom)
  */
 void r300InitCmdBuf(r300ContextPtr r300)
 {
-	int size, mtu;
-	int has_tcl = 1;
+	int mtu;
+	int has_tcl;
 	int is_r500 = 0;
-	int i;
 
-	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
-		has_tcl = 0;
+	has_tcl = r300->options.hw_tcl_enabled;
 
 	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
 		is_r500 = 1;
 
-	r300->hw.max_state_size = 2 + 2;	/* reserve extra space for WAIT_IDLE and tex cache flush */
+	r300->radeon.hw.max_state_size = 2 + 2;	/* reserve extra space for WAIT_IDLE and tex cache flush */
 
 	mtu = r300->radeon.glCtx->Const.MaxTextureUnits;
 	if (RADEON_DEBUG & DEBUG_TEXTURE) {
@@ -323,232 +493,248 @@ void r300InitCmdBuf(r300ContextPtr r300)
 	}
 
 	/* Setup the atom linked list */
-	make_empty_list(&r300->hw.atomlist);
-	r300->hw.atomlist.name = "atom-list";
+	make_empty_list(&r300->radeon.hw.atomlist);
+	r300->radeon.hw.atomlist.name = "atom-list";
 
 	/* Initialize state atoms */
 	ALLOC_STATE(vpt, always, R300_VPT_CMDSIZE, 0);
-	r300->hw.vpt.cmd[R300_VPT_CMD_0] = cmdpacket0(R300_SE_VPORT_XSCALE, 6);
+	r300->hw.vpt.cmd[R300_VPT_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_SE_VPORT_XSCALE, 6);
 	ALLOC_STATE(vap_cntl, always, R300_VAP_CNTL_SIZE, 0);
-	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_FLUSH] = cmdpacket0(R300_VAP_PVS_STATE_FLUSH_REG, 1);
+	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_FLUSH] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PVS_STATE_FLUSH_REG, 1);
 	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_FLUSH_1] = 0;
-	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_CMD] = cmdpacket0(R300_VAP_CNTL, 1);
-	if (is_r500) {
+	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_CMD] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_CNTL, 1);
+	if (is_r500 && !r300->radeon.radeonScreen->kernel_mm) {
 	    ALLOC_STATE(vap_index_offset, always, 2, 0);
-	    r300->hw.vap_index_offset.cmd[0] = cmdpacket0(R500_VAP_INDEX_OFFSET, 1);
+	    r300->hw.vap_index_offset.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_VAP_INDEX_OFFSET, 1);
 	    r300->hw.vap_index_offset.cmd[1] = 0;
 	}
 	ALLOC_STATE(vte, always, 3, 0);
-	r300->hw.vte.cmd[0] = cmdpacket0(R300_SE_VTE_CNTL, 2);
+	r300->hw.vte.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SE_VTE_CNTL, 2);
 	ALLOC_STATE(vap_vf_max_vtx_indx, always, 3, 0);
-	r300->hw.vap_vf_max_vtx_indx.cmd[0] = cmdpacket0(R300_VAP_VF_MAX_VTX_INDX, 2);
+	r300->hw.vap_vf_max_vtx_indx.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_VF_MAX_VTX_INDX, 2);
 	ALLOC_STATE(vap_cntl_status, always, 2, 0);
-	r300->hw.vap_cntl_status.cmd[0] = cmdpacket0(R300_VAP_CNTL_STATUS, 1);
+	r300->hw.vap_cntl_status.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_CNTL_STATUS, 1);
 	ALLOC_STATE(vir[0], variable, R300_VIR_CMDSIZE, 0);
 	r300->hw.vir[0].cmd[R300_VIR_CMD_0] =
-	    cmdpacket0(R300_VAP_PROG_STREAM_CNTL_0, 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PROG_STREAM_CNTL_0, 1);
 	ALLOC_STATE(vir[1], variable, R300_VIR_CMDSIZE, 1);
 	r300->hw.vir[1].cmd[R300_VIR_CMD_0] =
-	    cmdpacket0(R300_VAP_PROG_STREAM_CNTL_EXT_0, 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PROG_STREAM_CNTL_EXT_0, 1);
 	ALLOC_STATE(vic, always, R300_VIC_CMDSIZE, 0);
-	r300->hw.vic.cmd[R300_VIC_CMD_0] = cmdpacket0(R300_VAP_VTX_STATE_CNTL, 2);
+	r300->hw.vic.cmd[R300_VIC_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_VTX_STATE_CNTL, 2);
 	ALLOC_STATE(vap_psc_sgn_norm_cntl, always, 2, 0);
-	r300->hw.vap_psc_sgn_norm_cntl.cmd[0] = cmdpacket0(R300_VAP_PSC_SGN_NORM_CNTL, SGN_NORM_ZERO_CLAMP_MINUS_ONE);
+	r300->hw.vap_psc_sgn_norm_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PSC_SGN_NORM_CNTL, 1);	/* last argument is a register count (1 for this 2-dword atom), not a field value */
 
 	if (has_tcl) {
 		ALLOC_STATE(vap_clip_cntl, always, 2, 0);
-		r300->hw.vap_clip_cntl.cmd[0] = cmdpacket0(R300_VAP_CLIP_CNTL, 1);
+		r300->hw.vap_clip_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_CLIP_CNTL, 1);
 		ALLOC_STATE(vap_clip, always, 5, 0);
-		r300->hw.vap_clip.cmd[0] = cmdpacket0(R300_VAP_GB_VERT_CLIP_ADJ, 4);
+		r300->hw.vap_clip.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_GB_VERT_CLIP_ADJ, 4);
 		ALLOC_STATE(vap_pvs_vtx_timeout_reg, always, 2, 0);
-		r300->hw.vap_pvs_vtx_timeout_reg.cmd[0] = cmdpacket0(VAP_PVS_VTX_TIMEOUT_REG, 1);
+		r300->hw.vap_pvs_vtx_timeout_reg.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, VAP_PVS_VTX_TIMEOUT_REG, 1);
 	}
 
 	ALLOC_STATE(vof, always, R300_VOF_CMDSIZE, 0);
 	r300->hw.vof.cmd[R300_VOF_CMD_0] =
-	    cmdpacket0(R300_VAP_OUTPUT_VTX_FMT_0, 2);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_VAP_OUTPUT_VTX_FMT_0, 2);
 
 	if (has_tcl) {
 		ALLOC_STATE(pvs, always, R300_PVS_CMDSIZE, 0);
 		r300->hw.pvs.cmd[R300_PVS_CMD_0] =
-		    cmdpacket0(R300_VAP_PVS_CODE_CNTL_0, 3);
+		    cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PVS_CODE_CNTL_0, 3);
 	}
 
 	ALLOC_STATE(gb_enable, always, 2, 0);
-	r300->hw.gb_enable.cmd[0] = cmdpacket0(R300_GB_ENABLE, 1);
+	r300->hw.gb_enable.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GB_ENABLE, 1);
 	ALLOC_STATE(gb_misc, always, R300_GB_MISC_CMDSIZE, 0);
-	r300->hw.gb_misc.cmd[0] = cmdpacket0(R300_GB_MSPOS0, 5);
+	r300->hw.gb_misc.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GB_MSPOS0, 3);
+	r300->hw.gb_misc.emit = emit_gb_misc;
+	ALLOC_STATE(gb_misc2, always, R300_GB_MISC2_CMDSIZE, 0);
+    r300->hw.gb_misc2.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, 0x401C, 2);
 	ALLOC_STATE(txe, always, R300_TXE_CMDSIZE, 0);
-	r300->hw.txe.cmd[R300_TXE_CMD_0] = cmdpacket0(R300_TX_ENABLE, 1);
+	r300->hw.txe.cmd[R300_TXE_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_TX_ENABLE, 1);
 	ALLOC_STATE(ga_point_s0, always, 5, 0);
-	r300->hw.ga_point_s0.cmd[0] = cmdpacket0(R300_GA_POINT_S0, 4);
+	r300->hw.ga_point_s0.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POINT_S0, 4);
 	ALLOC_STATE(ga_triangle_stipple, always, 2, 0);
-	r300->hw.ga_triangle_stipple.cmd[0] = cmdpacket0(R300_GA_TRIANGLE_STIPPLE, 1);
+	r300->hw.ga_triangle_stipple.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_TRIANGLE_STIPPLE, 1);
 	ALLOC_STATE(ps, always, R300_PS_CMDSIZE, 0);
-	r300->hw.ps.cmd[0] = cmdpacket0(R300_GA_POINT_SIZE, 1);
+	r300->hw.ps.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POINT_SIZE, 1);
 	ALLOC_STATE(ga_point_minmax, always, 4, 0);
-	r300->hw.ga_point_minmax.cmd[0] = cmdpacket0(R300_GA_POINT_MINMAX, 3);
+	r300->hw.ga_point_minmax.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POINT_MINMAX, 3);
 	ALLOC_STATE(lcntl, always, 2, 0);
-	r300->hw.lcntl.cmd[0] = cmdpacket0(R300_GA_LINE_CNTL, 1);
+	r300->hw.lcntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_LINE_CNTL, 1);
 	ALLOC_STATE(ga_line_stipple, always, 4, 0);
-	r300->hw.ga_line_stipple.cmd[0] = cmdpacket0(R300_GA_LINE_STIPPLE_VALUE, 3);
-	ALLOC_STATE(shade, always, 5, 0);
-	r300->hw.shade.cmd[0] = cmdpacket0(R300_GA_ENHANCE, 4);
+	r300->hw.ga_line_stipple.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_LINE_STIPPLE_VALUE, 3);
+	ALLOC_STATE(shade, always, 2, 0);
+	r300->hw.shade.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_ENHANCE, 1);
+	r300->hw.shade.emit = emit_shade_misc;
+	ALLOC_STATE(shade2, always, 4, 0);
+	r300->hw.shade2.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, 0x4278, 3);
 	ALLOC_STATE(polygon_mode, always, 4, 0);
-	r300->hw.polygon_mode.cmd[0] = cmdpacket0(R300_GA_POLY_MODE, 3);
+	r300->hw.polygon_mode.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POLY_MODE, 3);
 	ALLOC_STATE(fogp, always, 3, 0);
-	r300->hw.fogp.cmd[0] = cmdpacket0(R300_GA_FOG_SCALE, 2);
+	r300->hw.fogp.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_FOG_SCALE, 2);
 	ALLOC_STATE(zbias_cntl, always, 2, 0);
-	r300->hw.zbias_cntl.cmd[0] = cmdpacket0(R300_SU_TEX_WRAP, 1);
+	r300->hw.zbias_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_TEX_WRAP, 1);
 	ALLOC_STATE(zbs, always, R300_ZBS_CMDSIZE, 0);
 	r300->hw.zbs.cmd[R300_ZBS_CMD_0] =
-	    cmdpacket0(R300_SU_POLY_OFFSET_FRONT_SCALE, 4);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_SU_POLY_OFFSET_FRONT_SCALE, 4);
 	ALLOC_STATE(occlusion_cntl, always, 2, 0);
-	r300->hw.occlusion_cntl.cmd[0] = cmdpacket0(R300_SU_POLY_OFFSET_ENABLE, 1);
+	r300->hw.occlusion_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_POLY_OFFSET_ENABLE, 1);
 	ALLOC_STATE(cul, always, R300_CUL_CMDSIZE, 0);
-	r300->hw.cul.cmd[R300_CUL_CMD_0] = cmdpacket0(R300_SU_CULL_MODE, 1);
+	r300->hw.cul.cmd[R300_CUL_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_CULL_MODE, 1);
 	ALLOC_STATE(su_depth_scale, always, 3, 0);
-	r300->hw.su_depth_scale.cmd[0] = cmdpacket0(R300_SU_DEPTH_SCALE, 2);
+	r300->hw.su_depth_scale.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_DEPTH_SCALE, 2);
 	ALLOC_STATE(rc, always, R300_RC_CMDSIZE, 0);
-	r300->hw.rc.cmd[R300_RC_CMD_0] = cmdpacket0(R300_RS_COUNT, 2);
+	r300->hw.rc.cmd[R300_RC_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_COUNT, 2);
 	if (is_r500) {
-		ALLOC_STATE(ri, always, R500_RI_CMDSIZE, 0);
-		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(R500_RS_IP_0, 16);
-		for (i = 0; i < 8; i++) {
-			r300->hw.ri.cmd[R300_RI_CMD_0 + i +1] =
-			  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
-                          (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
-                          (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
-                          (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT);
-		}
+		ALLOC_STATE(ri, variable, R500_RI_CMDSIZE, 0);
+		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_IP_0, 16);
 		ALLOC_STATE(rr, variable, R300_RR_CMDSIZE, 0);
-		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R500_RS_INST_0, 1);
+		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_INST_0, 1);
 	} else {
-		ALLOC_STATE(ri, always, R300_RI_CMDSIZE, 0);
-		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(R300_RS_IP_0, 8);
+		ALLOC_STATE(ri, variable, R300_RI_CMDSIZE, 0);
+		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_IP_0, 8);
 		ALLOC_STATE(rr, variable, R300_RR_CMDSIZE, 0);
-		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R300_RS_INST_0, 1);
+		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_INST_0, 1);
 	}
 	ALLOC_STATE(sc_hyperz, always, 3, 0);
-	r300->hw.sc_hyperz.cmd[0] = cmdpacket0(R300_SC_HYPERZ, 2);
+	r300->hw.sc_hyperz.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SC_HYPERZ, 2);
 	ALLOC_STATE(sc_screendoor, always, 2, 0);
-	r300->hw.sc_screendoor.cmd[0] = cmdpacket0(R300_SC_SCREENDOOR, 1);
+	r300->hw.sc_screendoor.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SC_SCREENDOOR, 1);
 	ALLOC_STATE(us_out_fmt, always, 6, 0);
-	r300->hw.us_out_fmt.cmd[0] = cmdpacket0(R300_US_OUT_FMT, 5);
+	r300->hw.us_out_fmt.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_OUT_FMT, 5);
 
 	if (is_r500) {
 		ALLOC_STATE(fp, always, R500_FP_CMDSIZE, 0);
-		r300->hw.fp.cmd[R500_FP_CMD_0] = cmdpacket0(R500_US_CONFIG, 2);
+		r300->hw.fp.cmd[R500_FP_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_US_CONFIG, 2);
 		r300->hw.fp.cmd[R500_FP_CNTL] = R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO;
-		r300->hw.fp.cmd[R500_FP_CMD_1] = cmdpacket0(R500_US_CODE_ADDR, 3);
-		r300->hw.fp.cmd[R500_FP_CMD_2] = cmdpacket0(R500_US_FC_CTRL, 1);
+		r300->hw.fp.cmd[R500_FP_CMD_1] = cmdpacket0(r300->radeon.radeonScreen, R500_US_CODE_ADDR, 3);
+		r300->hw.fp.cmd[R500_FP_CMD_2] = cmdpacket0(r300->radeon.radeonScreen, R500_US_FC_CTRL, 1);
 		r300->hw.fp.cmd[R500_FP_FC_CNTL] = 0; /* FIXME when we add flow control */
 
 		ALLOC_STATE(r500fp, r500fp, R500_FPI_CMDSIZE, 0);
-		r300->hw.r500fp.cmd[R300_FPI_CMD_0] = cmdr500fp(0, 0, 0, 0);
+		r300->hw.r500fp.cmd[R300_FPI_CMD_0] =
+			cmdr500fp(r300->radeon.radeonScreen, 0, 0, 0, 0);
+		r300->hw.r500fp.emit = emit_r500fp;
 		ALLOC_STATE(r500fp_const, r500fp_const, R500_FPP_CMDSIZE, 0);
-		r300->hw.r500fp_const.cmd[R300_FPI_CMD_0] = cmdr500fp(0, 0, 1, 0);
+		r300->hw.r500fp_const.cmd[R300_FPI_CMD_0] =
+			cmdr500fp(r300->radeon.radeonScreen, 0, 0, 1, 0);
+		r300->hw.r500fp_const.emit = emit_r500fp;
 	} else {
 		ALLOC_STATE(fp, always, R300_FP_CMDSIZE, 0);
-		r300->hw.fp.cmd[R300_FP_CMD_0] = cmdpacket0(R300_US_CONFIG, 3);
-		r300->hw.fp.cmd[R300_FP_CMD_1] = cmdpacket0(R300_US_CODE_ADDR_0, 4);
+		r300->hw.fp.cmd[R300_FP_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_CONFIG, 3);
+		r300->hw.fp.cmd[R300_FP_CMD_1] = cmdpacket0(r300->radeon.radeonScreen, R300_US_CODE_ADDR_0, 4);
+
 		ALLOC_STATE(fpt, variable, R300_FPT_CMDSIZE, 0);
-		r300->hw.fpt.cmd[R300_FPT_CMD_0] = cmdpacket0(R300_US_TEX_INST_0, 0);
+		r300->hw.fpt.cmd[R300_FPT_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_TEX_INST_0, 0);
 
 		ALLOC_STATE(fpi[0], variable, R300_FPI_CMDSIZE, 0);
-		r300->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, 1);
+		r300->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_RGB_INST_0, 1);
 		ALLOC_STATE(fpi[1], variable, R300_FPI_CMDSIZE, 1);
-		r300->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, 1);
+		r300->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_RGB_ADDR_0, 1);
 		ALLOC_STATE(fpi[2], variable, R300_FPI_CMDSIZE, 2);
-		r300->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, 1);
+		r300->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_ALPHA_INST_0, 1);
 		ALLOC_STATE(fpi[3], variable, R300_FPI_CMDSIZE, 3);
-		r300->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, 1);
+		r300->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_ALPHA_ADDR_0, 1);
 		ALLOC_STATE(fpp, variable, R300_FPP_CMDSIZE, 0);
-		r300->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, 0);
+		r300->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_PFS_PARAM_0_X, 0);
 	}
 	ALLOC_STATE(fogs, always, R300_FOGS_CMDSIZE, 0);
-	r300->hw.fogs.cmd[R300_FOGS_CMD_0] = cmdpacket0(R300_FG_FOG_BLEND, 1);
+	r300->hw.fogs.cmd[R300_FOGS_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_FOG_BLEND, 1);
 	ALLOC_STATE(fogc, always, R300_FOGC_CMDSIZE, 0);
-	r300->hw.fogc.cmd[R300_FOGC_CMD_0] = cmdpacket0(R300_FG_FOG_COLOR_R, 3);
+	r300->hw.fogc.cmd[R300_FOGC_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_FOG_COLOR_R, 3);
 	ALLOC_STATE(at, always, R300_AT_CMDSIZE, 0);
-	r300->hw.at.cmd[R300_AT_CMD_0] = cmdpacket0(R300_FG_ALPHA_FUNC, 2);
+	r300->hw.at.cmd[R300_AT_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_ALPHA_FUNC, 2);
 	ALLOC_STATE(fg_depth_src, always, 2, 0);
-	r300->hw.fg_depth_src.cmd[0] = cmdpacket0(R300_FG_DEPTH_SRC, 1);
+	r300->hw.fg_depth_src.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_DEPTH_SRC, 1);
 	ALLOC_STATE(rb3d_cctl, always, 2, 0);
-	r300->hw.rb3d_cctl.cmd[0] = cmdpacket0(R300_RB3D_CCTL, 1);
+	r300->hw.rb3d_cctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_CCTL, 1);
 	ALLOC_STATE(bld, always, R300_BLD_CMDSIZE, 0);
-	r300->hw.bld.cmd[R300_BLD_CMD_0] = cmdpacket0(R300_RB3D_CBLEND, 2);
+	r300->hw.bld.cmd[R300_BLD_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_CBLEND, 2);
 	ALLOC_STATE(cmk, always, R300_CMK_CMDSIZE, 0);
-	r300->hw.cmk.cmd[R300_CMK_CMD_0] = cmdpacket0(RB3D_COLOR_CHANNEL_MASK, 1);
+	r300->hw.cmk.cmd[R300_CMK_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, RB3D_COLOR_CHANNEL_MASK, 1);
 	if (is_r500) {
 		ALLOC_STATE(blend_color, always, 3, 0);
-		r300->hw.blend_color.cmd[0] = cmdpacket0(R500_RB3D_CONSTANT_COLOR_AR, 2);
+		r300->hw.blend_color.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_RB3D_CONSTANT_COLOR_AR, 2);
 	} else {
 		ALLOC_STATE(blend_color, always, 2, 0);
-		r300->hw.blend_color.cmd[0] = cmdpacket0(R300_RB3D_BLEND_COLOR, 1);
+		r300->hw.blend_color.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_BLEND_COLOR, 1);
 	}
 	ALLOC_STATE(rop, always, 2, 0);
-	r300->hw.rop.cmd[0] = cmdpacket0(R300_RB3D_ROPCNTL, 1);
+	r300->hw.rop.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_ROPCNTL, 1);
 	ALLOC_STATE(cb, always, R300_CB_CMDSIZE, 0);
-	r300->hw.cb.cmd[R300_CB_CMD_0] = cmdpacket0(R300_RB3D_COLOROFFSET0, 1);
-	r300->hw.cb.cmd[R300_CB_CMD_1] = cmdpacket0(R300_RB3D_COLORPITCH0, 1);
+	r300->hw.cb.emit = &emit_cb_offset;
 	ALLOC_STATE(rb3d_dither_ctl, always, 10, 0);
-	r300->hw.rb3d_dither_ctl.cmd[0] = cmdpacket0(R300_RB3D_DITHER_CTL, 9);
+	r300->hw.rb3d_dither_ctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_DITHER_CTL, 9);
 	ALLOC_STATE(rb3d_aaresolve_ctl, always, 2, 0);
-	r300->hw.rb3d_aaresolve_ctl.cmd[0] = cmdpacket0(R300_RB3D_AARESOLVE_CTL, 1);
-	ALLOC_STATE(rb3d_discard_src_pixel_lte_threshold, always, 3, 0);
-	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[0] = cmdpacket0(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 2);
+	r300->hw.rb3d_aaresolve_ctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_AARESOLVE_CTL, 1);
+    ALLOC_STATE(rb3d_discard_src_pixel_lte_threshold, always, 3, 0);
+    r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 2);
+	r300->hw.rb3d_discard_src_pixel_lte_threshold.emit = emit_threshold_misc;
 	ALLOC_STATE(zs, always, R300_ZS_CMDSIZE, 0);
 	r300->hw.zs.cmd[R300_ZS_CMD_0] =
-	    cmdpacket0(R300_ZB_CNTL, 3);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_ZB_CNTL, 3);
+
 	ALLOC_STATE(zstencil_format, always, 5, 0);
 	r300->hw.zstencil_format.cmd[0] =
-	    cmdpacket0(R300_ZB_FORMAT, 4);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_ZB_FORMAT, 4);
+	r300->hw.zstencil_format.emit = emit_zstencil_format;
+
 	ALLOC_STATE(zb, always, R300_ZB_CMDSIZE, 0);
-	r300->hw.zb.cmd[R300_ZB_CMD_0] = cmdpacket0(R300_ZB_DEPTHOFFSET, 2);
+	r300->hw.zb.emit = emit_zb_offset;
 	ALLOC_STATE(zb_depthclearvalue, always, 2, 0);
-	r300->hw.zb_depthclearvalue.cmd[0] = cmdpacket0(R300_ZB_DEPTHCLEARVALUE, 1);
-	ALLOC_STATE(unk4F30, always, 3, 0);
-	r300->hw.unk4F30.cmd[0] = cmdpacket0(0x4F30, 2);
+	r300->hw.zb_depthclearvalue.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_DEPTHCLEARVALUE, 1);
+	ALLOC_STATE(zb_zmask, always, 3, 0);
+	r300->hw.zb_zmask.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_ZMASK_OFFSET, 2);
 	ALLOC_STATE(zb_hiz_offset, always, 2, 0);
-	r300->hw.zb_hiz_offset.cmd[0] = cmdpacket0(R300_ZB_HIZ_OFFSET, 1);
+	r300->hw.zb_hiz_offset.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_HIZ_OFFSET, 1);
 	ALLOC_STATE(zb_hiz_pitch, always, 2, 0);
-	r300->hw.zb_hiz_pitch.cmd[0] = cmdpacket0(R300_ZB_HIZ_PITCH, 1);
+	r300->hw.zb_hiz_pitch.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_HIZ_PITCH, 1);
 
 	/* VPU only on TCL */
 	if (has_tcl) {
    	        int i;
 		ALLOC_STATE(vpi, vpu, R300_VPI_CMDSIZE, 0);
-		r300->hw.vpi.cmd[R300_VPI_CMD_0] =
-		    cmdvpu(R300_PVS_CODE_START, 0);
+		r300->hw.vpi.cmd[0] =
+		    cmdvpu(r300->radeon.radeonScreen, R300_PVS_CODE_START, 0);
+		r300->hw.vpi.emit = emit_vpu;
 
 		if (is_r500) {
 		    ALLOC_STATE(vpp, vpu, R300_VPP_CMDSIZE, 0);
-		    r300->hw.vpp.cmd[R300_VPP_CMD_0] =
-			cmdvpu(R500_PVS_CONST_START, 0);
+		    r300->hw.vpp.cmd[0] =
+			cmdvpu(r300->radeon.radeonScreen, R500_PVS_CONST_START, 0);
+		    r300->hw.vpp.emit = emit_vpu;
 
 		    ALLOC_STATE(vps, vpu, R300_VPS_CMDSIZE, 0);
-		    r300->hw.vps.cmd[R300_VPS_CMD_0] =
-			cmdvpu(R500_POINT_VPORT_SCALE_OFFSET, 1);
+		    r300->hw.vps.cmd[0] =
+			cmdvpu(r300->radeon.radeonScreen, R500_POINT_VPORT_SCALE_OFFSET, 1);
+		    r300->hw.vps.emit = emit_vpu;
 
 			for (i = 0; i < 6; i++) {
-				ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
-				r300->hw.vpucp[i].cmd[R300_VPUCP_CMD_0] =
-					cmdvpu(R500_PVS_UCP_START + i, 1);
+			  ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
+			  r300->hw.vpucp[i].cmd[0] =
+				  cmdvpu(r300->radeon.radeonScreen,
+                           R500_PVS_UCP_START + i, 1);
+				r300->hw.vpucp[i].emit = emit_vpu;
 			}
 		} else {
 		    ALLOC_STATE(vpp, vpu, R300_VPP_CMDSIZE, 0);
-		    r300->hw.vpp.cmd[R300_VPP_CMD_0] =
-			cmdvpu(R300_PVS_CONST_START, 0);
+		    r300->hw.vpp.cmd[0] =
+			cmdvpu(r300->radeon.radeonScreen, R300_PVS_CONST_START, 0);
+		    r300->hw.vpp.emit = emit_vpu;
 
 		    ALLOC_STATE(vps, vpu, R300_VPS_CMDSIZE, 0);
-		    r300->hw.vps.cmd[R300_VPS_CMD_0] =
-			cmdvpu(R300_POINT_VPORT_SCALE_OFFSET, 1);
+		    r300->hw.vps.cmd[0] =
+			cmdvpu(r300->radeon.radeonScreen, R300_POINT_VPORT_SCALE_OFFSET, 1);
+		    r300->hw.vps.emit = emit_vpu;
 
 			for (i = 0; i < 6; i++) {
 				ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
-				r300->hw.vpucp[i].cmd[R300_VPUCP_CMD_0] =
-					cmdvpu(R300_PVS_UCP_START + i, 1);
+				r300->hw.vpucp[i].cmd[0] =
+					cmdvpu(r300->radeon.radeonScreen,
+					       R300_PVS_UCP_START + i, 1);
+				r300->hw.vpucp[i].emit = emit_vpu;
 			}
 		}
 	}
@@ -556,130 +742,37 @@ void r300InitCmdBuf(r300ContextPtr r300)
 	/* Textures */
 	ALLOC_STATE(tex.filter, variable, mtu + 1, 0);
 	r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FILTER0_0, 0);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER0_0, 0);
 
 	ALLOC_STATE(tex.filter_1, variable, mtu + 1, 0);
 	r300->hw.tex.filter_1.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FILTER1_0, 0);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER1_0, 0);
 
 	ALLOC_STATE(tex.size, variable, mtu + 1, 0);
-	r300->hw.tex.size.cmd[R300_TEX_CMD_0] = cmdpacket0(R300_TX_SIZE_0, 0);
+	r300->hw.tex.size.cmd[R300_TEX_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_TX_SIZE_0, 0);
 
 	ALLOC_STATE(tex.format, variable, mtu + 1, 0);
 	r300->hw.tex.format.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FORMAT_0, 0);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT_0, 0);
 
 	ALLOC_STATE(tex.pitch, variable, mtu + 1, 0);
-	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] = cmdpacket0(R300_TX_FORMAT2_0, 0);
+	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT2_0, 0);
 
-	ALLOC_STATE(tex.offset, variable, mtu + 1, 0);
+	ALLOC_STATE(tex.offset, variable, 1, 0);
 	r300->hw.tex.offset.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_OFFSET_0, 0);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_OFFSET_0, 0);
+	r300->hw.tex.offset.emit = &emit_tex_offsets;
 
 	ALLOC_STATE(tex.chroma_key, variable, mtu + 1, 0);
 	r300->hw.tex.chroma_key.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_CHROMA_KEY_0, 0);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_CHROMA_KEY_0, 0);
 
 	ALLOC_STATE(tex.border_color, variable, mtu + 1, 0);
 	r300->hw.tex.border_color.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_BORDER_COLOR_0, 0);
-
-	r300->hw.is_dirty = GL_TRUE;
-	r300->hw.all_dirty = GL_TRUE;
-
-	/* Initialize command buffer */
-	size =
-	    256 * driQueryOptioni(&r300->radeon.optionCache,
-				  "command_buffer_size");
-	if (size < 2 * r300->hw.max_state_size) {
-		size = 2 * r300->hw.max_state_size + 65535;
-	}
-	if (size > 64 * 256)
-		size = 64 * 256;
-
-	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA)) {
-		fprintf(stderr, "sizeof(drm_r300_cmd_header_t)=%zd\n",
-			sizeof(drm_r300_cmd_header_t));
-		fprintf(stderr, "sizeof(drm_radeon_cmd_buffer_t)=%zd\n",
-			sizeof(drm_radeon_cmd_buffer_t));
-		fprintf(stderr,
-			"Allocating %d bytes command buffer (max state is %d bytes)\n",
-			size * 4, r300->hw.max_state_size * 4);
-	}
-
-	r300->cmdbuf.size = size;
-	r300->cmdbuf.cmd_buf = (uint32_t *) CALLOC(size * 4);
-	r300->cmdbuf.count_used = 0;
-	r300->cmdbuf.count_reemit = 0;
-}
-
-/**
- * Destroy the command buffer and state atoms.
- */
-void r300DestroyCmdBuf(r300ContextPtr r300)
-{
-	struct r300_state_atom *atom;
-
-	FREE(r300->cmdbuf.cmd_buf);
-
-	foreach(atom, &r300->hw.atomlist) {
-		FREE(atom->cmd);
-	}
-}
-
-void r300EmitBlit(r300ContextPtr rmesa,
-		  GLuint color_fmt,
-		  GLuint src_pitch,
-		  GLuint src_offset,
-		  GLuint dst_pitch,
-		  GLuint dst_offset,
-		  GLint srcx, GLint srcy,
-		  GLint dstx, GLint dsty, GLuint w, GLuint h)
-{
-	drm_r300_cmd_header_t *cmd;
-
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr,
-			"%s src %x/%x %d,%d dst: %x/%x %d,%d sz: %dx%d\n",
-			__FUNCTION__, src_pitch, src_offset, srcx, srcy,
-			dst_pitch, dst_offset, dstx, dsty, w, h);
-
-	assert((src_pitch & 63) == 0);
-	assert((dst_pitch & 63) == 0);
-	assert((src_offset & 1023) == 0);
-	assert((dst_offset & 1023) == 0);
-	assert(w < (1 << 16));
-	assert(h < (1 << 16));
-
-	cmd = (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa, 8, __FUNCTION__);
-
-	cmd[0].header.cmd_type = R300_CMD_PACKET3;
-	cmd[0].header.pad0 = R300_CMD_PACKET3_RAW;
-	cmd[1].u = R300_CP_CMD_BITBLT_MULTI | (5 << 16);
-	cmd[2].u = (RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
-		    RADEON_GMC_DST_PITCH_OFFSET_CNTL |
-		    RADEON_GMC_BRUSH_NONE |
-		    (color_fmt << 8) |
-		    RADEON_GMC_SRC_DATATYPE_COLOR |
-		    RADEON_ROP3_S |
-		    RADEON_DP_SRC_SOURCE_MEMORY |
-		    RADEON_GMC_CLR_CMP_CNTL_DIS | RADEON_GMC_WR_MSK_DIS);
-
-	cmd[3].u = ((src_pitch / 64) << 22) | (src_offset >> 10);
-	cmd[4].u = ((dst_pitch / 64) << 22) | (dst_offset >> 10);
-	cmd[5].u = (srcx << 16) | srcy;
-	cmd[6].u = (dstx << 16) | dsty;	/* dst */
-	cmd[7].u = (w << 16) | h;
-}
-
-void r300EmitWait(r300ContextPtr rmesa, GLuint flags)
-{
-	drm_r300_cmd_header_t *cmd;
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_BORDER_COLOR_0, 0);
 
-	assert(!(flags & ~(R300_WAIT_2D | R300_WAIT_3D)));
+	r300->radeon.hw.is_dirty = GL_TRUE;
+	r300->radeon.hw.all_dirty = GL_TRUE;
 
-	cmd = (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
-	cmd[0].u = 0;
-	cmd[0].wait.cmd_type = R300_CMD_WAIT;
-	cmd[0].wait.flags = flags;
+	rcommonInitCmdBuf(&r300->radeon);
 }
diff --git a/src/mesa/drivers/dri/r300/r300_cmdbuf.h b/src/mesa/drivers/dri/r300/r300_cmdbuf.h
index a8eaa580bd..53bcc0eeb4 100644
--- a/src/mesa/drivers/dri/r300/r300_cmdbuf.h
+++ b/src/mesa/drivers/dri/r300/r300_cmdbuf.h
@@ -38,79 +38,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "r300_context.h"
 
-extern int r300FlushCmdBufLocked(r300ContextPtr r300, const char *caller);
-extern int r300FlushCmdBuf(r300ContextPtr r300, const char *caller);
-
-extern void r300EmitState(r300ContextPtr r300);
-
 extern void r300InitCmdBuf(r300ContextPtr r300);
-extern void r300DestroyCmdBuf(r300ContextPtr r300);
-
-/**
- * Make sure that enough space is available in the command buffer
- * by flushing if necessary.
- *
- * \param dwords The number of dwords we need to be free on the command buffer
- */
-static INLINE void r300EnsureCmdBufSpace(r300ContextPtr r300,
-					     int dwords, const char *caller)
-{
-	assert(dwords < r300->cmdbuf.size);
-
-	if (r300->cmdbuf.count_used + dwords > r300->cmdbuf.size)
-		r300FlushCmdBuf(r300, caller);
-}
-
-/**
- * Allocate the given number of dwords in the command buffer and return
- * a pointer to the allocated area.
- * When necessary, these functions cause a flush. r300AllocCmdBuf() also
- * causes state reemission after a flush. This is necessary to ensure
- * correct hardware state after an unlock.
- */
-static INLINE uint32_t *r300RawAllocCmdBuf(r300ContextPtr r300,
-					       int dwords, const char *caller)
-{
-	uint32_t *ptr;
-
-	r300EnsureCmdBufSpace(r300, dwords, caller);
-
-	ptr = &r300->cmdbuf.cmd_buf[r300->cmdbuf.count_used];
-	r300->cmdbuf.count_used += dwords;
-	return ptr;
-}
-
-static INLINE uint32_t *r300AllocCmdBuf(r300ContextPtr r300,
-					    int dwords, const char *caller)
-{
-	uint32_t *ptr;
-
-	r300EnsureCmdBufSpace(r300, dwords, caller);
-
-	if (!r300->cmdbuf.count_used) {
-		if (RADEON_DEBUG & DEBUG_IOCTL)
-			fprintf(stderr,
-				"Reemit state after flush (from %s)\n", caller);
-		r300EmitState(r300);
-	}
-
-	ptr = &r300->cmdbuf.cmd_buf[r300->cmdbuf.count_used];
-	r300->cmdbuf.count_used += dwords;
-	return ptr;
-}
+void r300_emit_scissor(GLcontext *ctx);
 
-extern void r300EmitBlit(r300ContextPtr rmesa,
-			 GLuint color_fmt,
-			 GLuint src_pitch,
-			 GLuint src_offset,
-			 GLuint dst_pitch,
-			 GLuint dst_offset,
-			 GLint srcx, GLint srcy,
-			 GLint dstx, GLint dsty, GLuint w, GLuint h);
+void emit_vpu(GLcontext *ctx, struct radeon_state_atom * atom);
+int check_vpu(GLcontext *ctx, struct radeon_state_atom *atom);
 
-extern void r300EmitWait(r300ContextPtr rmesa, GLuint flags);
-extern void r300EmitLOAD_VBPNTR(r300ContextPtr rmesa, int start);
-extern void r300EmitVertexShader(r300ContextPtr rmesa);
-extern void r300EmitPixelShader(r300ContextPtr rmesa);
+void emit_r500fp(GLcontext *ctx, struct radeon_state_atom * atom);
+int check_r500fp(GLcontext *ctx, struct radeon_state_atom *atom);
+int check_r500fp_const(GLcontext *ctx, struct radeon_state_atom *atom);
 
 #endif				/* __R300_CMDBUF_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index 7d6705058f..14a11ea1fb 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -43,8 +43,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/matrix.h"
 #include "main/extensions.h"
 #include "main/state.h"
-#include "main/texobj.h"
 #include "main/bufferobj.h"
+#include "main/texobj.h"
 
 #include "swrast/swrast.h"
 #include "swrast_setup/swrast_setup.h"
@@ -56,42 +56,40 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "drivers/common/driverfuncs.h"
 
-#include "radeon_ioctl.h"
-#include "radeon_span.h"
 #include "r300_context.h"
+#include "radeon_context.h"
+#include "radeon_span.h"
 #include "r300_cmdbuf.h"
 #include "r300_state.h"
 #include "r300_ioctl.h"
 #include "r300_tex.h"
 #include "r300_emit.h"
+#include "r300_render.h"
 #include "r300_swtcl.h"
+#include "radeon_bocs_wrapper.h"
 
-#ifdef USER_BUFFERS
-#include "r300_mem.h"
-#endif
 
 #include "vblank.h"
 #include "utils.h"
 #include "xmlpool.h"		/* for symbolic values of enum-type options */
 
-/* hw_tcl_on derives from future_hw_tcl_on when its safe to change it. */
-int future_hw_tcl_on = 1;
-int hw_tcl_on = 1;
-
 #define need_GL_VERSION_2_0
 #define need_GL_ARB_point_parameters
 #define need_GL_ARB_vertex_program
 #define need_GL_EXT_blend_equation_separate
 #define need_GL_EXT_blend_func_separate
 #define need_GL_EXT_blend_minmax
+#define need_GL_EXT_framebuffer_object
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_gpu_program_parameters
 #define need_GL_EXT_secondary_color
 #define need_GL_EXT_stencil_two_side
 #define need_GL_ATI_separate_stencil
 #define need_GL_NV_vertex_program
+
 #include "extension_helper.h"
 
+
 const struct dri_extension card_extensions[] = {
   /* *INDENT-OFF* */
   {"GL_ARB_depth_texture",		NULL},
@@ -112,6 +110,7 @@ const struct dri_extension card_extensions[] = {
   {"GL_EXT_blend_func_separate",	GL_EXT_blend_func_separate_functions},
   {"GL_EXT_blend_minmax",		GL_EXT_blend_minmax_functions},
   {"GL_EXT_blend_subtract",		NULL},
+  {"GL_EXT_packed_depth_stencil",	NULL},
   {"GL_EXT_fog_coord",			GL_EXT_fog_coord_functions },
   {"GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions},
   {"GL_EXT_secondary_color", 		GL_EXT_secondary_color_functions},
@@ -125,6 +124,8 @@ const struct dri_extension card_extensions[] = {
   {"GL_EXT_texture_lod_bias",		NULL},
   {"GL_EXT_texture_mirror_clamp",	NULL},
   {"GL_EXT_texture_rectangle",		NULL},
+  {"GL_EXT_texture_sRGB",		NULL},
+  {"GL_EXT_vertex_array_bgra",		NULL},
   {"GL_ATI_separate_stencil",		GL_ATI_separate_stencil_functions},
   {"GL_ATI_texture_env_combine3",	NULL},
   {"GL_ATI_texture_mirror_once",	NULL},
@@ -139,6 +140,11 @@ const struct dri_extension card_extensions[] = {
 };
 
 
+const struct dri_extension mm_extensions[] = {	/* NOTE(review): presumably enabled only with the kernel memory manager - confirm at the use site */
+  { "GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions },
+  { NULL, NULL }	/* all-NULL entry closing the list */
+};
+
 /**
  * The GL 2.0 functions are needed to make display lists work with
  * functions added by GL_ATI_separate_stencil.
@@ -147,16 +153,8 @@ const struct dri_extension gl_20_extension[] = {
   {"GL_VERSION_2_0",			GL_VERSION_2_0_functions },
 };
 
-
-extern struct tnl_pipeline_stage _r300_render_stage;
-extern const struct tnl_pipeline_stage _r300_tcl_stage;
-
 static const struct tnl_pipeline_stage *r300_pipeline[] = {
 
-	/* Try and go straight to t&l
-	 */
-	&_r300_tcl_stage,
-
 	/* Catch any t&l fallbacks
 	 */
 	&_tnl_vertex_transform_stage,
@@ -165,6 +163,7 @@ static const struct tnl_pipeline_stage *r300_pipeline[] = {
 	&_tnl_fog_coordinate_stage,
 	&_tnl_texgen_stage,
 	&_tnl_texture_transform_stage,
+	&_tnl_point_attenuation_stage,
 	&_tnl_vertex_program_stage,
 
 	/* Try again to go to tcl?
@@ -184,6 +183,186 @@ static const struct tnl_pipeline_stage *r300_pipeline[] = {
 	0,
 };
 
+/* vtbl.get_lock hook: make this context the SAREA owner if it is not already. */
+static void r300_get_lock(radeonContextPtr rmesa)
+{
+	if (rmesa->sarea->ctx_owner == rmesa->dri.hwContext)
+		return;
+
+	rmesa->sarea->ctx_owner = rmesa->dri.hwContext;
+	if (!rmesa->radeonScreen->kernel_mm)	/* only without kernel memory management */
+		radeon_bo_legacy_texture_age(rmesa->radeonScreen->bom);
+}
+
+/*
+ * vtbl.emit_cs_header hook: writes a fixed preamble of register writes
+ * and waits into the command stream `cs`.
+ */
+static void r300_vtbl_emit_cs_header(struct radeon_cs *cs, radeonContextPtr rmesa)
+{
+	radeonScreenPtr screen = rmesa->radeonScreen;
+
+	/* Make the pipe flush and complete all pending work. */
+	radeon_cs_write_dword(cs, cmdpacket0(screen, R300_SC_SCREENDOOR, 1));
+	radeon_cs_write_dword(cs, 0x0);
+	radeon_cs_write_dword(cs, cmdpacket0(screen, R300_SC_SCREENDOOR, 1));
+	radeon_cs_write_dword(cs, 0x00FFFFFF);
+	radeon_cs_write_dword(cs, cmdpacket0(screen, R300_SC_HYPERZ, 1));
+	radeon_cs_write_dword(cs, 0x0);
+	radeon_cs_write_dword(cs, cmdpacket0(screen, R300_US_CONFIG, 1));
+	radeon_cs_write_dword(cs, 0x0);
+	radeon_cs_write_dword(cs, cmdpacket0(screen, R300_ZB_CNTL, 1));
+	radeon_cs_write_dword(cs, 0x0);
+	radeon_cs_write_dword(cs, cmdwait(screen, R300_WAIT_3D));
+
+	/* Flush the destination and Z caches, then wait once more. */
+	radeon_cs_write_dword(cs, cmdpacket0(screen, R300_RB3D_DSTCACHE_CTLSTAT, 1));
+	radeon_cs_write_dword(cs, R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
+	radeon_cs_write_dword(cs, cmdpacket0(screen, R300_ZB_ZCACHE_CTLSTAT, 1));
+	radeon_cs_write_dword(cs, R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE);
+	radeon_cs_write_dword(cs, cmdwait(screen, R300_WAIT_3D | R300_WAIT_3D_CLEAN));
+}
+
+static void r300_vtbl_pre_emit_atoms(radeonContextPtr radeon)
+{
+	r300ContextPtr r300 = (r300ContextPtr)radeon;
+	BATCH_LOCALS(radeon);
+	/* vtbl.pre_emit_atoms hook: record that a VAP flush is needed, then emit a wait and a TX_INVALTAGS write. */
+	r300->vap_flush_needed = GL_TRUE;
+	/* NOTE(review): R300_TX_FLUSH presumably invalidates the texture cache -- confirm against the register docs. */
+	cp_wait(radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+	BEGIN_BATCH_NO_AUTOSTATE(2);
+	OUT_BATCH_REGVAL(R300_TX_INVALTAGS, R300_TX_FLUSH);
+	END_BATCH();
+	end_3d(radeon);
+}
+
+/* vtbl.fallback hook: set (mode true) or clear (mode false) `bit` in radeon.Fallback. */
+static void r300_fallback(GLcontext *ctx, GLuint bit, GLboolean mode)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+	rmesa->radeon.Fallback = mode ? (rmesa->radeon.Fallback | bit)
+				      : (rmesa->radeon.Fallback & ~bit);
+}
+
+static void r300_init_vtbl(radeonContextPtr radeon)	/* install the r300 callbacks into radeon->vtbl */
+{
+	radeon->vtbl.get_lock = r300_get_lock;
+	radeon->vtbl.update_viewport_offset = r300UpdateViewportOffset;
+	radeon->vtbl.emit_cs_header = r300_vtbl_emit_cs_header;
+	radeon->vtbl.swtcl_flush = r300_swtcl_flush;
+	radeon->vtbl.pre_emit_atoms = r300_vtbl_pre_emit_atoms;
+	radeon->vtbl.fallback = r300_fallback;
+}
+
+/*
+ * Fill in the implementation limits reported through ctx->Const.
+ */
+static void r300InitConstValues(GLcontext *ctx, radeonScreenPtr screen)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	const GLboolean rv515_or_newer = screen->chip_family >= CHIP_FAMILY_RV515;
+
+	/* Texture unit counts come from the option cache. */
+	ctx->Const.MaxTextureImageUnits =
+	    driQueryOptioni(&r300->radeon.optionCache, "texture_image_units");
+	ctx->Const.MaxTextureCoordUnits =
+	    driQueryOptioni(&r300->radeon.optionCache, "texture_coord_units");
+	ctx->Const.MaxTextureUnits = MIN2(ctx->Const.MaxTextureImageUnits,
+		 ctx->Const.MaxTextureCoordUnits);
+
+	ctx->Const.MaxTextureMaxAnisotropy = 16.0;
+	ctx->Const.MaxTextureLodBias = 16.0;
+	ctx->Const.MaxTextureLevels = rv515_or_newer ? 13 : 12;
+
+	ctx->Const.MinPointSize = 1.0;
+	ctx->Const.MinPointSizeAA = 1.0;
+	ctx->Const.MaxPointSize = R300_POINTSIZE_MAX;
+	ctx->Const.MaxPointSizeAA = R300_POINTSIZE_MAX;
+
+	ctx->Const.MinLineWidth = 1.0;
+	ctx->Const.MinLineWidthAA = 1.0;
+	ctx->Const.MaxLineWidth = R300_LINESIZE_MAX;
+	ctx->Const.MaxLineWidthAA = R300_LINESIZE_MAX;
+
+	ctx->Const.MaxDrawBuffers = 1;
+
+	/* Vertex program limits (currently bogus data), hardware TCL only. */
+	if (r300->options.hw_tcl_enabled) {
+		ctx->Const.VertexProgram.MaxInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
+		ctx->Const.VertexProgram.MaxNativeInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
+		ctx->Const.VertexProgram.MaxNativeAttribs = 16;	/* r420 */
+		ctx->Const.VertexProgram.MaxTemps = 32;
+		ctx->Const.VertexProgram.MaxNativeTemps = 32;	/* VSF_MAX_FRAGMENT_TEMPS */
+		ctx->Const.VertexProgram.MaxNativeParameters = 256;	/* r420 */
+		ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
+	}
+
+	/* Fragment program limits that do not depend on the chip family. */
+	ctx->Const.FragmentProgram.MaxNativeAttribs = 11;	/* copy i915... */
+	ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;
+
+	/* Chip-family specific fragment program limits. */
+	if (rv515_or_newer) {
+		ctx->Const.FragmentProgram.MaxNativeTemps = R500_PFS_NUM_TEMP_REGS;
+		ctx->Const.FragmentProgram.MaxNativeParameters = R500_PFS_NUM_CONST_REGS;
+		ctx->Const.FragmentProgram.MaxNativeAluInstructions = R500_PFS_MAX_INST;
+		ctx->Const.FragmentProgram.MaxNativeTexInstructions = R500_PFS_MAX_INST;
+		ctx->Const.FragmentProgram.MaxNativeInstructions = R500_PFS_MAX_INST;
+		ctx->Const.FragmentProgram.MaxNativeTexIndirections = R500_PFS_MAX_INST;
+	} else {
+		ctx->Const.FragmentProgram.MaxNativeTemps = R300_PFS_NUM_TEMP_REGS;
+		ctx->Const.FragmentProgram.MaxNativeParameters = R300_PFS_NUM_CONST_REGS;
+		ctx->Const.FragmentProgram.MaxNativeAluInstructions = R300_PFS_MAX_ALU_INST;
+		ctx->Const.FragmentProgram.MaxNativeTexInstructions = R300_PFS_MAX_TEX_INST;
+		ctx->Const.FragmentProgram.MaxNativeInstructions = R300_PFS_MAX_ALU_INST + R300_PFS_MAX_TEX_INST;
+		ctx->Const.FragmentProgram.MaxNativeTexIndirections = R300_PFS_MAX_TEX_INDIRECT;
+	}
+}
+
+/*
+ * Parse the "r300" configuration files and record the queried settings.
+ */
+static void r300ParseOptions(r300ContextPtr r300, radeonScreenPtr screen)
+{
+	struct r300_options parsed = { 0 };
+
+	driParseConfigFiles(&r300->radeon.optionCache, &screen->optionCache,
+			    screen->driScreen->myNum, "r300");
+
+	r300->radeon.initialMaxAnisotropy = driQueryOptionf(&r300->radeon.optionCache, "def_max_anisotropy");
+	parsed.stencil_two_side_disabled = driQueryOptionb(&r300->radeon.optionCache, "disable_stencil_two_side");
+	parsed.s3tc_force_enabled = driQueryOptionb(&r300->radeon.optionCache, "force_s3tc_enable");
+	parsed.s3tc_force_disabled = driQueryOptionb(&r300->radeon.optionCache, "disable_s3tc");
+
+	/* Hardware TCL needs a TCL-capable chip and a tcl_mode other than DRI_CONF_TCL_SW. */
+	parsed.hw_tcl_enabled = (screen->chip_flags & RADEON_CHIPSET_TCL) &&
+		driQueryOptioni(&r300->radeon.optionCache, "tcl_mode") != DRI_CONF_TCL_SW;
+	parsed.conformance_mode = !driQueryOptionb(&r300->radeon.optionCache, "disable_lowimpact_fallback");
+
+	r300->options = parsed;
+}
+
+/* Initialize the GL extension list: card_extensions, the kernel_mm extras, then the option overrides. */
+static void r300InitGLExtensions(GLcontext *ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+	driInitExtensions(ctx, card_extensions, GL_TRUE);
+	if (r300->radeon.radeonScreen->kernel_mm)
+		driInitExtensions(ctx, mm_extensions, GL_FALSE);
+	if (r300->options.stencil_two_side_disabled)
+		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
+	/* S3TC: on when Mesa_DXTn is set unless "disable_s3tc"; "force_s3tc_enable" turns it on otherwise. */
+	if (ctx->Mesa_DXTn && !r300->options.s3tc_force_disabled) {
+		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+		_mesa_enable_extension(ctx, "GL_S3_s3tc");
+	} else if (r300->options.s3tc_force_enabled) {
+		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+	}
+}
+
 /* Create the device specific rendering context.
  */
 GLboolean r300CreateContext(const __GLcontextModes * glVisual,
@@ -195,42 +374,25 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 	struct dd_function_table functions;
 	r300ContextPtr r300;
 	GLcontext *ctx;
-	int tcl_mode, i;
 
 	assert(glVisual);
 	assert(driContextPriv);
 	assert(screen);
 
-	/* Allocate the R300 context */
 	r300 = (r300ContextPtr) CALLOC(sizeof(*r300));
 	if (!r300)
 		return GL_FALSE;
 
-	if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
-		hw_tcl_on = future_hw_tcl_on = 0;
+	r300ParseOptions(r300, screen);
 
-	/* Parse configuration files.
-	 * Do this here so that initialMaxAnisotropy is set before we create
-	 * the default textures.
-	 */
-	driParseConfigFiles(&r300->radeon.optionCache, &screen->optionCache,
-			    screen->driScreen->myNum, "r300");
-	r300->initialMaxAnisotropy = driQueryOptionf(&r300->radeon.optionCache,
-						     "def_max_anisotropy");
+	r300_init_vtbl(&r300->radeon);
 
-	/* Init default driver functions then plug in our R300-specific functions
-	 * (the texture functions are especially important)
-	 */
 	_mesa_init_driver_functions(&functions);
 	r300InitIoctlFuncs(&functions);
 	r300InitStateFuncs(&functions);
 	r300InitTextureFuncs(&functions);
 	r300InitShaderFuncs(&functions);
 
-#ifdef USER_BUFFERS
-	r300_mem_init(r300);
-#endif
-
 	if (!radeonInitContext(&r300->radeon, &functions,
 			       glVisual, driContextPriv,
 			       sharedContextPrivate)) {
@@ -238,94 +400,17 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 		return GL_FALSE;
 	}
 
-	/* Init r300 context data */
-	r300->dma.buf0_address =
-	    r300->radeon.radeonScreen->buffers->list[0].address;
-
-	(void)memset(r300->texture_heaps, 0, sizeof(r300->texture_heaps));
-	make_empty_list(&r300->swapped);
-
-	r300->nr_heaps = 1 /* screen->numTexHeaps */ ;
-	assert(r300->nr_heaps < RADEON_NR_TEX_HEAPS);
-	for (i = 0; i < r300->nr_heaps; i++) {
-		/* *INDENT-OFF* */
-		r300->texture_heaps[i] = driCreateTextureHeap(i, r300,
-							       screen->
-							       texSize[i], 12,
-							       RADEON_NR_TEX_REGIONS,
-							       (drmTextureRegionPtr)
-							       r300->radeon.sarea->
-							       tex_list[i],
-							       &r300->radeon.sarea->
-							       tex_age[i],
-							       &r300->swapped,
-							       sizeof
-							       (r300TexObj),
-							       (destroy_texture_object_t
-								*)
-							       r300DestroyTexObj);
-		/* *INDENT-ON* */
-	}
-	r300->texture_depth = driQueryOptioni(&r300->radeon.optionCache,
-					      "texture_depth");
-	if (r300->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
-		r300->texture_depth = (screen->cpp == 4) ?
-		    DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
-
-	/* Set the maximum texture size small enough that we can guarentee that
-	 * all texture units can bind a maximal texture and have them both in
-	 * texturable memory at once.
-	 */
-
 	ctx = r300->radeon.glCtx;
 
-	ctx->Const.MaxTextureImageUnits =
-	    driQueryOptioni(&r300->radeon.optionCache, "texture_image_units");
-	ctx->Const.MaxTextureCoordUnits =
-	    driQueryOptioni(&r300->radeon.optionCache, "texture_coord_units");
-	ctx->Const.MaxTextureUnits =
-	    MIN2(ctx->Const.MaxTextureImageUnits,
-		 ctx->Const.MaxTextureCoordUnits);
-	ctx->Const.MaxTextureMaxAnisotropy = 16.0;
-	ctx->Const.MaxTextureLodBias = 16.0;
-
-	if (screen->chip_family >= CHIP_FAMILY_RV515)
-	    ctx->Const.MaxTextureLevels = 13;
-	else
-	    ctx->Const.MaxTextureLevels = 12;
-
-        driCalculateMaxTextureLevels( r300->texture_heaps,
-                                      r300->nr_heaps,
-                                      & ctx->Const,
-                                      4,
-                                      ctx->Const.MaxTextureLevels - 1,
-                                      MIN2(ctx->Const.MaxTextureLevels,
-                                           MAX_3D_TEXTURE_LEVELS) - 1,
-                                      ctx->Const.MaxTextureLevels - 1,
-                                      ctx->Const.MaxTextureLevels - 1,
-                                      ctx->Const.MaxTextureLevels - 1,
-                                      GL_FALSE,
-                                      2 );
-
-	ctx->Const.MinPointSize = 1.0;
-	ctx->Const.MinPointSizeAA = 1.0;
-	ctx->Const.MaxPointSize = R300_POINTSIZE_MAX;
-	ctx->Const.MaxPointSizeAA = R300_POINTSIZE_MAX;
+	r300->fallback = 0;
+	if (r300->options.hw_tcl_enabled)
+		ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
 
-	ctx->Const.MinLineWidth = 1.0;
-	ctx->Const.MinLineWidthAA = 1.0;
-	ctx->Const.MaxLineWidth = R300_LINESIZE_MAX;
-	ctx->Const.MaxLineWidthAA = R300_LINESIZE_MAX;
+	ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
 
-#ifdef USER_BUFFERS
-	/* Needs further modifications */
-#if 0
-	ctx->Const.MaxArrayLockSize =
-	    ( /*512 */ RADEON_BUFFER_SIZE * 16 * 1024) / (4 * 4);
-#endif
-#endif
+	r300InitConstValues(ctx, screen);
 
-	ctx->Const.MaxDrawBuffers = 1;
+	_mesa_set_mvp_with_dp4( ctx, GL_TRUE );
 
 	_mesa_set_mvp_with_dp4( ctx, GL_TRUE );
 
@@ -336,16 +421,12 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 	_tnl_CreateContext(ctx);
 	_swsetup_CreateContext(ctx);
 	_swsetup_Wakeup(ctx);
-	_ae_create_context(ctx);
 
 	/* Install the customized pipeline:
 	 */
 	_tnl_destroy_pipeline(ctx);
 	_tnl_install_pipeline(ctx, r300_pipeline);
-
-	/* Try and keep materials and vertices separate:
-	 */
-/* 	_tnl_isolate_materials(ctx, GL_TRUE); */
+	TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline;
 
 	/* Configure swrast and TNL to match hardware characteristics:
 	 */
@@ -354,226 +435,25 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 	_tnl_allow_pixel_fog(ctx, GL_FALSE);
 	_tnl_allow_vertex_fog(ctx, GL_TRUE);
 
-	/* currently bogus data */
-	if (screen->chip_flags & RADEON_CHIPSET_TCL) {
-	        ctx->Const.VertexProgram.MaxInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
-		ctx->Const.VertexProgram.MaxNativeInstructions =
-		  VSF_MAX_FRAGMENT_LENGTH / 4;
-		ctx->Const.VertexProgram.MaxNativeAttribs = 16;	/* r420 */
-		ctx->Const.VertexProgram.MaxTemps = 32;
-		ctx->Const.VertexProgram.MaxNativeTemps =
-		  /*VSF_MAX_FRAGMENT_TEMPS */ 32;
-		ctx->Const.VertexProgram.MaxNativeParameters = 256;	/* r420 */
-		ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
-	}
-
-	ctx->Const.FragmentProgram.MaxNativeTemps = PFS_NUM_TEMP_REGS;
-	ctx->Const.FragmentProgram.MaxNativeAttribs = 11;	/* copy i915... */
-	ctx->Const.FragmentProgram.MaxNativeParameters = PFS_NUM_CONST_REGS;
-	ctx->Const.FragmentProgram.MaxNativeAluInstructions = PFS_MAX_ALU_INST;
-	ctx->Const.FragmentProgram.MaxNativeTexInstructions = PFS_MAX_TEX_INST;
-	ctx->Const.FragmentProgram.MaxNativeInstructions =
-	    PFS_MAX_ALU_INST + PFS_MAX_TEX_INST;
-	ctx->Const.FragmentProgram.MaxNativeTexIndirections =
-	    PFS_MAX_TEX_INDIRECT;
-	ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;	/* and these are?? */
-	ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
-	ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
-
-	driInitExtensions(ctx, card_extensions, GL_TRUE);
-
-	if (driQueryOptionb
-	    (&r300->radeon.optionCache, "disable_stencil_two_side"))
-		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
-
-	if (r300->radeon.glCtx->Mesa_DXTn
-	    && !driQueryOptionb(&r300->radeon.optionCache, "disable_s3tc")) {
-		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
-		_mesa_enable_extension(ctx, "GL_S3_s3tc");
-	} else
-	    if (driQueryOptionb(&r300->radeon.optionCache, "force_s3tc_enable"))
-	{
-		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+	if (r300->options.hw_tcl_enabled) {
+		r300InitDraw(ctx);
+	} else {
+		r300InitSwtcl(ctx);
 	}
 
-	r300->disable_lowimpact_fallback =
-	    driQueryOptionb(&r300->radeon.optionCache,
-			    "disable_lowimpact_fallback");
-
-	radeonInitSpanFuncs(ctx);
+	radeon_fbo_init(&r300->radeon);
+	radeonInitSpanFuncs( ctx );
 	r300InitCmdBuf(r300);
 	r300InitState(r300);
-	if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
-	        r300InitSwtcl(ctx);
+	r300InitShaderFunctions(r300);
 
-	TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline;
-
-	tcl_mode = driQueryOptioni(&r300->radeon.optionCache, "tcl_mode");
-	if (driQueryOptionb(&r300->radeon.optionCache, "no_rast")) {
-		fprintf(stderr, "disabling 3D acceleration\n");
-#if R200_MERGED
-		FALLBACK(&r300->radeon, RADEON_FALLBACK_DISABLE, 1);
-#endif
-	}
-	if (tcl_mode == DRI_CONF_TCL_SW ||
-	    !(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
-		if (r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
-			r300->radeon.radeonScreen->chip_flags &=
-			    ~RADEON_CHIPSET_TCL;
-			fprintf(stderr, "Disabling HW TCL support\n");
-		}
-		TCL_FALLBACK(r300->radeon.glCtx,
-			     RADEON_TCL_FALLBACK_TCL_DISABLE, 1);
+	if (screen->chip_family == CHIP_FAMILY_RS600 ||	screen->chip_family == CHIP_FAMILY_RS690 ||
+		screen->chip_family == CHIP_FAMILY_RS740) {
+		r300->radeon.texture_row_align = 64;
 	}
 
-	return GL_TRUE;
-}
-
-static void r300FreeGartAllocations(r300ContextPtr r300)
-{
-	int i, ret, tries = 0, done_age, in_use = 0;
-	drm_radeon_mem_free_t memfree;
+	r300InitGLExtensions(ctx);
 
-	memfree.region = RADEON_MEM_REGION_GART;
-
-#ifdef USER_BUFFERS
-	for (i = r300->rmm->u_last; i > 0; i--) {
-		if (r300->rmm->u_list[i].ptr == NULL) {
-			continue;
-		}
-
-		/* check whether this buffer is still in use */
-		if (r300->rmm->u_list[i].pending) {
-			in_use++;
-		}
-	}
-	/* Cannot flush/lock if no context exists. */
-	if (in_use)
-		r300FlushCmdBuf(r300, __FUNCTION__);
-
-	done_age = radeonGetAge((radeonContextPtr) r300);
-
-	for (i = r300->rmm->u_last; i > 0; i--) {
-		if (r300->rmm->u_list[i].ptr == NULL) {
-			continue;
-		}
-
-		/* check whether this buffer is still in use */
-		if (!r300->rmm->u_list[i].pending) {
-			continue;
-		}
-
-		assert(r300->rmm->u_list[i].h_pending == 0);
-
-		tries = 0;
-		while (r300->rmm->u_list[i].age > done_age && tries++ < 1000) {
-			usleep(10);
-			done_age = radeonGetAge((radeonContextPtr) r300);
-		}
-		if (tries >= 1000) {
-			WARN_ONCE("Failed to idle region!");
-		}
-
-		memfree.region_offset = (char *)r300->rmm->u_list[i].ptr -
-		    (char *)r300->radeon.radeonScreen->gartTextures.map;
-
-		ret = drmCommandWrite(r300->radeon.radeonScreen->driScreen->fd,
-				      DRM_RADEON_FREE, &memfree,
-				      sizeof(memfree));
-		if (ret) {
-			fprintf(stderr, "Failed to free at %p\nret = %s\n",
-				r300->rmm->u_list[i].ptr, strerror(-ret));
-		} else {
-			if (i == r300->rmm->u_last)
-				r300->rmm->u_last--;
-
-			r300->rmm->u_list[i].pending = 0;
-			r300->rmm->u_list[i].ptr = NULL;
-		}
-	}
-	r300->rmm->u_head = i;
-#endif				/* USER_BUFFERS */
+	return GL_TRUE;
 }
 
-/* Destroy the device specific context.
- */
-void r300DestroyContext(__DRIcontextPrivate * driContextPriv)
-{
-	GET_CURRENT_CONTEXT(ctx);
-	r300ContextPtr r300 = (r300ContextPtr) driContextPriv->driverPrivate;
-	radeonContextPtr radeon = (radeonContextPtr) r300;
-	radeonContextPtr current = ctx ? RADEON_CONTEXT(ctx) : NULL;
-	int i;
-
-	if (RADEON_DEBUG & DEBUG_DRI) {
-		fprintf(stderr, "Destroying context !\n");
-	}
-
-	/* check if we're deleting the currently bound context */
-	if (&r300->radeon == current) {
-		radeonFlush(r300->radeon.glCtx);
-		_mesa_make_current(NULL, NULL, NULL);
-	}
-
-	/* Free r300 context resources */
-	assert(r300);		/* should never be null */
-
-	if (r300) {
-		GLboolean release_texture_heaps;
-
-		release_texture_heaps =
-		    (r300->radeon.glCtx->Shared->RefCount == 1);
-		_swsetup_DestroyContext(r300->radeon.glCtx);
-		_tnl_DestroyContext(r300->radeon.glCtx);
-		_vbo_DestroyContext(r300->radeon.glCtx);
-		_swrast_DestroyContext(r300->radeon.glCtx);
-
-		if (r300->dma.current.buf) {
-			r300ReleaseDmaRegion(r300, &r300->dma.current,
-					     __FUNCTION__);
-#ifndef USER_BUFFERS
-			r300FlushCmdBuf(r300, __FUNCTION__);
-#endif
-		}
-		r300FreeGartAllocations(r300);
-		r300DestroyCmdBuf(r300);
-
-		if (radeon->state.scissor.pClipRects) {
-			FREE(radeon->state.scissor.pClipRects);
-			radeon->state.scissor.pClipRects = NULL;
-		}
-
-		if (release_texture_heaps) {
-			/* This share group is about to go away, free our private
-			 * texture object data.
-			 */
-			int i;
-
-			for (i = 0; i < r300->nr_heaps; i++) {
-				driDestroyTextureHeap(r300->texture_heaps[i]);
-				r300->texture_heaps[i] = NULL;
-			}
-
-			assert(is_empty_list(&r300->swapped));
-		}
-
-                /* Drop texture object references from current hardware state */
-		for (i = 0; i < 8; i++) {
-			_mesa_reference_texobj(&r300->state.texture.unit[i].texobj, NULL);
-		}
-
-		radeonCleanupContext(&r300->radeon);
-
-#ifdef USER_BUFFERS
-		/* the memory manager might be accessed when Mesa frees the shared
-		 * state, so don't destroy it earlier
-		 */
-		r300_mem_destroy(r300);
-#endif
-
-		/* free the option cache */
-		driDestroyOptionCache(&r300->radeon.optionCache);
-
-		FREE(r300);
-	}
-}
diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
index 96a3205f1a..026c33c67c 100644
--- a/src/mesa/drivers/dri/r300/r300_context.h
+++ b/src/mesa/drivers/dri/r300/r300_context.h
@@ -37,26 +37,20 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #ifndef __R300_CONTEXT_H__
 #define __R300_CONTEXT_H__
 
-#include "tnl/t_vertex.h"
 #include "drm.h"
 #include "radeon_drm.h"
 #include "dri_util.h"
-#include "texmem.h"
+#include "radeon_common.h"
 
-#include "main/macros.h"
 #include "main/mtypes.h"
-#include "main/colormac.h"
-
-#define USER_BUFFERS
+#include "shader/prog_instruction.h"
 
 struct r300_context;
 typedef struct r300_context r300ContextRec;
 typedef struct r300_context *r300ContextPtr;
 
-#include "radeon_lock.h"
-#include "main/mm.h"
 
-/* From http://gcc.gnu.org/onlinedocs/gcc-3.2.3/gcc/Variadic-Macros.html .
+/* From http://gcc.gnu.org/onlinedocs/gcc-3.2.3/gcc/Variadic-Macros.html .
    I suppose we could inline this and use macro to fetch out __LINE__ and stuff in case we run into trouble
    with other compilers ... GLUE!
 */
@@ -73,180 +67,14 @@ typedef struct r300_context *r300ContextPtr;
 	}
 
 #include "r300_vertprog.h"
-#include "r500_fragprog.h"
-
-/**
- * This function takes a float and packs it into a uint32_t
- */
-static INLINE uint32_t r300PackFloat32(float fl)
-{
-	union {
-		float fl;
-		uint32_t u;
-	} u;
-
-	u.fl = fl;
-	return u.u;
-}
-
-/* This is probably wrong for some values, I need to test this
- * some more.  Range checking would be a good idea also..
- *
- * But it works for most things.  I'll fix it later if someone
- * else with a better clue doesn't
- */
-static INLINE uint32_t r300PackFloat24(float f)
-{
-	float mantissa;
-	int exponent;
-	uint32_t float24 = 0;
-
-	if (f == 0.0)
-		return 0;
-
-	mantissa = frexpf(f, &exponent);
-
-	/* Handle -ve */
-	if (mantissa < 0) {
-		float24 |= (1 << 23);
-		mantissa = mantissa * -1.0;
-	}
-	/* Handle exponent, bias of 63 */
-	exponent += 62;
-	float24 |= (exponent << 16);
-	/* Kill 7 LSB of mantissa */
-	float24 |= (r300PackFloat32(mantissa) & 0x7FFFFF) >> 7;
-
-	return float24;
-}
-
-/************ DMA BUFFERS **************/
-
-/* Need refcounting on dma buffers:
- */
-struct r300_dma_buffer {
-	int refcount;		/**< the number of retained regions in buf */
-	drmBufPtr buf;
-	int id;
-};
-#undef GET_START
-#ifdef USER_BUFFERS
-#define GET_START(rvb) (r300GartOffsetFromVirtual(rmesa, (rvb)->address+(rvb)->start))
-#else
-#define GET_START(rvb) (rmesa->radeon.radeonScreen->gart_buffer_offset +		\
-			(rvb)->address - rmesa->dma.buf0_address +	\
-			(rvb)->start)
-#endif
-/* A retained region, eg vertices for indexed vertices.
- */
-struct r300_dma_region {
-	struct r300_dma_buffer *buf;
-	char *address;		/* == buf->address */
-	int start, end, ptr;	/* offsets from start of buf */
-
-	int aos_offset;		/* address in GART memory */
-	int aos_stride;		/* distance between elements, in dwords */
-	int aos_size;		/* number of components (1-4) */
-};
-
-struct r300_dma {
-	/* Active dma region.  Allocations for vertices and retained
-	 * regions come from here.  Also used for emitting random vertices,
-	 * these may be flushed by calling flush_current();
-	 */
-	struct r300_dma_region current;
 
-	void (*flush) (r300ContextPtr);
-
-	char *buf0_address;	/* start of buf[0], for index calcs */
-
-	/* Number of "in-flight" DMA buffers, i.e. the number of buffers
-	 * for which a DISCARD command is currently queued in the command buffer.
-	 */
-	GLuint nr_released_bufs;
-};
-
-       /* Texture related */
-
-typedef struct r300_tex_obj r300TexObj, *r300TexObjPtr;
-
-/* Maximum number of mipmap levels supported by any supported GPU
- */
-#define R300_MAX_TEXTURE_LEVELS 13
-
-/* Texture object in locally shared texture space.
- */
-struct r300_tex_obj {
-	driTextureObject base;
-
-	GLuint bufAddr;		/* Offset to start of locally
-				   shared texture block */
-
-	drm_radeon_tex_image_t image[6][R300_MAX_TEXTURE_LEVELS];
-	/* Six, for the cube faces */
-
-	GLboolean image_override;	/* Image overridden by GLX_EXT_tfp */
-
-	GLuint pitch;		/* this isn't sent to hardware just used in calculations */
-	/* hardware register values */
-	/* Note that R200 has 8 registers per texture and R300 only 7 */
-	GLuint filter;
-	GLuint filter_1;
-	GLuint pitch_reg;
-	GLuint size;		/* npot only */
-	GLuint format;
-	GLuint offset;		/* Image location in the card's address space.
-				   All cube faces follow. */
-	GLuint unknown4;
-	GLuint unknown5;
-	/* end hardware registers */
-
-	/* registers computed by r200 code - keep them here to
-	   compare against what is actually written.
-
-	   to be removed later.. */
-	GLuint pp_border_color;
-	GLuint pp_cubic_faces;	/* cube face 1,2,3,4 log2 sizes */
-	GLuint format_x;
-
-	GLboolean border_fallback;
-
-	GLuint tile_bits;	/* hw texture tile bits used on this texture */
-};
-
-struct r300_texture_env_state {
-	struct gl_texture_object *texobj;
-	GLenum format;
-	GLenum envMode;
-};
 
 /* The blit width for texture uploads
  */
 #define R300_BLIT_WIDTH_BYTES 1024
 #define R300_MAX_TEXTURE_UNITS 8
 
-struct r300_texture_state {
-	struct r300_texture_env_state unit[R300_MAX_TEXTURE_UNITS];
-	int tc_count;		/* number of incoming texture coordinates from VAP */
-};
 
-/**
- * A block of hardware state.
- *
- * When check returns non-zero, the returned number of dwords must be
- * copied verbatim into the command buffer in order to update a state atom
- * when it is dirty.
- */
-struct r300_state_atom {
-	struct r300_state_atom *next, *prev;
-	const char *name;	/* for debug */
-	int cmd_size;		/* maximum size in dwords */
-	GLuint idx;		/* index in an array (e.g. textures) */
-	uint32_t *cmd;
-	GLboolean dirty;
-
-	int (*check) (r300ContextPtr, struct r300_state_atom * atom);
-};
 
 #define R300_VPT_CMD_0		0
 #define R300_VPT_XSCALE		1
@@ -288,9 +116,11 @@ struct r300_state_atom {
 #define R300_GB_MISC_MSPOS_0		1
 #define R300_GB_MISC_MSPOS_1		2
 #define R300_GB_MISC_TILE_CONFIG	3
-#define R300_GB_MISC_SELECT		4
-#define R300_GB_MISC_AA_CONFIG		5
-#define R300_GB_MISC_CMDSIZE		6
+#define R300_GB_MISC_CMDSIZE		4
+#define R300_GB_MISC2_CMD_0		    0
+#define R300_GB_MISC2_SELECT		1
+#define R300_GB_MISC2_AA_CONFIG		2
+#define R300_GB_MISC2_CMDSIZE		3
 
 #define R300_TXE_CMD_0		0
 #define R300_TXE_ENABLE		1
@@ -463,124 +293,100 @@ struct r300_state_atom {
  * Cache for hardware register state.
  */
 struct r300_hw_state {
-	struct r300_state_atom atomlist;
-
-	GLboolean is_dirty;
-	GLboolean all_dirty;
-	int max_state_size;	/* in dwords */
-
-	struct r300_state_atom vpt;	/* viewport (1D98) */
-	struct r300_state_atom vap_cntl;
-        struct r300_state_atom vap_index_offset; /* 0x208c r5xx only */
-	struct r300_state_atom vof;	/* VAP output format register 0x2090 */
-	struct r300_state_atom vte;	/* (20B0) */
-	struct r300_state_atom vap_vf_max_vtx_indx;	/* Maximum Vertex Indx Clamp (2134) */
-	struct r300_state_atom vap_cntl_status;
-	struct r300_state_atom vir[2];	/* vap input route (2150/21E0) */
-	struct r300_state_atom vic;	/* vap input control (2180) */
-	struct r300_state_atom vap_psc_sgn_norm_cntl; /* Programmable Stream Control Signed Normalize Control (21DC) */
-	struct r300_state_atom vap_clip_cntl;
-	struct r300_state_atom vap_clip;
-	struct r300_state_atom vap_pvs_vtx_timeout_reg;	/* Vertex timeout register (2288) */
-	struct r300_state_atom pvs;	/* pvs_cntl (22D0) */
-	struct r300_state_atom gb_enable;	/* (4008) */
-	struct r300_state_atom gb_misc;	/* Multisampling position shifts ? (4010) */
-	struct r300_state_atom ga_point_s0;	/* S Texture Coordinate of Vertex 0 for Point texture stuffing (LLC) (4200) */
-	struct r300_state_atom ga_triangle_stipple;	/* (4214) */
-	struct r300_state_atom ps;	/* pointsize (421C) */
-	struct r300_state_atom ga_point_minmax;	/* (4230) */
-	struct r300_state_atom lcntl;	/* line control */
-	struct r300_state_atom ga_line_stipple;	/* (4260) */
-	struct r300_state_atom shade;
-	struct r300_state_atom polygon_mode;
-	struct r300_state_atom fogp;	/* fog parameters (4294) */
-	struct r300_state_atom ga_soft_reset;	/* (429C) */
-	struct r300_state_atom zbias_cntl;
-	struct r300_state_atom zbs;	/* zbias (42A4) */
-	struct r300_state_atom occlusion_cntl;
-	struct r300_state_atom cul;	/* cull cntl (42B8) */
-	struct r300_state_atom su_depth_scale;	/* (42C0) */
-	struct r300_state_atom rc;	/* rs control (4300) */
-	struct r300_state_atom ri;	/* rs interpolators (4310) */
-	struct r300_state_atom rr;	/* rs route (4330) */
-	struct r300_state_atom sc_hyperz;	/* (43A4) */
-	struct r300_state_atom sc_screendoor;	/* (43E8) */
-	struct r300_state_atom fp;	/* fragment program cntl + nodes (4600) */
-	struct r300_state_atom fpt;	/* texi - (4620) */
-	struct r300_state_atom us_out_fmt;	/* (46A4) */
-	struct r300_state_atom r500fp;	/* r500 fp instructions */
-	struct r300_state_atom r500fp_const;	/* r500 fp constants */
-	struct r300_state_atom fpi[4];	/* fp instructions (46C0/47C0/48C0/49C0) */
-	struct r300_state_atom fogs;	/* fog state (4BC0) */
-	struct r300_state_atom fogc;	/* fog color (4BC8) */
-	struct r300_state_atom at;	/* alpha test (4BD4) */
-	struct r300_state_atom fg_depth_src;	/* (4BD8) */
-	struct r300_state_atom fpp;	/* 0x4C00 and following */
-	struct r300_state_atom rb3d_cctl;	/* (4E00) */
-	struct r300_state_atom bld;	/* blending (4E04) */
-	struct r300_state_atom cmk;	/* colormask (4E0C) */
-	struct r300_state_atom blend_color;	/* constant blend color */
-	struct r300_state_atom rop;	/* ropcntl */
-	struct r300_state_atom cb;	/* colorbuffer (4E28) */
-	struct r300_state_atom rb3d_dither_ctl;	/* (4E50) */
-	struct r300_state_atom rb3d_aaresolve_ctl;	/* (4E88) */
-	struct r300_state_atom rb3d_discard_src_pixel_lte_threshold;	/* (4E88) I saw it only written on RV350 hardware..  */
-	struct r300_state_atom zs;	/* zstencil control (4F00) */
-	struct r300_state_atom zstencil_format;
-	struct r300_state_atom zb;	/* z buffer (4F20) */
-	struct r300_state_atom zb_depthclearvalue;	/* (4F28) */
-	struct r300_state_atom unk4F30;	/* (4F30) */
-	struct r300_state_atom zb_hiz_offset;	/* (4F44) */
-	struct r300_state_atom zb_hiz_pitch;	/* (4F54) */
-
-	struct r300_state_atom vpi;	/* vp instructions */
-	struct r300_state_atom vpp;	/* vp parameters */
-	struct r300_state_atom vps;	/* vertex point size (?) */
-	struct r300_state_atom vpucp[6];	/* vp user clip plane - 6 */
+	struct radeon_state_atom vpt;	/* viewport (1D98) */
+	struct radeon_state_atom vap_cntl;
+	struct radeon_state_atom vap_index_offset; /* 0x208c r5xx only */
+	struct radeon_state_atom vof;	/* VAP output format register 0x2090 */
+	struct radeon_state_atom vte;	/* (20B0) */
+	struct radeon_state_atom vap_vf_max_vtx_indx;	/* Maximum Vertex Indx Clamp (2134) */
+	struct radeon_state_atom vap_cntl_status;
+	struct radeon_state_atom vir[2];	/* vap input route (2150/21E0) */
+	struct radeon_state_atom vic;	/* vap input control (2180) */
+	struct radeon_state_atom vap_psc_sgn_norm_cntl; /* Programmable Stream Control Signed Normalize Control (21DC) */
+	struct radeon_state_atom vap_clip_cntl;
+	struct radeon_state_atom vap_clip;
+	struct radeon_state_atom vap_pvs_vtx_timeout_reg;	/* Vertex timeout register (2288) */
+	struct radeon_state_atom pvs;	/* pvs_cntl (22D0) */
+	struct radeon_state_atom gb_enable;	/* (4008) */
+	struct radeon_state_atom gb_misc;	/* Multisampling position shifts ? (4010) */
+	struct radeon_state_atom gb_misc2;	/* GB select and AA config words, split out of gb_misc */
+	struct radeon_state_atom ga_point_s0;	/* S Texture Coordinate of Vertex 0 for Point texture stuffing (LLC) (4200) */
+	struct radeon_state_atom ga_triangle_stipple;	/* (4214) */
+	struct radeon_state_atom ps;	/* pointsize (421C) */
+	struct radeon_state_atom ga_point_minmax;	/* (4230) */
+	struct radeon_state_atom lcntl;	/* line control */
+	struct radeon_state_atom ga_line_stipple;	/* (4260) */
+	struct radeon_state_atom shade;
+	struct radeon_state_atom shade2;
+	struct radeon_state_atom polygon_mode;
+	struct radeon_state_atom fogp;	/* fog parameters (4294) */
+	struct radeon_state_atom ga_soft_reset;	/* (429C) */
+	struct radeon_state_atom zbias_cntl;
+	struct radeon_state_atom zbs;	/* zbias (42A4) */
+	struct radeon_state_atom occlusion_cntl;
+	struct radeon_state_atom cul;	/* cull cntl (42B8) */
+	struct radeon_state_atom su_depth_scale;	/* (42C0) */
+	struct radeon_state_atom rc;	/* rs control (4300) */
+	struct radeon_state_atom ri;	/* rs interpolators (4310) */
+	struct radeon_state_atom rr;	/* rs route (4330) */
+	struct radeon_state_atom sc_hyperz;	/* (43A4) */
+	struct radeon_state_atom sc_screendoor;	/* (43E8) */
+	struct radeon_state_atom fp;	/* fragment program cntl + nodes (4600) */
+	struct radeon_state_atom fpt;	/* texi - (4620) */
+	struct radeon_state_atom us_out_fmt;	/* (46A4) */
+	struct radeon_state_atom r500fp;	/* r500 fp instructions */
+	struct radeon_state_atom r500fp_const;	/* r500 fp constants */
+	struct radeon_state_atom fpi[4];	/* fp instructions (46C0/47C0/48C0/49C0) */
+	struct radeon_state_atom fogs;	/* fog state (4BC0) */
+	struct radeon_state_atom fogc;	/* fog color (4BC8) */
+	struct radeon_state_atom at;	/* alpha test (4BD4) */
+	struct radeon_state_atom fg_depth_src;	/* (4BD8) */
+	struct radeon_state_atom fpp;	/* 0x4C00 and following */
+	struct radeon_state_atom rb3d_cctl;	/* (4E00) */
+	struct radeon_state_atom bld;	/* blending (4E04) */
+	struct radeon_state_atom cmk;	/* colormask (4E0C) */
+	struct radeon_state_atom blend_color;	/* constant blend color */
+	struct radeon_state_atom rop;	/* ropcntl */
+	struct radeon_state_atom cb;	/* colorbuffer (4E28) */
+	struct radeon_state_atom rb3d_dither_ctl;	/* (4E50) */
+	struct radeon_state_atom rb3d_aaresolve_ctl;	/* (4E88) */
+	struct radeon_state_atom rb3d_discard_src_pixel_lte_threshold;	/* (4E88) I saw it only written on RV350 hardware..  */
+	struct radeon_state_atom zs;	/* zstencil control (4F00) */
+	struct radeon_state_atom zstencil_format;
+	struct radeon_state_atom zb;	/* z buffer (4F20) */
+	struct radeon_state_atom zb_depthclearvalue;	/* (4F28) */
+	struct radeon_state_atom zb_zmask;	/* (4F30) */
+	struct radeon_state_atom zb_hiz_offset;	/* (4F44) */
+	struct radeon_state_atom zb_hiz_pitch;	/* (4F54) */
+
+	struct radeon_state_atom vpi;	/* vp instructions */
+	struct radeon_state_atom vpp;	/* vp parameters */
+	struct radeon_state_atom vps;	/* vertex point size (?) */
+	struct radeon_state_atom vpucp[6];	/* vp user clip plane - 6 */
 	/* 8 texture units */
 	/* the state is grouped by function and not by
 	   texture unit. This makes single unit updates
 	   really awkward - we are much better off
 	   updating the whole thing at once */
 	struct {
-		struct r300_state_atom filter;
-		struct r300_state_atom filter_1;
-		struct r300_state_atom size;
-		struct r300_state_atom format;
-		struct r300_state_atom pitch;
-		struct r300_state_atom offset;
-		struct r300_state_atom chroma_key;
-		struct r300_state_atom border_color;
+		struct radeon_state_atom filter;
+		struct radeon_state_atom filter_1;
+		struct radeon_state_atom size;
+		struct radeon_state_atom format;
+		struct radeon_state_atom pitch;
+		struct radeon_state_atom offset;
+		struct radeon_state_atom chroma_key;
+		struct radeon_state_atom border_color;
 	} tex;
-	struct r300_state_atom txe;	/* tex enable (4104) */
-};
+	struct radeon_state_atom txe;	/* tex enable (4104) */
 
-/**
- * This structure holds the command buffer while it is being constructed.
- *
- * The first batch of commands in the buffer is always the state that needs
- * to be re-emitted when the context is lost. This batch can be skipped
- * otherwise.
- */
-struct r300_cmdbuf {
-	int size;		/* DWORDs allocated for buffer */
-	uint32_t *cmd_buf;
-	int count_used;		/* DWORDs filled so far */
-	int count_reemit;	/* size of re-emission batch */
+	radeonTexObj *textures[R300_MAX_TEXTURE_UNITS];
 };
 
 /**
  * State cache
  */
 
-struct r300_depthbuffer_state {
-	GLfloat scale;
-};
-
-struct r300_stencilbuffer_state {
-	GLboolean hw_stencil;
-};
-
 /* Vertex shader state */
 
 /* Perhaps more if we store programs in vmem? */
@@ -593,73 +399,55 @@ struct r300_stencilbuffer_state {
 #define STATE_R300_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0)
 #define STATE_R300_TEXRECT_FACTOR (STATE_INTERNAL_DRIVER+1)
 
-struct r300_vertex_shader_fragment {
-	int length;
-	union {
-		GLuint d[VSF_MAX_FRAGMENT_LENGTH];
-		float f[VSF_MAX_FRAGMENT_LENGTH];
-		GLuint i[VSF_MAX_FRAGMENT_LENGTH];
-	} body;
-};
-
-struct r300_vertex_shader_state {
-	struct r300_vertex_shader_fragment program;
-};
-
-extern int hw_tcl_on;
-
 #define COLOR_IS_RGBA
 #define TAG(x) r300##x
 #include "tnl_dd/t_dd_vertex.h"
 #undef TAG
 
-//#define CURRENT_VERTEX_SHADER(ctx) (ctx->VertexProgram._Current)
-#define CURRENT_VERTEX_SHADER(ctx) (R300_CONTEXT(ctx)->selected_vp)
-
-/* Should but doesnt work */
-//#define CURRENT_VERTEX_SHADER(ctx) (R300_CONTEXT(ctx)->curr_vp)
-
-/* r300_vertex_shader_state and r300_vertex_program should probably be merged together someday.
- * Keeping them them seperate for now should ensure fixed pipeline keeps functioning properly.
- */
-
-struct r300_vertex_program_key {
-	GLuint InputsRead;
-	GLuint OutputsWritten;
-	GLuint OutputsAdded;
-};
-
 struct r300_vertex_program {
 	struct r300_vertex_program *next;
-	struct r300_vertex_program_key key;
-	int translated;
 
-	struct r300_vertex_shader_fragment program;
+	struct r300_vertex_program_key {
+		GLuint InputsRead;
+		GLuint OutputsWritten;
+		GLuint OutputsAdded;
+	} key;
+	
+	struct r300_vertex_shader_hw_code {
+		int length;
+		union {
+			GLuint d[VSF_MAX_FRAGMENT_LENGTH];
+			float f[VSF_MAX_FRAGMENT_LENGTH];
+		} body;
+	} hw_code;
+
+	GLboolean translated;
+	GLboolean error;
 
 	int pos_end;
 	int num_temporaries;	/* Number of temp vars used by program */
 	int wpos_idx;
 	int inputs[VERT_ATTRIB_MAX];
 	int outputs[VERT_RESULT_MAX];
-	int native;
-	int ref_count;
-	int use_ref_count;
 };
 
 struct r300_vertex_program_cont {
 	struct gl_vertex_program mesa_program;	/* Must be first */
-	struct r300_vertex_shader_fragment params;
 	struct r300_vertex_program *progs;
 };
 
-#define PFS_MAX_ALU_INST	64
-#define PFS_MAX_TEX_INST	64
-#define PFS_MAX_TEX_INDIRECT 4
-#define PFS_NUM_TEMP_REGS	32
-#define PFS_NUM_CONST_REGS	16
+#define R300_PFS_MAX_ALU_INST	64
+#define R300_PFS_MAX_TEX_INST	32
+#define R300_PFS_MAX_TEX_INDIRECT 4
+#define R300_PFS_NUM_TEMP_REGS	32
+#define R300_PFS_NUM_CONST_REGS	32
 
-struct r300_pfs_compile_state;
+#define R500_PFS_MAX_INST 512
+#define R500_PFS_NUM_TEMP_REGS 128
+#define R500_PFS_NUM_CONST_REGS 256
 
+struct r300_pfs_compile_state;
+struct r500_pfs_compile_state;
 
 /**
  * Stores state that influences the compilation of a fragment program.
@@ -702,7 +490,7 @@ struct r300_fragment_program_node {
 struct r300_fragment_program_code {
 	struct {
 		int length; /**< total # of texture instructions used */
-		GLuint inst[PFS_MAX_TEX_INST];
+		GLuint inst[R300_PFS_MAX_TEX_INST];
 	} tex;
 
 	struct {
@@ -712,7 +500,7 @@ struct r300_fragment_program_code {
 			GLuint inst1;
 			GLuint inst2;
 			GLuint inst3;
-		} inst[PFS_MAX_ALU_INST];
+		} inst[R300_PFS_MAX_ALU_INST];
 	} alu;
 
 	struct r300_fragment_program_node node[4];
@@ -723,53 +511,12 @@ struct r300_fragment_program_code {
 	 * Remember which program register a given hardware constant
 	 * belongs to.
 	 */
-	struct prog_src_register constant[PFS_NUM_CONST_REGS];
+	struct prog_src_register constant[R300_PFS_NUM_CONST_REGS];
 	int const_nr;
 
 	int max_temp_idx;
 };
 
-/**
- * Store everything about a fragment program that is needed
- * to render with that program.
- */
-struct r300_fragment_program {
-	struct gl_fragment_program mesa_program;
-
-	GLboolean translated;
-	GLboolean error;
-
-	struct r300_fragment_program_external_state state;
-	struct r300_fragment_program_code code;
-
-	GLboolean WritesDepth;
-	GLuint optimization;
-};
-
-struct r500_pfs_compile_state;
-
-struct r500_fragment_program_external_state {
-	struct {
-		/**
-		 * If the sampler is used as a shadow sampler,
-		 * this field is:
-		 *  0 - GL_LUMINANCE
-		 *  1 - GL_INTENSITY
-		 *  2 - GL_ALPHA
-		 * depending on the depth texture mode.
-		 */
-		GLuint depth_texture_mode : 2;
-
-		/**
-		 * If the sampler is used as a shadow sampler,
-		 * this field is (texture_compare_func - GL_NEVER).
-		 * [e.g. if compare function is GL_LEQUAL, this field is 3]
-		 *
-		 * Otherwise, this field is 0.
-		 */
-		GLuint texture_compare_func : 3;
-	} unit[16];
-};
 
 struct r500_fragment_program_code {
 	struct {
@@ -779,7 +526,7 @@ struct r500_fragment_program_code {
 		GLuint inst3;
 		GLuint inst4;
 		GLuint inst5;
-	} inst[512];
+	} inst[R500_PFS_MAX_INST];
 
 	int inst_offset;
 	int inst_end;
@@ -788,94 +535,46 @@ struct r500_fragment_program_code {
 	 * Remember which program register a given hardware constant
 	 * belongs to.
 	 */
-	struct prog_src_register constant[PFS_NUM_CONST_REGS];
+	struct prog_src_register constant[R500_PFS_NUM_CONST_REGS];
 	int const_nr;
 
 	int max_temp_idx;
 };
 
-struct r500_fragment_program {
-	struct gl_fragment_program mesa_program;
+/**
+* Store everything about a fragment program that is needed
+* to render with that program.
+*/
+struct r300_fragment_program {
+	struct gl_fragment_program Base;
 
-	GLcontext *ctx;
 	GLboolean translated;
 	GLboolean error;
 
-	struct r500_fragment_program_external_state state;
-	struct r500_fragment_program_code code;
+	struct r300_fragment_program_external_state state;
+	union rX00_fragment_program_code {
+		struct r300_fragment_program_code r300;
+		struct r500_fragment_program_code r500;
+	} code;
 
 	GLboolean writes_depth;
-
 	GLuint optimization;
 };
 
-#define R300_MAX_AOS_ARRAYS		16
-
-#define REG_COORDS	0
-#define REG_COLOR0	1
-#define REG_TEX0	2
-
-struct r300_state {
-	struct r300_depthbuffer_state depth;
-	struct r300_texture_state texture;
-	int sw_tcl_inputs[VERT_ATTRIB_MAX];
-	struct r300_vertex_shader_state vertex_shader;
-	struct r300_dma_region aos[R300_MAX_AOS_ARRAYS];
-	int aos_count;
-
-	GLuint *Elts;
-	struct r300_dma_region elt_dma;
-
-	struct r300_dma_region swtcl_dma;
-	DECLARE_RENDERINPUTS(render_inputs_bitset);	/* actual render inputs that R300 was configured for.
-							   They are the same as tnl->render_inputs for fixed pipeline */
-
-	struct r300_stencilbuffer_state stencil;
-
+struct r300_fragment_program_compiler {
+	r300ContextPtr r300;
+	struct r300_fragment_program *fp;
+	union rX00_fragment_program_code *code;
+	struct gl_program *program;
 };
 
-#define R300_FALLBACK_NONE 0
-#define R300_FALLBACK_TCL 1
-#define R300_FALLBACK_RAST 2
+#define R300_MAX_AOS_ARRAYS		16
+
 
 /* r300_swtcl.c
  */
 struct r300_swtcl_info {
-   GLuint RenderIndex;
-
-   /**
-    * Size of a hardware vertex.  This is calculated when \c ::vertex_attrs is
-    * installed in the Mesa state vector.
-    */
-   GLuint vertex_size;
-
-   /**
-    * Attributes instructing the Mesa TCL pipeline where / how to put vertex
-    * data in the hardware buffer.
-    */
-   struct tnl_attr_map vertex_attrs[VERT_ATTRIB_MAX];
-
-   /**
-    * Number of elements of \c ::vertex_attrs that are actually used.
-    */
-   GLuint vertex_attr_count;
-
-   /**
-    * Cached pointer to the buffer where Mesa will store vertex data.
-    */
-   GLubyte *verts;
-
-   /* Fallback rasterization functions
-    */
-  //   r200_point_func draw_point;
-  //   r200_line_func draw_line;
-  //   r200_tri_func draw_tri;
-
-   GLuint hw_primitive;
-   GLenum render_primitive;
-   GLuint numverts;
-
-   /**
+  /*
     * Offset of the 4UB color data within a hardware (swtcl) vertex.
     */
    GLuint coloroffset;
@@ -884,15 +583,44 @@ struct r300_swtcl_info {
     * Offset of the 3UB specular color data within a hardware (swtcl) vertex.
     */
    GLuint specoffset;
+};
 
-   /**
-    * Should Mesa project vertex data or will the hardware do it?
-    */
-   GLboolean needproj;
+struct r300_vtable {
+	void (* SetupRSUnit)(GLcontext *ctx);
+	void (* SetupFragmentShaderTextures)(GLcontext *ctx, int *tmu_mappings);
+	GLboolean (* BuildFragmentProgramHwCode)(struct r300_fragment_program_compiler *compiler);
+	void (* FragmentProgramDump)(union rX00_fragment_program_code *code);
+	void (* SetupPixelShader)(GLcontext *ctx);
+};
 
-   struct r300_dma_region indexed_verts;
+struct r300_vertex_buffer {	/* vertex inputs gathered for the draw being set up */
+	struct vertex_attribute {
+		/* generic */
+		GLubyte element;	/* Mesa attribute index this entry was built from */
+		GLvoid *data;	/* source array as supplied/mapped, or a converted GLfloat copy */
+		GLboolean free_needed;	/* data was allocated by the driver and has to be freed */
+		GLuint stride;	/* bytes between consecutive elements */
+		GLuint dwords;	/* size of one element in dwords, rounded up */
+		GLubyte size; /* number of components */
+
+		/* hw specific */
+		uint32_t data_type:4;	/* one of the R300_DATA_TYPE_* values */
+		uint32_t dst_loc:5;	/* position of this entry within attribs[] */
+		uint32_t _signed:1;
+		uint32_t normalize:1;
+		uint32_t swizzle:12;
+		uint32_t write_mask:4;
+	} attribs[VERT_ATTRIB_MAX];
+
+	GLubyte num_attribs;	/* number of valid entries in attribs[] */
+};
 
+struct r300_index_buffer {	/* index data for the current draw; ptr is NULL for non-indexed draws */
+	GLvoid *ptr;	/* caller's/mapped indices, or a repacked copy */
+	GLboolean is_32bit;	/* GL_TRUE for 32-bit indices, GL_FALSE for 16-bit ones */
+	GLboolean free_needed;	/* ptr was allocated by the driver and has to be freed */
+	GLuint count;	/* number of indices */
+};
 
 /**
  * \brief R300 context structure.
@@ -900,46 +628,33 @@ struct r300_swtcl_info {
 struct r300_context {
 	struct radeon_context radeon;	/* parent class, must be first */
 
+	struct r300_vtable vtbl;
+
 	struct r300_hw_state hw;
-	struct r300_cmdbuf cmdbuf;
-	struct r300_state state;
-	struct gl_vertex_program *curr_vp;
+
 	struct r300_vertex_program *selected_vp;
 
 	/* Vertex buffers
 	 */
-	struct r300_dma dma;
-	GLboolean save_on_next_unlock;
-	GLuint NewGLState;
-
-	/* Texture object bookkeeping
-	 */
-	unsigned nr_heaps;
-	driTexHeap *texture_heaps[RADEON_NR_TEX_HEAPS];
-	driTextureObject swapped;
-	int texture_depth;
-	float initialMaxAnisotropy;
-
-	/* Clientdata textures;
-	 */
-	GLuint prefer_gart_client_texturing;
-
-#ifdef USER_BUFFERS
-	struct r300_memory_manager *rmm;
-#endif
-
 	GLvector4f dummy_attrib[_TNL_ATTRIB_MAX];
 	GLvector4f *temp_attrib[_TNL_ATTRIB_MAX];
 
-	GLboolean disable_lowimpact_fallback;
-
-	DECLARE_RENDERINPUTS(tnl_index_bitset);	/* index of bits for last tnl_install_attrs */
+	struct r300_options {
+		uint32_t conformance_mode:1;
+		uint32_t hw_tcl_enabled:1;
+		uint32_t s3tc_force_enabled:1;
+		uint32_t s3tc_force_disabled:1;
+		uint32_t stencil_two_side_disabled:1;
+	} options;
+	
 	struct r300_swtcl_info swtcl;
-};
+	struct r300_vertex_buffer vbuf;
+	struct r300_index_buffer ind_buf;
+	GLboolean vap_flush_needed;
+
+	uint32_t fallback;
 
-struct r300_buffer_object {
-	struct gl_buffer_object mesa_obj;
-	int id;
+	DECLARE_RENDERINPUTS(render_inputs_bitset);
 };
 
 #define R300_CONTEXT(ctx)		((r300ContextPtr)(ctx->DriverCtx))
@@ -955,9 +670,11 @@ extern int r300VertexProgUpdateParams(GLcontext * ctx,
 				      struct r300_vertex_program_cont *vp,
 				      float *dst);
 
-#define RADEON_D_CAPTURE 0
-#define RADEON_D_PLAYBACK 1
-#define RADEON_D_PLAYBACK_RAW 2
-#define RADEON_D_T 3
+extern void r300InitShaderFunctions(r300ContextPtr r300);
+
+extern void r300InitDraw(GLcontext *ctx);
+
+#define r300PackFloat32 radeonPackFloat32
+#define r300PackFloat24 radeonPackFloat24
 
 #endif				/* __R300_CONTEXT_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_draw.c b/src/mesa/drivers/dri/r300/r300_draw.c
new file mode 100644
index 0000000000..92bb0ee338
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_draw.c
@@ -0,0 +1,484 @@
+/**************************************************************************
+ *
+ * Copyright 2009 Maciej Cencora
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHOR(S) AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <stdlib.h>
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "main/state.h"
+#include "main/api_validate.h"
+#include "main/enums.h"
+
+#include "r300_reg.h"
+#include "r300_context.h"
+#include "r300_emit.h"
+#include "r300_render.h"
+#include "r300_state.h"
+#include "r300_tex.h"
+
+#include "tnl/tnl.h"
+#include "tnl/t_vp_build.h"
+#include "vbo/vbo_context.h"
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+
+static void r300FixupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer *mesa_ind_buf, struct gl_buffer_object **bo, GLuint *nr_bo)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_index_buffer *ind_buf = &r300->ind_buf;
+	GLvoid *src_ptr;
+	/* Describe mesa_ind_buf in r300->ind_buf; a buffer object mapped here is recorded in bo[] / *nr_bo. */
+	if (!mesa_ind_buf) {
+		ind_buf->ptr = NULL;
+		ind_buf->free_needed = GL_FALSE;	/* no allocation is owned without an index buffer */
+		return;
+	}
+	ind_buf->count = mesa_ind_buf->count;
+	if (mesa_ind_buf->obj->Name && !mesa_ind_buf->obj->Pointer) {
+		bo[*nr_bo] = mesa_ind_buf->obj;
+		(*nr_bo)++;
+		ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, GL_READ_ONLY_ARB, mesa_ind_buf->obj);
+		assert(mesa_ind_buf->obj->Pointer != NULL);
+	}
+	src_ptr = ADD_POINTERS(mesa_ind_buf->obj->Pointer, mesa_ind_buf->ptr);
+	/* 8-bit indices are widened to 16 bits and packed two per dword into a temporary copy. */
+	if (mesa_ind_buf->type == GL_UNSIGNED_BYTE) {
+		GLubyte *in = (GLubyte *)src_ptr;
+		GLuint *out = _mesa_malloc(sizeof(GLushort) * ((mesa_ind_buf->count + 1) & ~1));
+		int i;
+
+		ind_buf->ptr = out;
+
+		for (i = 0; i + 1 < mesa_ind_buf->count; i += 2) {
+			*out++ = (GLuint)in[i] | ((GLuint)in[i + 1] << 16);
+		}
+
+		if (i < mesa_ind_buf->count) {
+			*out++ = in[i];
+		}
+
+		ind_buf->free_needed = GL_TRUE;
+		ind_buf->is_32bit = GL_FALSE;
+	} else if (mesa_ind_buf->type == GL_UNSIGNED_SHORT) {
+#if MESA_BIG_ENDIAN
+		GLushort *in = (GLushort *)src_ptr;
+		GLuint *out = _mesa_malloc(sizeof(GLushort) *
+					   ((mesa_ind_buf->count + 1) & ~1));
+		int i;
+		/* Shift in unsigned arithmetic: a promoted GLushort >= 0x8000 shifted by 16 overflows int. */
+		ind_buf->ptr = out;
+
+		for (i = 0; i + 1 < mesa_ind_buf->count; i += 2) {
+			*out++ = (GLuint)in[i] | ((GLuint)in[i + 1] << 16);
+		}
+
+		if (i < mesa_ind_buf->count) {
+			*out++ = in[i];
+		}
+
+		ind_buf->free_needed = GL_TRUE;
+#else
+		ind_buf->ptr = src_ptr;
+		ind_buf->free_needed = GL_FALSE;
+#endif
+		ind_buf->is_32bit = GL_FALSE;
+	} else {
+		ind_buf->ptr = src_ptr;
+		ind_buf->free_needed = GL_FALSE;
+		ind_buf->is_32bit = GL_TRUE;
+	}
+}
+
+static int getTypeSize(GLenum type)
+{
+	int bytes = 0;
+
+	/*
+	 * Size in bytes of one component of the given GL data type.
+	 * An unknown type asserts and yields 0.
+	 */
+	switch (type) {
+	case GL_DOUBLE:         bytes = sizeof(GLdouble); break;
+	case GL_FLOAT:          bytes = sizeof(GLfloat);  break;
+	case GL_INT:            bytes = sizeof(GLint);    break;
+	case GL_UNSIGNED_INT:   bytes = sizeof(GLuint);   break;
+	case GL_SHORT:          bytes = sizeof(GLshort);  break;
+	case GL_UNSIGNED_SHORT: bytes = sizeof(GLushort); break;
+	case GL_BYTE:           bytes = sizeof(GLbyte);   break;
+	case GL_UNSIGNED_BYTE:  bytes = sizeof(GLubyte);  break;
+	default:
+		assert(0);
+		break;
+	}
+
+	return bytes;
+}
+
+#define CONVERT( TYPE, MACRO ) do {		/* expects input, count, stride, src_ptr, dst_ptr in scope */	\
+	GLuint i, j, sz;				\
+	sz = input->Size;				\
+	if (input->Normalized) {			/* normalized data goes through MACRO */	\
+		for (i = 0; i < count; i++) {		\
+			const TYPE *in = (const TYPE *)src_ptr;		\
+			for (j = 0; j < sz; j++) {		\
+				*dst_ptr++ = MACRO(*in);		\
+				in++;				\
+			}					\
+			src_ptr = (const GLubyte *)src_ptr + stride;	/* byte step without void * arithmetic */	\
+		}						\
+	} else {					/* otherwise a plain cast to GLfloat */	\
+		for (i = 0; i < count; i++) {		\
+			const TYPE *in = (const TYPE *)src_ptr;		\
+			for (j = 0; j < sz; j++) {		\
+				*dst_ptr++ = (GLfloat)(*in);		\
+				in++;				\
+			}					\
+			src_ptr = (const GLubyte *)src_ptr + stride;	\
+		}						\
+	}						\
+} while (0)
+
+static void r300TranslateAttrib(GLcontext *ctx, GLuint attr, int count, const struct gl_client_array *input, struct gl_buffer_object **bo, GLuint *nr_bo)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_vertex_buffer *vbuf = &r300->vbuf;
+	struct vertex_attribute r300_attr;
+	const void *src_ptr;
+	GLenum type;
+	GLuint stride;
+	/* Translate one client array into the next free slot of r300->vbuf.attribs; a BO mapped here is recorded in bo[]. */
+	if (input->BufferObj->Name) {
+		if (!input->BufferObj->Pointer) {
+			bo[*nr_bo] = input->BufferObj;
+			(*nr_bo)++;
+			ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
+			assert(input->BufferObj->Pointer != NULL);
+		}
+
+		src_ptr = ADD_POINTERS(input->BufferObj->Pointer, input->Ptr);
+	} else
+		src_ptr = input->Ptr;
+	/* A StrideB of 0 is taken to mean tightly packed elements. */
+	stride = (input->StrideB == 0) ? getTypeSize(input->Type) * input->Size : input->StrideB;
+	/* The cases below are copied into a newly allocated GLfloat array, which is flagged free_needed. */
+	if (input->Type == GL_DOUBLE || input->Type == GL_UNSIGNED_INT || input->Type == GL_INT ||
+#if MESA_BIG_ENDIAN
+	    getTypeSize(input->Type) != 4 ||
+#endif
+	    stride < 4) {
+		if (RADEON_DEBUG & DEBUG_FALLBACKS) {
+			fprintf(stderr, "%s: Converting vertex attributes, attribute data format %x,", __FUNCTION__, input->Type);
+			fprintf(stderr, "stride %d, components %d\n", stride, input->Size);
+		}
+
+		GLfloat *dst_ptr, *tmp;
+		tmp = dst_ptr = _mesa_malloc(sizeof(GLfloat) * input->Size * count);
+
+		switch (input->Type) {
+			case GL_DOUBLE:
+				CONVERT(GLdouble, (GLfloat));
+				break;
+			case GL_UNSIGNED_INT:
+				CONVERT(GLuint, UINT_TO_FLOAT);
+				break;
+			case GL_INT:
+				CONVERT(GLint, INT_TO_FLOAT);
+				break;
+			case GL_UNSIGNED_SHORT:
+				CONVERT(GLushort, USHORT_TO_FLOAT);
+				break;
+			case GL_SHORT:
+				CONVERT(GLshort, SHORT_TO_FLOAT);
+				break;
+			case GL_UNSIGNED_BYTE:
+				assert(input->Format != GL_BGRA);
+				CONVERT(GLubyte, UBYTE_TO_FLOAT);
+				break;
+			case GL_BYTE:
+				CONVERT(GLbyte, BYTE_TO_FLOAT);
+				break;
+			default:
+				assert(0);
+				break;
+		}
+		/* From here on the attribute is described as tightly packed floats. */
+		type = GL_FLOAT;
+		r300_attr.free_needed = GL_TRUE;
+		r300_attr.data = tmp;
+		r300_attr.stride = sizeof(GLfloat) * input->Size;
+		r300_attr.dwords = input->Size;
+	} else {
+		type = input->Type;
+		r300_attr.free_needed = GL_FALSE;
+		r300_attr.data = (GLvoid *)src_ptr;
+		r300_attr.stride = stride;
+		r300_attr.dwords = (getTypeSize(type) * input->Size  + 3)/ 4;	/* element bytes rounded up to whole dwords */
+	}
+
+	r300_attr.size = input->Size;
+	r300_attr.element = attr;
+	r300_attr.dst_loc = vbuf->num_attribs;
+	/* Choose data type, signedness and normalization from the (possibly converted) type. */
+	switch (type) {
+		case GL_FLOAT:
+			switch (input->Size) {
+				case 1: r300_attr.data_type = R300_DATA_TYPE_FLOAT_1; break;
+				case 2: r300_attr.data_type = R300_DATA_TYPE_FLOAT_2; break;
+				case 3: r300_attr.data_type = R300_DATA_TYPE_FLOAT_3; break;
+				case 4: r300_attr.data_type = R300_DATA_TYPE_FLOAT_4; break;
+			}
+			r300_attr._signed = 0;
+			r300_attr.normalize = 0;
+			break;
+		case GL_SHORT:
+			r300_attr._signed = 1;
+			r300_attr.normalize = input->Normalized;
+			switch (input->Size) {
+				case 1:
+				case 2:
+					r300_attr.data_type = R300_DATA_TYPE_SHORT_2;
+					break;
+				case 3:
+				case 4:
+					r300_attr.data_type = R300_DATA_TYPE_SHORT_4;
+					break;
+			}
+			break;
+		case GL_BYTE:
+			r300_attr._signed = 1;
+			r300_attr.normalize = input->Normalized;
+			r300_attr.data_type = R300_DATA_TYPE_BYTE;
+			break;
+		case GL_UNSIGNED_SHORT:
+			r300_attr._signed = 0;
+			r300_attr.normalize = input->Normalized;
+			switch (input->Size) {
+				case 1:
+				case 2:
+					r300_attr.data_type = R300_DATA_TYPE_SHORT_2;
+					break;
+				case 3:
+				case 4:
+					r300_attr.data_type = R300_DATA_TYPE_SHORT_4;
+					break;
+			}
+			break;
+		case GL_UNSIGNED_BYTE:
+			r300_attr._signed = 0;
+			r300_attr.normalize = input->Normalized;
+			if (input->Format == GL_BGRA)
+				r300_attr.data_type = R300_DATA_TYPE_D3DCOLOR;
+			else
+				r300_attr.data_type = R300_DATA_TYPE_BYTE;
+			break;
+
+		default:
+		case GL_DOUBLE:
+		case GL_INT:
+		case GL_UNSIGNED_INT:
+			assert(0);	/* these types are always converted to GL_FLOAT above */
+			break;
+	}
+	/* Components the array does not supply are swizzled to 0, except W which becomes 1. */
+	switch (input->Size) {
+		case 4:
+			r300_attr.swizzle = SWIZZLE_XYZW;
+			break;
+		case 3:
+			r300_attr.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
+			break;
+		case 2:
+			r300_attr.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_ZERO, SWIZZLE_ONE);
+			break;
+		case 1:
+			r300_attr.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ONE);
+			break;
+	}
+
+	r300_attr.write_mask = MASK_XYZW;
+	/* Append the finished description to the vertex buffer. */
+	vbuf->attribs[vbuf->num_attribs] = r300_attr;
+	++vbuf->num_attribs;
+}
+
+static void r300SetVertexFormat(GLcontext *ctx, const struct gl_client_array *arrays[], int count, struct gl_buffer_object **bo, GLuint *nr_bo)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_vertex_buffer *vbuf = &r300->vbuf;
+	/* Translate every array the selected vertex program reads, then emit each one as an AOS vector. */
+	{
+		GLuint i, tmp;
+		/* tmp is unsigned: a signed copy with bit 31 set might never shift down to zero. */
+		tmp = r300->selected_vp->key.InputsRead;
+		i = 0;
+		vbuf->num_attribs = 0;
+		while (tmp) {
+			/* find first enabled bit */
+			while (!(tmp & 1)) {
+				tmp >>= 1;
+				++i;
+			}
+
+			r300TranslateAttrib(ctx, i, count, arrays[i], bo, nr_bo);
+
+			tmp >>= 1;
+			++i;
+		}
+	}
+
+	r300SwitchFallback(ctx, R300_FALLBACK_AOS_LIMIT, vbuf->num_attribs > R300_MAX_AOS_ARRAYS);
+	if (r300->fallback)
+		return;
+	/* Nothing is emitted once any fallback is active; the translated attributes stay in vbuf. */
+	{
+		int i;
+
+		for (i = 0; i < vbuf->num_attribs; i++) {
+			rcommon_emit_vector(ctx, &r300->radeon.tcl.aos[i],
+						vbuf->attribs[i].data, vbuf->attribs[i].dwords,
+						vbuf->attribs[i].stride, count);
+		}
+
+		r300->radeon.tcl.aos_count = vbuf->num_attribs;
+	}
+}
+
+static void r300FreeData(GLcontext *ctx, struct gl_buffer_object **bo, GLuint nr_bo)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct r300_vertex_buffer *vertices = &rmesa->vbuf;
+	struct r300_index_buffer *indices = &rmesa->ind_buf;
+	int n;
+
+	/*
+	 * Release the per-draw temporaries: attribute copies flagged
+	 * free_needed, an index buffer flagged free_needed, and the
+	 * mappings of the nr_bo buffer objects listed in bo[].
+	 */
+	for (n = 0; n < vertices->num_attribs; n++) {
+		if (!vertices->attribs[n].free_needed)
+			continue;
+		_mesa_free(vertices->attribs[n].data);
+	}
+
+	if (indices->free_needed)
+		_mesa_free(indices->ptr);
+
+	/* unmap each buffer object recorded in bo[] */
+	for (n = 0; n < nr_bo; ++n) {
+		ctx->Driver.UnmapBuffer(ctx, 0, bo[n]);
+	}
+}
+
+static GLboolean r300TryDrawPrims(GLcontext *ctx,
+					 const struct gl_client_array *arrays[],
+					 const struct _mesa_prim *prim,
+					 GLuint nr_prims,
+					 const struct _mesa_index_buffer *ib,
+					 GLuint min_index,
+					 GLuint max_index )
+{
+	struct r300_context *r300 = R300_CONTEXT(ctx);
+	struct gl_buffer_object *bo[VERT_ATTRIB_MAX+1];
+	GLuint i, nr_bo = 0;
+	/* Attempt a hardware draw; returns GL_FALSE, with nothing emitted, while a fallback is active. */
+	if (ctx->NewState)
+		_mesa_update_state( ctx );
+
+	if (r300->options.hw_tcl_enabled)
+		_tnl_UpdateFixedFunctionProgram(ctx);
+
+	r300UpdateShaders(r300);
+
+	r300SwitchFallback(ctx, R300_FALLBACK_INVALID_BUFFERS, !r300ValidateBuffers(ctx));
+	/* Both calls below may map buffer objects (tracked in bo[]) and allocate converted copies. */
+	r300FixupIndexBuffer(ctx, ib, bo, &nr_bo);
+	r300SetVertexFormat(ctx, arrays, max_index + 1, bo, &nr_bo);
+
+	if (r300->fallback) {
+		r300FreeData(ctx, bo, nr_bo);	/* do not leak the copies or leave buffers mapped */
+		return GL_FALSE;
+	}
+
+	r300SetupVAP(ctx, r300->selected_vp->key.InputsRead, r300->selected_vp->key.OutputsWritten);
+
+	r300UpdateShaderStates(r300);
+
+	r300EmitCacheFlush(r300);
+	radeonEmitState(&r300->radeon);
+
+	for (i = 0; i < nr_prims; ++i) {
+		r300RunRenderPrimitive(ctx, prim[i].start, prim[i].start + prim[i].count, prim[i].mode);
+	}
+
+	r300EmitCacheFlush(r300);
+
+	radeonReleaseArrays(ctx, ~0);
+	r300FreeData(ctx, bo, nr_bo);
+
+	return GL_TRUE;
+}
+
+/* TODO: rebase if number of indices in any of primitives is > 8192 for 32bit indices or 16384 for 16bit indices */
+
+static void r300DrawPrims(GLcontext *ctx,
+			 const struct gl_client_array *arrays[],
+			 const struct _mesa_prim *prim,
+			 GLuint nr_prims,
+			 const struct _mesa_index_buffer *ib,
+			 GLuint min_index,
+			 GLuint max_index)
+{
+	struct split_limits limits;
+	GLboolean retval;
+	/* Draw entry point: rebases or splits the request when needed, otherwise tries hardware, then tnl. */
+	limits.max_verts = 65535;
+	limits.max_indices = 65535;
+	limits.max_vb_size = 1024*1024;
+	/* Both helpers below call back into this function with the adjusted request. */
+	if (min_index) {
+		vbo_rebase_prims( ctx, arrays, prim, nr_prims, ib, min_index, max_index, r300DrawPrims );
+		return;
+	}
+	if (ib && ib->count > limits.max_indices) {	/* same bound the splitter is given */
+		vbo_split_prims (ctx, arrays, prim, nr_prims, ib, min_index, max_index, r300DrawPrims, &limits);
+		return;
+	}
+
+	/* Make an attempt at drawing */
+	retval = r300TryDrawPrims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+
+	/* If failed run tnl pipeline - it should take care of fallbacks */
+	if (!retval)
+		_tnl_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+}
+
+void r300InitDraw(GLcontext *ctx)
+{
+	/* Install r300DrawPrims as the draw_prims hook
+	 * of this context's vbo_context. */
+	vbo_context(ctx)->draw_prims = r300DrawPrims;
+}
diff --git a/src/mesa/drivers/dri/r300/r300_emit.c b/src/mesa/drivers/dri/r300/r300_emit.c
index 28c3157427..c3817721dc 100644
--- a/src/mesa/drivers/dri/r300/r300_emit.c
+++ b/src/mesa/drivers/dri/r300/r300_emit.c
@@ -31,6 +31,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * \file
  *
  * \author Keith Whitwell <keith@tungstengraphics.com>
+ * \author Maciej Cencora <m.cencora@gmail.com>
  */
 
 #include "main/glheader.h"
@@ -46,222 +47,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/t_context.h"
 
 #include "r300_context.h"
-#include "radeon_ioctl.h"
 #include "r300_state.h"
 #include "r300_emit.h"
 #include "r300_ioctl.h"
-
-#ifdef USER_BUFFERS
-#include "r300_mem.h"
-#endif
-
-#if SWIZZLE_X != R300_INPUT_ROUTE_SELECT_X || \
-    SWIZZLE_Y != R300_INPUT_ROUTE_SELECT_Y || \
-    SWIZZLE_Z != R300_INPUT_ROUTE_SELECT_Z || \
-    SWIZZLE_W != R300_INPUT_ROUTE_SELECT_W || \
-    SWIZZLE_ZERO != R300_INPUT_ROUTE_SELECT_ZERO || \
-    SWIZZLE_ONE != R300_INPUT_ROUTE_SELECT_ONE
-#error Cannot change these!
-#endif
-
-#define DEBUG_ALL DEBUG_VERTS
-
-#if defined(USE_X86_ASM)
-#define COPY_DWORDS( dst, src, nr )					\
-do {									\
-	int __tmp;							\
-	__asm__ __volatile__( "rep ; movsl"				\
-			      : "=%c" (__tmp), "=D" (dst), "=S" (__tmp)	\
-			      : "0" (nr),				\
-			        "D" ((long)dst),			\
-			        "S" ((long)src) );			\
-} while (0)
-#else
-#define COPY_DWORDS( dst, src, nr )		\
-do {						\
-   int j;					\
-   for ( j = 0 ; j < nr ; j++ )			\
-      dst[j] = ((int *)src)[j];			\
-   dst += nr;					\
-} while (0)
-#endif
-
-static void r300EmitVec4(GLcontext * ctx, struct r300_dma_region *rvb,
-			 GLvoid * data, int stride, int count)
-{
-	int i;
-	int *out = (int *)(rvb->address + rvb->start);
-
-	if (RADEON_DEBUG & DEBUG_VERTS)
-		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-			__FUNCTION__, count, stride, (void *)out, (void *)data);
-
-	if (stride == 4)
-		COPY_DWORDS(out, data, count);
-	else
-		for (i = 0; i < count; i++) {
-			out[0] = *(int *)data;
-			out++;
-			data += stride;
-		}
-}
-
-static void r300EmitVec8(GLcontext * ctx, struct r300_dma_region *rvb,
-			 GLvoid * data, int stride, int count)
-{
-	int i;
-	int *out = (int *)(rvb->address + rvb->start);
-
-	if (RADEON_DEBUG & DEBUG_VERTS)
-		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-			__FUNCTION__, count, stride, (void *)out, (void *)data);
-
-	if (stride == 8)
-		COPY_DWORDS(out, data, count * 2);
-	else
-		for (i = 0; i < count; i++) {
-			out[0] = *(int *)data;
-			out[1] = *(int *)(data + 4);
-			out += 2;
-			data += stride;
-		}
-}
-
-static void r300EmitVec12(GLcontext * ctx, struct r300_dma_region *rvb,
-			  GLvoid * data, int stride, int count)
-{
-	int i;
-	int *out = (int *)(rvb->address + rvb->start);
-
-	if (RADEON_DEBUG & DEBUG_VERTS)
-		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-			__FUNCTION__, count, stride, (void *)out, (void *)data);
-
-	if (stride == 12)
-		COPY_DWORDS(out, data, count * 3);
-	else
-		for (i = 0; i < count; i++) {
-			out[0] = *(int *)data;
-			out[1] = *(int *)(data + 4);
-			out[2] = *(int *)(data + 8);
-			out += 3;
-			data += stride;
-		}
-}
-
-static void r300EmitVec16(GLcontext * ctx, struct r300_dma_region *rvb,
-			  GLvoid * data, int stride, int count)
-{
-	int i;
-	int *out = (int *)(rvb->address + rvb->start);
-
-	if (RADEON_DEBUG & DEBUG_VERTS)
-		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-			__FUNCTION__, count, stride, (void *)out, (void *)data);
-
-	if (stride == 16)
-		COPY_DWORDS(out, data, count * 4);
-	else
-		for (i = 0; i < count; i++) {
-			out[0] = *(int *)data;
-			out[1] = *(int *)(data + 4);
-			out[2] = *(int *)(data + 8);
-			out[3] = *(int *)(data + 12);
-			out += 4;
-			data += stride;
-		}
-}
-
-static void r300EmitVec(GLcontext * ctx, struct r300_dma_region *rvb,
-			GLvoid * data, int size, int stride, int count)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-
-	if (stride == 0) {
-		r300AllocDmaRegion(rmesa, rvb, size * 4, 4);
-		count = 1;
-		rvb->aos_offset = GET_START(rvb);
-		rvb->aos_stride = 0;
-	} else {
-		r300AllocDmaRegion(rmesa, rvb, size * count * 4, 4);
-		rvb->aos_offset = GET_START(rvb);
-		rvb->aos_stride = size;
-	}
-
-	switch (size) {
-	case 1:
-		r300EmitVec4(ctx, rvb, data, stride, count);
-		break;
-	case 2:
-		r300EmitVec8(ctx, rvb, data, stride, count);
-		break;
-	case 3:
-		r300EmitVec12(ctx, rvb, data, stride, count);
-		break;
-	case 4:
-		r300EmitVec16(ctx, rvb, data, stride, count);
-		break;
-	default:
-		assert(0);
-		break;
-	}
-}
-
-#define DW_SIZE(x) ((inputs[tab[(x)]] << R300_DST_VEC_LOC_SHIFT) |	\
-		    (attribptr[tab[(x)]]->size - 1) << R300_DATA_TYPE_0_SHIFT)
-
-GLuint r300VAPInputRoute0(uint32_t * dst, GLvector4f ** attribptr,
-				 int *inputs, GLint * tab, GLuint nr)
-{
-	GLuint i, dw;
-
-	/* type, inputs, stop bit, size */
-	for (i = 0; i < nr; i += 2) {
-		/* make sure input is valid, would lockup the gpu */
-		assert(inputs[tab[i]] != -1);
-		dw = (R300_SIGNED | DW_SIZE(i));
-		if (i + 1 == nr) {
-			dw |= R300_LAST_VEC << R300_DATA_TYPE_0_SHIFT;
-		} else {
-			assert(inputs[tab[i + 1]] != -1);
-			dw |= (R300_SIGNED |
-			       DW_SIZE(i + 1)) << R300_DATA_TYPE_1_SHIFT;
-			if (i + 2 == nr) {
-				dw |= R300_LAST_VEC << R300_DATA_TYPE_1_SHIFT;
-			}
-		}
-		dst[i >> 1] = dw;
-	}
-
-	return (nr + 1) >> 1;
-}
-
-static GLuint r300VAPInputRoute1Swizzle(int swizzle[4])
-{
-	return (swizzle[0] << R300_SWIZZLE_SELECT_X_SHIFT) |
-	    (swizzle[1] << R300_SWIZZLE_SELECT_Y_SHIFT) |
-	    (swizzle[2] << R300_SWIZZLE_SELECT_Z_SHIFT) |
-	    (swizzle[3] << R300_SWIZZLE_SELECT_W_SHIFT);
-}
-
-GLuint r300VAPInputRoute1(uint32_t * dst, int swizzle[][4], GLuint nr)
-{
-	GLuint i, dw;
-
-	for (i = 0; i < nr; i += 2) {
-		dw = (r300VAPInputRoute1Swizzle(swizzle[i]) |
-		      ((R300_WRITE_ENA_X | R300_WRITE_ENA_Y |
-			R300_WRITE_ENA_Z | R300_WRITE_ENA_W) << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE0_SHIFT;
-		if (i + 1 < nr) {
-			dw |= (r300VAPInputRoute1Swizzle(swizzle[i + 1]) |
-			       ((R300_WRITE_ENA_X | R300_WRITE_ENA_Y |
-				 R300_WRITE_ENA_Z | R300_WRITE_ENA_W) << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE1_SHIFT;
-		}
-		dst[i >> 1] = dw;
-	}
-
-	return (nr + 1) >> 1;
-}
+#include "r300_render.h"
+#include "r300_swtcl.h"
 
 GLuint r300VAPInputCntl0(GLcontext * ctx, GLuint InputsRead)
 {
@@ -272,7 +62,6 @@ GLuint r300VAPInputCntl0(GLcontext * ctx, GLuint InputsRead)
 
 GLuint r300VAPInputCntl1(GLcontext * ctx, GLuint InputsRead)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	GLuint i, vic_1 = 0;
 
 	if (InputsRead & (1 << VERT_ATTRIB_POS))
@@ -284,281 +73,112 @@ GLuint r300VAPInputCntl1(GLcontext * ctx, GLuint InputsRead)
 	if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
 		vic_1 |= R300_INPUT_CNTL_COLOR;
 
-	rmesa->state.texture.tc_count = 0;
 	for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
 		if (InputsRead & (1 << (VERT_ATTRIB_TEX0 + i))) {
-			rmesa->state.texture.tc_count++;
 			vic_1 |= R300_INPUT_CNTL_TC0 << i;
 		}
 
 	return vic_1;
 }
 
-GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint OutputsWritten)
+GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint vp_writes, GLuint fp_reads) /* builds the R300_VAP_OUTPUT_VTX_FMT_0 "present" bits from VP outputs (VERT_RESULT_* bits) and FP inputs (FRAG_BIT_* bits) */
 {
 	GLuint ret = 0;
 
-	if (OutputsWritten & (1 << VERT_RESULT_HPOS))
+	if (vp_writes & (1 << VERT_RESULT_HPOS))
 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT;
 
-	if (OutputsWritten & (1 << VERT_RESULT_COL0))
+	if (vp_writes & (1 << VERT_RESULT_COL0) && fp_reads & FRAG_BIT_COL0) /* a color is flagged only when the VP writes it and the FP reads it */
 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT;
 
-	if (OutputsWritten & (1 << VERT_RESULT_COL1))
+	if (vp_writes & (1 << VERT_RESULT_COL1) && fp_reads & FRAG_BIT_COL1)
 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT;
 
-	if (OutputsWritten & (1 << VERT_RESULT_BFC0)
-	    || OutputsWritten & (1 << VERT_RESULT_BFC1))
-		ret |=
-		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT |
-		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT |
-		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT;
+	/* Two sided lighting works only if all 4 colors are written, so either back-face color sets all four bits (fp_reads is not consulted here) */
+	if (vp_writes & (1 << VERT_RESULT_BFC0) || vp_writes & (1 << VERT_RESULT_BFC1))
+		ret |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT | R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT |
+			   R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT | R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT;
 
-	if (OutputsWritten & (1 << VERT_RESULT_PSIZ))
+	if (vp_writes & (1 << VERT_RESULT_PSIZ))
 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT;
 
 	return ret;
 }
 
-GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint OutputsWritten)
+GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint vp_writes, GLuint fp_reads)
 {
 	GLuint i, ret = 0, first_free_texcoord = 0;
 
 	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-		if (OutputsWritten & (1 << (VERT_RESULT_TEX0 + i))) {
-			ret |= (4 << (3 * i));
+		if (vp_writes & (1 << (VERT_RESULT_TEX0 + i)) && fp_reads & FRAG_BIT_TEX(i)) {
+			ret |= (4 << (3 * first_free_texcoord));
 			++first_free_texcoord;
 		}
 	}
 
-	if (OutputsWritten & (1 << VERT_RESULT_FOGC)) {
-		if (first_free_texcoord > 8) {
-			fprintf(stderr, "\tout of free texcoords to write fog coord\n");
-			_mesa_exit(-1);
-		}
-		ret |= 4 << (3 * first_free_texcoord);
-	}
-
-	return ret;
-}
-
-/* Emit vertex data to GART memory
- * Route inputs to the vertex processor
- * This function should never return R300_FALLBACK_TCL when using software tcl.
- */
-int r300EmitArrays(GLcontext * ctx)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *vb = &tnl->vb;
-	GLuint nr;
-	GLuint count = vb->Count;
-	GLuint i;
-	GLuint InputsRead = 0, OutputsWritten = 0;
-	int *inputs = NULL;
-	int vir_inputs[VERT_ATTRIB_MAX];
-	GLint tab[VERT_ATTRIB_MAX];
-	int swizzle[VERT_ATTRIB_MAX][4];
-	struct r300_vertex_program *prog =
-	    (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
-
-	if (hw_tcl_on) {
-		inputs = prog->inputs;
-		InputsRead = prog->key.InputsRead;
-		OutputsWritten = prog->key.OutputsWritten;
-	} else {
-		inputs = rmesa->state.sw_tcl_inputs;
-
-		DECLARE_RENDERINPUTS(render_inputs_bitset);
-		RENDERINPUTS_COPY(render_inputs_bitset, tnl->render_inputs_bitset);
-
-		vb->AttribPtr[VERT_ATTRIB_POS] = vb->ClipPtr;
-
-		assert(RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_POS));
-		assert(RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_NORMAL) == 0);
-		//assert(RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_COLOR0));
-
-		if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_POS)) {
-			InputsRead |= 1 << VERT_ATTRIB_POS;
-			OutputsWritten |= 1 << VERT_RESULT_HPOS;
-		}
-
-		if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_COLOR0)) {
-			InputsRead |= 1 << VERT_ATTRIB_COLOR0;
-			OutputsWritten |= 1 << VERT_RESULT_COL0;
-		}
-
-		if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_COLOR1)) {
-			InputsRead |= 1 << VERT_ATTRIB_COLOR1;
-			OutputsWritten |= 1 << VERT_RESULT_COL1;
-		}
-
-		for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-			if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_TEX(i))) {
-				InputsRead |= 1 << (VERT_ATTRIB_TEX0 + i);
-				OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
-			}
-		}
-
-		for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
-			if (InputsRead & (1 << i)) {
-				inputs[i] = nr++;
-			} else {
-				inputs[i] = -1;
-			}
-		}
-
-		/* Fixed, apply to vir0 only */
-		memcpy(vir_inputs, inputs, VERT_ATTRIB_MAX * sizeof(int));
-		inputs = vir_inputs;
-		if (InputsRead & VERT_ATTRIB_POS)
-			inputs[VERT_ATTRIB_POS] = 0;
-		if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
-			inputs[VERT_ATTRIB_COLOR0] = 2;
-		if (InputsRead & (1 << VERT_ATTRIB_COLOR1))
-			inputs[VERT_ATTRIB_COLOR1] = 3;
-		for (i = VERT_ATTRIB_TEX0; i <= VERT_ATTRIB_TEX7; i++)
-			if (InputsRead & (1 << i))
-				inputs[i] = 6 + (i - VERT_ATTRIB_TEX0);
-
-		RENDERINPUTS_COPY(rmesa->state.render_inputs_bitset, render_inputs_bitset);
+	if (fp_reads & FRAG_BIT_WPOS) {
+		ret |= (4 << (3 * first_free_texcoord));
+		++first_free_texcoord;
 	}
 
-	assert(InputsRead);
-	assert(OutputsWritten);
-
-	for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
-		if (InputsRead & (1 << i)) {
-			tab[nr++] = i;
-		}
+	if (vp_writes & (1 << VERT_RESULT_FOGC) && fp_reads & FRAG_BIT_FOGC) {
+		ret |= 4 << (3 * first_free_texcoord);
 	}
 
-	if (nr > R300_MAX_AOS_ARRAYS) {
-		return R300_FALLBACK_TCL;
+	if (first_free_texcoord > 8) {
+		fprintf(stderr, "\tout of free texcoords\n");
+		_mesa_exit(-1);
 	}
 
-	for (i = 0; i < nr; i++) {
-		int ci, fix, found = 0;
-
-		swizzle[i][0] = SWIZZLE_ZERO;
-		swizzle[i][1] = SWIZZLE_ZERO;
-		swizzle[i][2] = SWIZZLE_ZERO;
-		swizzle[i][3] = SWIZZLE_ONE;
+	return ret;
+}
 
-		for (ci = 0; ci < vb->AttribPtr[tab[i]]->size; ci++) {
-			swizzle[i][ci] = ci;
-		}
+GLboolean r300EmitArrays(GLcontext * ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_vertex_buffer *vbuf = &r300->vbuf;
+	GLuint InputsRead, OutputsWritten;
 
-		if (r300IsGartMemory(rmesa, vb->AttribPtr[tab[i]]->data, 4)) {
-			if (vb->AttribPtr[tab[i]]->stride % 4) {
-				return R300_FALLBACK_TCL;
-			}
-			rmesa->state.aos[i].address = (void *)(vb->AttribPtr[tab[i]]->data);
-			rmesa->state.aos[i].start = 0;
-			rmesa->state.aos[i].aos_offset = r300GartOffsetFromVirtual(rmesa, vb->AttribPtr[tab[i]]->data);
-			rmesa->state.aos[i].aos_stride = vb->AttribPtr[tab[i]]->stride / 4;
-			rmesa->state.aos[i].aos_size = vb->AttribPtr[tab[i]]->size;
-		} else {
-			r300EmitVec(ctx, &rmesa->state.aos[i],
-				    vb->AttribPtr[tab[i]]->data,
-				    vb->AttribPtr[tab[i]]->size,
-				    vb->AttribPtr[tab[i]]->stride, count);
-		}
+	r300ChooseSwtclVertexFormat(ctx, &InputsRead, &OutputsWritten);
 
-		rmesa->state.aos[i].aos_size = vb->AttribPtr[tab[i]]->size;
+	r300SwitchFallback(ctx, R300_FALLBACK_AOS_LIMIT, vbuf->num_attribs > R300_MAX_AOS_ARRAYS);
+	if (r300->fallback & R300_RASTER_FALLBACK_MASK)
+		return GL_FALSE;
 
-		for (fix = 0; fix <= 4 - vb->AttribPtr[tab[i]]->size; fix++) {
-			if ((rmesa->state.aos[i].aos_offset - _mesa_sizeof_type(GL_FLOAT) * fix) % 4) {
-				continue;
-			}
-			found = 1;
-			break;
-		}
+	{
+		struct vertex_buffer *mesa_vb = &TNL_CONTEXT(ctx)->vb;
+		GLuint attr, i;
 
-		if (found) {
-			if (fix > 0) {
-				WARN_ONCE("Feeling lucky?\n");
-			}
-			rmesa->state.aos[i].aos_offset -= _mesa_sizeof_type(GL_FLOAT) * fix;
-			for (ci = 0; ci < vb->AttribPtr[tab[i]]->size; ci++) {
-				swizzle[i][ci] += fix;
-			}
-		} else {
-			WARN_ONCE
-			    ("Cannot handle offset %x with stride %d, comp %d\n",
-			     rmesa->state.aos[i].aos_offset,
-			     rmesa->state.aos[i].aos_stride,
-			     vb->AttribPtr[tab[i]]->size);
-			return R300_FALLBACK_TCL;
+		for (i = 0; i < vbuf->num_attribs; i++) {
+			attr = vbuf->attribs[i].element;
+			rcommon_emit_vector(ctx, &r300->radeon.tcl.aos[i], mesa_vb->AttribPtr[attr]->data,
+					mesa_vb->AttribPtr[attr]->size, mesa_vb->AttribPtr[attr]->stride, mesa_vb->Count);
 		}
-	}
 
-	/* Setup INPUT_ROUTE. */
-	R300_STATECHANGE(rmesa, vir[0]);
-	((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count =
-	    r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
-			       vb->AttribPtr, inputs, tab, nr);
-	R300_STATECHANGE(rmesa, vir[1]);
-	((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count =
-	    r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
-			       nr);
-
-	/* Setup INPUT_CNTL. */
-	R300_STATECHANGE(rmesa, vic);
-	rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
-	rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
-
-	/* Setup OUTPUT_VTX_FMT. */
-	R300_STATECHANGE(rmesa, vof);
-	rmesa->hw.vof.cmd[R300_VOF_CNTL_0] =
-	    r300VAPOutputCntl0(ctx, OutputsWritten);
-	rmesa->hw.vof.cmd[R300_VOF_CNTL_1] =
-	    r300VAPOutputCntl1(ctx, OutputsWritten);
-
-	rmesa->state.aos_count = nr;
-
-	return R300_FALLBACK_NONE;
-}
+		r300->radeon.tcl.aos_count = vbuf->num_attribs;
 
-#ifdef USER_BUFFERS
-void r300UseArrays(GLcontext * ctx)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	int i;
-
-	if (rmesa->state.elt_dma.buf)
-		r300_mem_use(rmesa, rmesa->state.elt_dma.buf->id);
-
-	for (i = 0; i < rmesa->state.aos_count; i++) {
-		if (rmesa->state.aos[i].buf)
-			r300_mem_use(rmesa, rmesa->state.aos[i].buf->id);
+		/* Fill index buffer info */
+		r300->ind_buf.ptr = mesa_vb->Elts;
+		r300->ind_buf.is_32bit = GL_TRUE;
+		r300->ind_buf.free_needed = GL_FALSE;
 	}
-}
-#endif
 
-void r300ReleaseArrays(GLcontext * ctx)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	int i;
+	r300SetupVAP(ctx, InputsRead, OutputsWritten);
 
-	r300ReleaseDmaRegion(rmesa, &rmesa->state.elt_dma, __FUNCTION__);
-	for (i = 0; i < rmesa->state.aos_count; i++) {
-		r300ReleaseDmaRegion(rmesa, &rmesa->state.aos[i], __FUNCTION__);
-	}
+	return GL_TRUE;
 }
 
 void r300EmitCacheFlush(r300ContextPtr rmesa)
 {
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-
-	drm_radeon_cmd_header_t *cmd = NULL;
-
-	reg_start(R300_RB3D_DSTCACHE_CTLSTAT, 0);
-	e32(R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS |
-	    R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
-
-	reg_start(R300_ZB_ZCACHE_CTLSTAT, 0);
-	e32(R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
-	    R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
+	BATCH_LOCALS(&rmesa->radeon);	/* writes the flush/free bits to the RB3D dst-cache and ZB z-cache CTLSTAT registers in one batch */
+
+	BEGIN_BATCH_NO_AUTOSTATE(4);	/* 4 dwords reserved; presumably 2 per OUT_BATCH_REGVAL (header + value) - TODO confirm */
+	OUT_BATCH_REGVAL(R300_RB3D_DSTCACHE_CTLSTAT,
+		R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS |
+		R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
+	OUT_BATCH_REGVAL(R300_ZB_ZCACHE_CTLSTAT,
+		R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
+		R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
+	END_BATCH();
+	COMMIT_BATCH();
 }
diff --git a/src/mesa/drivers/dri/r300/r300_emit.h b/src/mesa/drivers/dri/r300/r300_emit.h
index 89d738339f..2fb8b82d3a 100644
--- a/src/mesa/drivers/dri/r300/r300_emit.h
+++ b/src/mesa/drivers/dri/r300/r300_emit.h
@@ -44,28 +44,31 @@
 #include "r300_cmdbuf.h"
 #include "radeon_reg.h"
 
-/* TODO: move these defines (and the ones from DRM) into r300_reg.h and sync up
- * with DRM */
-#define CP_PACKET0(reg, n)	(RADEON_CP_PACKET0 | ((n)<<16) | ((reg)>>2))
-#define CP_PACKET3( pkt, n )						\
-	(RADEON_CP_PACKET3 | (pkt) | ((n) << 16))
-
-static INLINE uint32_t cmdpacket0(int reg, int count)
+/* Packet0 header for `count` registers at `reg`: a raw CP packet with kernel_mm, a drm command header otherwise. */
+static INLINE uint32_t cmdpacket0(struct radeon_screen *rscrn, int reg, int count)
 {
-	drm_r300_cmd_header_t cmd;
-
-	cmd.packet0.cmd_type = R300_CMD_PACKET0;
-	cmd.packet0.count = count;
-	cmd.packet0.reghi = ((unsigned int)reg & 0xFF00) >> 8;
-	cmd.packet0.reglo = ((unsigned int)reg & 0x00FF);
-
-	return cmd.u;
+	if (rscrn->kernel_mm) {
+		/* CP_PACKET0 is given count - 1; a zero count yields CP_PACKET2 instead. */
+		return count ? CP_PACKET0(reg, count - 1) : CP_PACKET2;
+	} else {
+		const unsigned int regaddr = (unsigned int)reg;
+		drm_r300_cmd_header_t cmd;
+
+		cmd.u = 0;
+		cmd.packet0.cmd_type = R300_CMD_PACKET0;
+		cmd.packet0.count = count;
+		cmd.packet0.reghi = (regaddr & 0xFF00) >> 8;
+		cmd.packet0.reglo = regaddr & 0x00FF;
+
+		return cmd.u;
+	}
 }
 
-static INLINE uint32_t cmdvpu(int addr, int count)
+static INLINE uint32_t cmdvpu(struct radeon_screen *rscrn, int addr, int count)
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;
 	cmd.vpu.cmd_type = R300_CMD_VPU;
 	cmd.vpu.count = count;
 	cmd.vpu.adrhi = ((unsigned int)addr & 0xFF00) >> 8;
@@ -74,10 +77,12 @@ static INLINE uint32_t cmdvpu(int addr, int count)
 	return cmd.u;
 }
 
-static INLINE uint32_t cmdr500fp(int addr, int count, int type, int clamp)
+static INLINE uint32_t cmdr500fp(struct radeon_screen *rscrn,
+                                 int addr, int count, int type, int clamp)
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;
 	cmd.r500fp.cmd_type = R300_CMD_R500FP;
 	cmd.r500fp.count = count;
 	cmd.r500fp.adrhi_flags = ((unsigned int)addr & 0x100) >> 8;
@@ -88,181 +93,139 @@ static INLINE uint32_t cmdr500fp(int addr, int count, int type, int clamp)
 	return cmd.u;
 }
 
-static INLINE uint32_t cmdpacket3(int packet)
+static INLINE uint32_t cmdpacket3(struct radeon_screen *rscrn, int packet) /* returns an R300_CMD_PACKET3 header word for `packet`; rscrn is not used in this body */
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;	/* start from an all-zero word so any bits the field writes below do not touch are returned as 0 */
 	cmd.packet3.cmd_type = R300_CMD_PACKET3;
 	cmd.packet3.packet = packet;
 
 	return cmd.u;
 }
 
-static INLINE uint32_t cmdcpdelay(unsigned short count)
+static INLINE uint32_t cmdcpdelay(struct radeon_screen *rscrn,  
+                                  unsigned short count) /* returns an R300_CMD_CP_DELAY header word carrying `count`; rscrn is not used in this body */
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;	/* start from an all-zero word so any bits the field writes below do not touch are returned as 0 */
+
 	cmd.delay.cmd_type = R300_CMD_CP_DELAY;
 	cmd.delay.count = count;
 
 	return cmd.u;
 }
 
-static INLINE uint32_t cmdwait(unsigned char flags)
+static INLINE uint32_t cmdwait(struct radeon_screen *rscrn,
+                               unsigned char flags) /* returns an R300_CMD_WAIT header word carrying `flags`; rscrn is not used in this body */
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;	/* start from an all-zero word so any bits the field writes below do not touch are returned as 0 */
 	cmd.wait.cmd_type = R300_CMD_WAIT;
 	cmd.wait.flags = flags;
 
 	return cmd.u;
 }
 
-static INLINE uint32_t cmdpacify(void)
+static INLINE uint32_t cmdpacify(struct radeon_screen *rscrn) /* returns an R300_CMD_END3D header word; rscrn is not used in this body */
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;	/* start from an all-zero word so any bits the field write below does not touch are returned as 0 */
 	cmd.header.cmd_type = R300_CMD_END3D;
 
 	return cmd.u;
 }
 
 /**
- * Prepare to write a register value to register at address reg.
- * If num_extra > 0 then the following extra values are written
- * to registers with address +4, +8 and so on..
- */
-#define reg_start(reg, num_extra)					\
-	do {								\
-		int _n;							\
-		_n=(num_extra);						\
-		cmd = (drm_radeon_cmd_header_t*)			\
-			r300AllocCmdBuf(rmesa,				\
-					(_n+2),				\
-					__FUNCTION__);			\
-		cmd_reserved=_n+2;					\
-		cmd_written=1;						\
-		cmd[0].i=cmdpacket0((reg), _n+1);			\
-	} while (0);
-
-/**
- * Emit GLuint freestyle
+ * Write the header of a packet3 to the command buffer.
+ * Outputs 2 dwords (with kernel_mm only the CP_PACKET3 dword is written and section_cdw is incremented instead) and expects (num_extra+1) additional dwords afterwards.
  */
-#define e32(dword)							\
-	do {								\
-		if(cmd_written<cmd_reserved) {				\
-			cmd[cmd_written].i=(dword);			\
-			cmd_written++;					\
-		} else {						\
-			fprintf(stderr,					\
-				"e32 but no previous packet "		\
-				"declaration.\n"			\
-				"Aborting! in %s::%s at line %d, "	\
-				"cmd_written=%d cmd_reserved=%d\n",	\
-				__FILE__, __FUNCTION__, __LINE__,	\
-				cmd_written, cmd_reserved);		\
-			_mesa_exit(-1);					\
-		}							\
+#define OUT_BATCH_PACKET3(packet, num_extra) do { /* uses b_l_rmesa from the enclosing scope (presumably declared by BATCH_LOCALS - confirm) */ \
+    if (!b_l_rmesa->radeonScreen->kernel_mm) {		\
+    	OUT_BATCH(cmdpacket3(b_l_rmesa->radeonScreen,\
+                  R300_CMD_PACKET3_RAW)); \
+    } else b_l_rmesa->cmdbuf.cs->section_cdw++; /* kernel_mm: no cmdpacket3 wrapper dword is written; section_cdw is bumped by one instead */ \
+	OUT_BATCH(CP_PACKET3((packet), (num_extra))); \
 	} while(0)
 
-#define	efloat(f) e32(r300PackFloat32(f))
-
-#define vsf_start_fragment(dest, length)				\
-	do {								\
-		int _n;							\
-		_n = (length);						\
-		cmd = (drm_radeon_cmd_header_t*)			\
-			r300AllocCmdBuf(rmesa,				\
-					(_n+1),				\
-					__FUNCTION__);			\
-		cmd_reserved = _n+2;					\
-		cmd_written =1;						\
-		cmd[0].i = cmdvpu((dest), _n/4);			\
-	} while (0);
-
-#define r500fp_start_fragment(dest, length)				\
-	do {								\
-		int _n;							\
-		_n = (length);						\
-		cmd = (drm_radeon_cmd_header_t*)			\
-			r300AllocCmdBuf(rmesa,				\
-					(_n+1),				\
-					__FUNCTION__);			\
-		cmd_reserved = _n+1;					\
-		cmd_written =1;						\
-		cmd[0].i = cmdr500fp((dest), _n/6, 0, 0);		\
-	} while (0);
-
-#define start_packet3(packet, count)					\
-	{								\
-		int _n;							\
-		GLuint _p;						\
-		_n = (count);						\
-		_p = (packet);						\
-		cmd = (drm_radeon_cmd_header_t*)			\
-			r300AllocCmdBuf(rmesa,				\
-					(_n+3),				\
-					__FUNCTION__);			\
-		cmd_reserved = _n+3;					\
-		cmd_written = 2;					\
-		if(_n > 0x3fff) {					\
-			fprintf(stderr,"Too big packet3 %08x: cannot "	\
-				"store %d dwords\n",			\
-				_p, _n);				\
-			_mesa_exit(-1);					\
-		}							\
-		cmd[0].i = cmdpacket3(R300_CMD_PACKET3_RAW);		\
-		cmd[1].i = _p | ((_n & 0x3fff)<<16);			\
-	}
-
 /**
  * Must be sent to switch to 2d commands
  */
-void static INLINE end_3d(r300ContextPtr rmesa)
+void static INLINE end_3d(radeonContextPtr radeon)
 {
-	drm_radeon_cmd_header_t *cmd = NULL;
+	BATCH_LOCALS(radeon);
 
-	cmd =
-	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
-	cmd[0].header.cmd_type = R300_CMD_END3D;
+	if (!radeon->radeonScreen->kernel_mm) {	/* nothing is emitted when kernel_mm is set */
+		BEGIN_BATCH_NO_AUTOSTATE(1);
+		OUT_BATCH(cmdpacify(radeon->radeonScreen));	/* one dword: the R300_CMD_END3D header built by cmdpacify */
+		END_BATCH();
+	}
 }
 
 void static INLINE cp_delay(r300ContextPtr rmesa, unsigned short count)
 {
-	drm_radeon_cmd_header_t *cmd = NULL;
+	BATCH_LOCALS(&rmesa->radeon);
 
-	cmd =
-	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
-	cmd[0].i = cmdcpdelay(count);
+	if (!rmesa->radeon.radeonScreen->kernel_mm) {	/* nothing is emitted when kernel_mm is set */
+		BEGIN_BATCH_NO_AUTOSTATE(1);
+		OUT_BATCH(cmdcpdelay(rmesa->radeon.radeonScreen, count));	/* one dword: the R300_CMD_CP_DELAY header carrying count */
+		END_BATCH();
+	}
 }
 
-void static INLINE cp_wait(r300ContextPtr rmesa, unsigned char flags)
+void static INLINE cp_wait(radeonContextPtr radeon, unsigned char flags)
 {
-	drm_radeon_cmd_header_t *cmd = NULL;
-
-	cmd =
-	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
-	cmd[0].i = cmdwait(flags);
+	BATCH_LOCALS(radeon);
+	uint32_t wait_until;	/* value written to RADEON_WAIT_UNTIL in the kernel_mm path */
+
+	if (!radeon->radeonScreen->kernel_mm) {	/* without kernel_mm: emit a single R300_CMD_WAIT header carrying flags */
+		BEGIN_BATCH_NO_AUTOSTATE(1);
+		OUT_BATCH(cmdwait(radeon->radeonScreen, flags));
+		END_BATCH();
+	} else {
+		switch(flags) {	/* kernel_mm: translate flags into a raw bitmask; NOTE(review): bits 14-18 are presumably the WAIT_UNTIL idle / idle-clean bits - confirm against the register reference */
+		case R300_WAIT_2D:
+			wait_until = (1 << 14);
+			break;
+		case R300_WAIT_3D:
+			wait_until = (1 << 15);
+			break;
+		case R300_NEW_WAIT_2D_3D:
+			wait_until = (1 << 14) | (1 << 15);
+			break;
+		case R300_NEW_WAIT_2D_2D_CLEAN:
+			wait_until = (1 << 14) | (1 << 16) | (1 << 18);
+			break;
+		case R300_NEW_WAIT_3D_3D_CLEAN:
+			wait_until = (1 << 15) | (1 << 17) | (1 << 18);
+			break;
+		case R300_NEW_WAIT_2D_2D_CLEAN_3D_3D_CLEAN:
+			wait_until  = (1 << 14) | (1 << 16) | (1 << 18);	/* union of the 2D_2D_CLEAN and 3D_3D_CLEAN masks above */
+			wait_until |= (1 << 15) | (1 << 17) | (1 << 18);
+			break;
+		default:
+			return;	/* unrecognized flags: emit nothing */
+		}
+		BEGIN_BATCH_NO_AUTOSTATE(2);
+		OUT_BATCH(CP_PACKET0(RADEON_WAIT_UNTIL, 0));	/* count field 0, i.e. one register (cmdpacket0 passes count - 1) */
+		OUT_BATCH(wait_until);
+		END_BATCH();
+	}
 }
 
-extern int r300EmitArrays(GLcontext * ctx);
-
-#ifdef USER_BUFFERS
-void r300UseArrays(GLcontext * ctx);
-#endif
+extern GLboolean r300EmitArrays(GLcontext * ctx);
 
-extern void r300ReleaseArrays(GLcontext * ctx);
 extern int r300PrimitiveType(r300ContextPtr rmesa, int prim);
 extern int r300NumVerts(r300ContextPtr rmesa, int num_verts, int prim);
 
 extern void r300EmitCacheFlush(r300ContextPtr rmesa);
 
-extern GLuint r300VAPInputRoute0(uint32_t * dst, GLvector4f ** attribptr,
-				 int *inputs, GLint * tab, GLuint nr);
-extern GLuint r300VAPInputRoute1(uint32_t * dst, int swizzle[][4], GLuint nr);
 extern GLuint r300VAPInputCntl0(GLcontext * ctx, GLuint InputsRead);
 extern GLuint r300VAPInputCntl1(GLcontext * ctx, GLuint InputsRead);
-extern GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint OutputsWritten);
-extern GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint OutputsWritten);
+extern GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint vp_writes, GLuint fp_reads);
+extern GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint vp_writes, GLuint fp_reads);
 
 #endif
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index 873cde4414..55c1cfe631 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -25,32 +25,12 @@
  *
  */
 
-/**
- * \file
- *
- * Fragment program compiler. Perform transformations on the intermediate
- * representation until the program is in a form where we can translate
- * it more or less directly into machine-readable form.
- *
- * \author Ben Skeggs <darktama@iinet.net.au>
- * \author Jerome Glisse <j.glisse@gmail.com>
- */
+#include "r300_fragprog.h"
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/prog_instruction.h"
 #include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
 
 #include "r300_context.h"
-#include "r300_fragprog.h"
 #include "r300_fragprog_swizzle.h"
-#include "r300_state.h"
-
-#include "radeon_nqssadce.h"
-#include "radeon_program_alu.h"
-
 
 static void reset_srcreg(struct prog_src_register* reg)
 {
@@ -81,7 +61,7 @@ static struct prog_src_register shadow_ambient(struct gl_program *program, int t
  * \todo If/when r5xx uses the radeon_program architecture, this can probably
  * be reused.
  */
-static GLboolean transform_TEX(
+GLboolean r300_transform_TEX(
 	struct radeon_transform_context *t,
 	struct prog_instruction* orig_inst, void* data)
 {
@@ -160,6 +140,8 @@ static GLboolean transform_TEX(
 			inst.DstReg.Index = tempreg;
 			inst.DstReg.WriteMask = WRITEMASK_XYZW;
 			destredirect = GL_TRUE;
+		} else if (inst.SaturateMode) {
+			destredirect = GL_TRUE;
 		}
 	}
 
@@ -175,7 +157,7 @@ static GLboolean transform_TEX(
 		inst.SrcReg[0].File = PROGRAM_TEMPORARY;
 		inst.SrcReg[0].Index = tmpreg;
 	}
-	
+
 	tgt = radeonAppendInstructions(t->Program, 1);
 	_mesa_copy_instructions(tgt, &inst, 1);
 
@@ -239,6 +221,7 @@ static GLboolean transform_TEX(
 
 		tgt->Opcode = OPCODE_MOV;
 		tgt->DstReg = orig_inst->DstReg;
+		tgt->SaturateMode = inst.SaturateMode;
 		tgt->SrcReg[0].File = PROGRAM_TEMPORARY;
 		tgt->SrcReg[0].Index = inst.DstReg.Index;
 	}
@@ -246,241 +229,10 @@ static GLboolean transform_TEX(
 	return GL_TRUE;
 }
 
-
-static void update_params(r300ContextPtr r300, struct r300_fragment_program *fp)
-{
-	struct gl_fragment_program *mp = &fp->mesa_program;
-
-	/* Ask Mesa nicely to fill in ParameterValues for us */
-	if (mp->Base.Parameters)
-		_mesa_load_state_parameters(r300->radeon.glCtx, mp->Base.Parameters);
-}
-
-
-/**
- * Transform the program to support fragment.position.
- *
- * Introduce a small fragment at the start of the program that will be
- * the only code that directly reads the FRAG_ATTRIB_WPOS input.
- * All other code pieces that reference that input will be rewritten
- * to read from a newly allocated temporary.
- *
- * \todo if/when r5xx supports the radeon_program architecture, this is a
- * likely candidate for code sharing.
- */
-static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler)
-{
-	GLuint InputsRead = compiler->fp->mesa_program.Base.InputsRead;
-
-	if (!(InputsRead & FRAG_BIT_WPOS))
-		return;
-
-	static gl_state_index tokens[STATE_LENGTH] = {
-		STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
-	};
-	struct prog_instruction *fpi;
-	GLuint window_index;
-	int i = 0;
-	GLuint tempregi = _mesa_find_free_register(compiler->program, PROGRAM_TEMPORARY);
-
-	_mesa_insert_instructions(compiler->program, 0, 3);
-	fpi = compiler->program->Instructions;
-
-	/* perspective divide */
-	fpi[i].Opcode = OPCODE_RCP;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_W;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	fpi[i].Opcode = OPCODE_MUL;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-
-	fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[1].Index = tempregi;
-	fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	/* viewport transformation */
-	window_index = _mesa_add_state_reference(compiler->program->Parameters, tokens);
-
-	fpi[i].Opcode = OPCODE_MAD;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[0].Index = tempregi;
-	fpi[i].SrcReg[0].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[1].Index = window_index;
-	fpi[i].SrcReg[1].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[2].Index = window_index;
-	fpi[i].SrcReg[2].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-	i++;
-
-	for (; i < compiler->program->NumInstructions; ++i) {
-		int reg;
-		for (reg = 0; reg < 3; reg++) {
-			if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT &&
-			    fpi[i].SrcReg[reg].Index == FRAG_ATTRIB_WPOS) {
-				fpi[i].SrcReg[reg].File = PROGRAM_TEMPORARY;
-				fpi[i].SrcReg[reg].Index = tempregi;
-			}
-		}
-	}
-}
-
-
-static void nqssadce_init(struct nqssadce_state* s)
-{
-	s->Outputs[FRAG_RESULT_COLOR].Sourced = WRITEMASK_XYZW;
-	s->Outputs[FRAG_RESULT_DEPTH].Sourced = WRITEMASK_W;
-}
-
-
-static GLuint build_dtm(GLuint depthmode)
-{
-	switch(depthmode) {
-	default:
-	case GL_LUMINANCE: return 0;
-	case GL_INTENSITY: return 1;
-	case GL_ALPHA: return 2;
-	}
-}
-
-static GLuint build_func(GLuint comparefunc)
-{
-	return comparefunc - GL_NEVER;
-}
-
-
-/**
- * Collect all external state that is relevant for compiling the given
- * fragment program.
- */
-static void build_state(
-	r300ContextPtr r300,
-	struct r300_fragment_program *fp,
-	struct r300_fragment_program_external_state *state)
-{
-	int unit;
-
-	_mesa_bzero(state, sizeof(*state));
-
-	for(unit = 0; unit < 16; ++unit) {
-		if (fp->mesa_program.Base.ShadowSamplers & (1 << unit)) {
-			struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
-
-			state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
-			state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
-		}
-	}
-}
-
-
-void r300TranslateFragmentShader(r300ContextPtr r300,
-				 struct r300_fragment_program *fp)
-{
-	struct r300_fragment_program_external_state state;
-
-	build_state(r300, fp, &state);
-	if (_mesa_memcmp(&fp->state, &state, sizeof(state))) {
-		/* TODO: cache compiled programs */
-		fp->translated = GL_FALSE;
-		_mesa_memcpy(&fp->state, &state, sizeof(state));
-	}
-
-	if (!fp->translated) {
-		struct r300_fragment_program_compiler compiler;
-
-		compiler.r300 = r300;
-		compiler.fp = fp;
-		compiler.code = &fp->code;
-		compiler.program = _mesa_clone_program(r300->radeon.glCtx, &fp->mesa_program.Base);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Fragment Program: Initial program:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		insert_WPOS_trailer(&compiler);
-
-		struct radeon_program_transformation transformations[] = {
-			{ &transform_TEX, &compiler },
-			{ &radeonTransformALU, 0 },
-			{ &radeonTransformTrigSimple, 0 }
-		};
-		radeonLocalTransform(
-			r300->radeon.glCtx,
-			compiler.program,
-			3, transformations);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Fragment Program: After native rewrite:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		struct radeon_nqssadce_descr nqssadce = {
-			.Init = &nqssadce_init,
-			.IsNativeSwizzle = &r300FPIsNativeSwizzle,
-			.BuildSwizzle = &r300FPBuildSwizzle,
-			.RewriteDepthOut = GL_TRUE
-		};
-		radeonNqssaDce(r300->radeon.glCtx, compiler.program, &nqssadce);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Compiler: after NqSSA-DCE:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		if (!r300FragmentProgramEmit(&compiler))
-			fp->error = GL_TRUE;
-
-		/* Subtle: Rescue any parameters that have been added during transformations */
-		_mesa_free_parameter_list(fp->mesa_program.Base.Parameters);
-		fp->mesa_program.Base.Parameters = compiler.program->Parameters;
-		compiler.program->Parameters = 0;
-
-		_mesa_reference_program(r300->radeon.glCtx, &compiler.program, NULL);
-
-		if (!fp->error)
-			fp->translated = GL_TRUE;
-		if (fp->error || (RADEON_DEBUG & DEBUG_PIXEL))
-			r300FragmentProgramDump(fp, &fp->code);
-		r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
-	}
-
-	update_params(r300, fp);
-}
-
 /* just some random things... */
-void r300FragmentProgramDump(
-	struct r300_fragment_program *fp,
-	struct r300_fragment_program_code *code)
+void r300FragmentProgramDump(union rX00_fragment_program_code *c)
 {
+	struct r300_fragment_program_code *code = &c->r300;
 	int n, i, j;
 	static int pc = 0;
 
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.h b/src/mesa/drivers/dri/r300/r300_fragprog.h
index 94fb554fb3..5ce6f33cee 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.h
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.h
@@ -33,9 +33,6 @@
 #ifndef __R300_FRAGPROG_H_
 #define __R300_FRAGPROG_H_
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
 #include "shader/program.h"
 #include "shader/prog_instruction.h"
 
@@ -105,28 +102,10 @@
 
 #endif
 
-struct r300_fragment_program;
+extern GLboolean r300BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler);
 
-extern void r300TranslateFragmentShader(r300ContextPtr r300,
-					struct r300_fragment_program *fp);
+extern void r300FragmentProgramDump(union rX00_fragment_program_code *c);
 
-
-/**
- * Used internally by the r300 fragment program code to store compile-time
- * only data.
- */
-struct r300_fragment_program_compiler {
-	r300ContextPtr r300;
-	struct r300_fragment_program *fp;
-	struct r300_fragment_program_code *code;
-	struct gl_program *program;
-};
-
-extern GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler);
-
-
-extern void r300FragmentProgramDump(
-	struct r300_fragment_program *fp,
-	struct r300_fragment_program_code *code);
+extern GLboolean r300_transform_TEX(struct radeon_transform_context *t, struct prog_instruction* orig_inst, void* data);
 
 #endif
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_common.c b/src/mesa/drivers/dri/r300/r300_fragprog_common.c
new file mode 100644
index 0000000000..abc8757ba1
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_common.c
@@ -0,0 +1,291 @@
+/*
+ * Copyright (C) 2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ *
+ * Fragment program compiler. Perform transformations on the intermediate
+ * representation until the program is in a form where we can translate
+ * it more or less directly into machine-readable form.
+ *
+ * \author Ben Skeggs <darktama@iinet.net.au>
+ * \author Jerome Glisse <j.glisse@gmail.com>
+ */
+
+#include "r300_fragprog_common.h"
+
+#include "shader/program.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+
+#include "r300_state.h"
+#include "r300_fragprog.h"
+#include "r300_fragprog_swizzle.h"
+#include "r500_fragprog.h"
+
+#include "radeon_program.h"
+#include "radeon_program_alu.h"
+
+static void update_params(GLcontext *ctx, struct gl_fragment_program *fp)
+{
+	if (!fp->Base.Parameters)
+		return; /* no parameter list, so nothing for Mesa to fill in */
+	_mesa_load_state_parameters(ctx, fp->Base.Parameters);
+}
+
+static void nqssadce_init(struct nqssadce_state* state)
+{
+	state->Outputs[FRAG_RESULT_DEPTH].Sourced = WRITEMASK_W;    /* depth: W only */
+	state->Outputs[FRAG_RESULT_COLOR].Sourced = WRITEMASK_XYZW; /* color: all four */
+}
+
+/**
+ * Rewrite the program so that fragment.position can be used.
+ *
+ * Three instructions are inserted at the very start of the program; they
+ * are the only ones that read the FRAG_ATTRIB_WPOS input directly, and they
+ * leave their result in a newly allocated temporary. Every other source
+ * operand that referenced FRAG_ATTRIB_WPOS is redirected to that temporary.
+ * A program that never reads the input is left untouched.
+ */
+static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler)
+{
+	static gl_state_index tokens[STATE_LENGTH] = {
+		STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
+	};
+	struct prog_instruction *insts, *rcp, *mul, *mad;
+	GLuint wpos_temp, window_param;
+	int n, src;
+
+	if (!(compiler->fp->Base.Base.InputsRead & FRAG_BIT_WPOS))
+		return;
+
+	wpos_temp = _mesa_find_free_register(compiler->program, PROGRAM_TEMPORARY);
+	_mesa_insert_instructions(compiler->program, 0, 3);
+	insts = compiler->program->Instructions;
+	rcp = &insts[0];
+	mul = &insts[1];
+	mad = &insts[2];
+
+	/* perspective divide, part 1: temp.w = 1 / wpos.w */
+	rcp->Opcode = OPCODE_RCP;
+
+	rcp->DstReg.File = PROGRAM_TEMPORARY;
+	rcp->DstReg.Index = wpos_temp;
+	rcp->DstReg.WriteMask = WRITEMASK_W;
+	rcp->DstReg.CondMask = COND_TR;
+
+	rcp->SrcReg[0].File = PROGRAM_INPUT;
+	rcp->SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+	rcp->SrcReg[0].Swizzle = SWIZZLE_WWWW;
+
+	/* perspective divide, part 2: temp.xyz = wpos.xyz * temp.w */
+	mul->Opcode = OPCODE_MUL;
+
+	mul->DstReg.File = PROGRAM_TEMPORARY;
+	mul->DstReg.Index = wpos_temp;
+	mul->DstReg.WriteMask = WRITEMASK_XYZ;
+	mul->DstReg.CondMask = COND_TR;
+
+	mul->SrcReg[0].File = PROGRAM_INPUT;
+	mul->SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+	mul->SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+	mul->SrcReg[1].File = PROGRAM_TEMPORARY;
+	mul->SrcReg[1].Index = wpos_temp;
+	mul->SrcReg[1].Swizzle = SWIZZLE_WWWW;
+
+	/* viewport transformation: temp.xyz = temp * window + window */
+	window_param = _mesa_add_state_reference(compiler->program->Parameters, tokens);
+
+	mad->Opcode = OPCODE_MAD;
+
+	mad->DstReg.File = PROGRAM_TEMPORARY;
+	mad->DstReg.Index = wpos_temp;
+	mad->DstReg.WriteMask = WRITEMASK_XYZ;
+	mad->DstReg.CondMask = COND_TR;
+
+	mad->SrcReg[0].File = PROGRAM_TEMPORARY;
+	mad->SrcReg[0].Index = wpos_temp;
+	mad->SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+	mad->SrcReg[1].File = PROGRAM_STATE_VAR;
+	mad->SrcReg[1].Index = window_param;
+	mad->SrcReg[1].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+	mad->SrcReg[2].File = PROGRAM_STATE_VAR;
+	mad->SrcReg[2].Index = window_param;
+	mad->SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+	/* redirect every later read of the input to the temporary */
+	for (n = 3; n < compiler->program->NumInstructions; ++n) {
+		for (src = 0; src < 3; src++) {
+			struct prog_src_register *operand = &insts[n].SrcReg[src];
+			if (operand->File != PROGRAM_INPUT ||
+			    operand->Index != FRAG_ATTRIB_WPOS)
+				continue;
+			operand->File = PROGRAM_TEMPORARY;
+			operand->Index = wpos_temp;
+		}
+	}
+}
+
+/* Encode a depth texture mode: GL_INTENSITY is 1, GL_ALPHA is 2, anything else is 0. */
+static GLuint build_dtm(GLuint depthmode)
+{
+	if (depthmode == GL_INTENSITY)
+		return 1;
+	if (depthmode == GL_ALPHA)
+		return 2;
+	return 0;
+}
+
+static GLuint build_func(GLuint comparefunc)
+{
+	return comparefunc - GL_NEVER; /* store the compare func as its offset from GL_NEVER */
+}
+
+/**
+ * Gather the state outside the program that affects how it is compiled:
+ * for each shadow sampler unit, the depth texture mode and compare function.
+ */
+static void build_state(
+	r300ContextPtr r300,
+	struct r300_fragment_program *fp,
+	struct r300_fragment_program_external_state *state)
+{
+	int unit;
+
+	_mesa_bzero(state, sizeof(*state));
+
+	for(unit = 0; unit < 16; ++unit) {
+		struct gl_texture_object* tex;
+		if (!(fp->Base.Base.ShadowSamplers & (1 << unit)))
+			continue;
+		tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
+		state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
+		state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
+	}
+}
+
+void r300TranslateFragmentShader(GLcontext *ctx, struct gl_fragment_program *fp)
+{
+	/* Compile fp unless it is already translated for the current state, then reload its parameters. */
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_fragment_program *r300_fp = (struct r300_fragment_program *)fp;
+	const GLboolean is_r500 = r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515;
+	struct r300_fragment_program_external_state state;
+	struct r300_fragment_program_compiler compiler;
+
+	build_state(r300, r300_fp, &state);
+	if (_mesa_memcmp(&r300_fp->state, &state, sizeof(state))) {
+		/* TODO: cache compiled programs */
+		r300_fp->translated = GL_FALSE;
+		_mesa_memcpy(&r300_fp->state, &state, sizeof(state));
+	}
+	/* still translated for this state: only the parameter values need refreshing */
+	if (r300_fp->translated) {
+		update_params(ctx, fp);
+		return;
+	}
+
+	compiler.r300 = r300;
+	compiler.fp = r300_fp;
+	compiler.code = &r300_fp->code;
+	compiler.program = _mesa_clone_program(ctx, &fp->Base);
+
+	if (RADEON_DEBUG & DEBUG_PIXEL) {
+		fflush(stdout);
+		_mesa_printf("Fragment Program: Initial program:\n");
+		_mesa_print_program(compiler.program);
+		fflush(stdout);
+	}
+
+	insert_WPOS_trailer(&compiler);
+	/* native rewrite, using the transformation list of the chip family */
+	if (is_r500) {
+		struct radeon_program_transformation r500_passes[] = {
+			{ &r500_transform_TEX, &compiler },
+			{ &radeonTransformALU, 0 },
+			{ &radeonTransformDeriv, 0 },
+			{ &radeonTransformTrigScale, 0 }
+		};
+		radeonLocalTransform(ctx, compiler.program, 4, r500_passes);
+	} else {
+		struct radeon_program_transformation r300_passes[] = {
+			{ &r300_transform_TEX, &compiler },
+			{ &radeonTransformALU, 0 },
+			{ &radeonTransformTrigSimple, 0 }
+		};
+		radeonLocalTransform(ctx, compiler.program, 3, r300_passes);
+	}
+
+	if (RADEON_DEBUG & DEBUG_PIXEL) {
+		_mesa_printf("Fragment Program: After native rewrite:\n");
+		_mesa_print_program(compiler.program);
+		fflush(stdout);
+	}
+	/* NqSSA-DCE pass, set up with the swizzle callbacks of the chip family */
+	if (is_r500) {
+		struct radeon_nqssadce_descr r500_dce = {
+			.Init = &nqssadce_init,
+			.IsNativeSwizzle = &r500FPIsNativeSwizzle,
+			.BuildSwizzle = &r500FPBuildSwizzle,
+			.RewriteDepthOut = GL_TRUE
+		};
+		radeonNqssaDce(ctx, compiler.program, &r500_dce);
+	} else {
+		struct radeon_nqssadce_descr r300_dce = {
+			.Init = &nqssadce_init,
+			.IsNativeSwizzle = &r300FPIsNativeSwizzle,
+			.BuildSwizzle = &r300FPBuildSwizzle,
+			.RewriteDepthOut = GL_TRUE
+		};
+		radeonNqssaDce(ctx, compiler.program, &r300_dce);
+	}
+
+	if (RADEON_DEBUG & DEBUG_PIXEL) {
+		_mesa_printf("Compiler: after NqSSA-DCE:\n");
+		_mesa_print_program(compiler.program);
+		fflush(stdout);
+	}
+	/* the hook installed in the context vtbl builds the hardware code */
+	if (!r300->vtbl.BuildFragmentProgramHwCode(&compiler))
+		r300_fp->error = GL_TRUE;
+
+	/* Subtle: Rescue any parameters that have been added during transformations */
+	_mesa_free_parameter_list(fp->Base.Parameters);
+	fp->Base.Parameters = compiler.program->Parameters;
+	compiler.program->Parameters = 0;
+	_mesa_reference_program(ctx, &compiler.program, NULL);
+	r300_fp->translated = GL_TRUE;
+	r300UpdateStateParameters(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+
+	if (r300_fp->error || (RADEON_DEBUG & DEBUG_PIXEL))
+		r300->vtbl.FragmentProgramDump(&r300_fp->code);
+	update_params(ctx, fp);
+}
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_common.h b/src/mesa/drivers/dri/r300/r300_fragprog_common.h
new file mode 100644
index 0000000000..85ea86fecb
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_common.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __R300_FRAGPROG_COMMON_H_
+#define __R300_FRAGPROG_COMMON_H_
+
+#include "main/mtypes.h"
+
+extern void r300TranslateFragmentShader(GLcontext *ctx, struct gl_fragment_program *fp);
+
+#endif
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c
index 9f0b7e3534..b75656e7ee 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c
@@ -47,7 +47,7 @@
 
 #define PROG_CODE \
 	struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)data; \
-	struct r300_fragment_program_code *code = c->code
+	struct r300_fragment_program_code *code = &c->code->r300
 
 #define error(fmt, args...) do {			\
 		fprintf(stderr, "%s::%s(): " fmt "\n",	\
@@ -66,7 +66,7 @@ static GLboolean emit_const(void* data, GLuint file, GLuint index, GLuint *hwind
 	}
 
 	if (*hwindex >= code->const_nr) {
-		if (*hwindex >= PFS_NUM_CONST_REGS) {
+		if (*hwindex >= R300_PFS_NUM_CONST_REGS) {
 			error("Out of hw constants!\n");
 			return GL_FALSE;
 		}
@@ -138,7 +138,7 @@ static GLboolean emit_alu(void* data, struct radeon_pair_instruction* inst)
 {
 	PROG_CODE;
 
-	if (code->alu.length >= PFS_MAX_ALU_INST) {
+	if (code->alu.length >= R300_PFS_MAX_ALU_INST) {
 		error("Too many ALU instructions");
 		return GL_FALSE;
 	}
@@ -201,7 +201,7 @@ static GLboolean emit_alu(void* data, struct radeon_pair_instruction* inst)
 	if (inst->Alpha.DepthWriteMask) {
 		code->alu.inst[ip].inst3 |= R300_ALU_DSTA_DEPTH;
 		code->node[code->cur_node].flags |= R300_W_OUT;
-		c->fp->WritesDepth = GL_TRUE;
+		c->fp->writes_depth = GL_TRUE;
 	}
 
 	return GL_TRUE;
@@ -213,7 +213,7 @@ static GLboolean emit_alu(void* data, struct radeon_pair_instruction* inst)
  */
 static GLboolean finish_node(struct r300_fragment_program_compiler *c)
 {
-	struct r300_fragment_program_code *code = c->code;
+	struct r300_fragment_program_code *code = &c->code->r300;
 	struct r300_fragment_program_node *node = &code->node[code->cur_node];
 
 	if (node->alu_end < 0) {
@@ -275,7 +275,7 @@ static GLboolean emit_tex(void* data, struct prog_instruction* inst)
 {
 	PROG_CODE;
 
-	if (code->tex.length >= PFS_MAX_TEX_INST) {
+	if (code->tex.length >= R300_PFS_MAX_TEX_INST) {
 		error("Too many TEX instructions");
 		return GL_FALSE;
 	}
@@ -318,16 +318,16 @@ static const struct radeon_pair_handler pair_handler = {
 	.EmitPaired = &emit_alu,
 	.EmitTex = &emit_tex,
 	.BeginTexBlock = &begin_tex,
-	.MaxHwTemps = PFS_NUM_TEMP_REGS
+	.MaxHwTemps = R300_PFS_NUM_TEMP_REGS
 };
 
 /**
  * Final compilation step: Turn the intermediate radeon_program into
  * machine-readable instructions.
  */
-GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler)
+GLboolean r300BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler)
 {
-	struct r300_fragment_program_code *code = compiler->code;
+	struct r300_fragment_program_code *code = &compiler->code->r300;
 
 	_mesa_bzero(code, sizeof(struct r300_fragment_program_code));
 	code->node[0].alu_end = -1;
diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.c b/src/mesa/drivers/dri/r300/r300_ioctl.c
index ee85e229f0..104079b4db 100644
--- a/src/mesa/drivers/dri/r300/r300_ioctl.c
+++ b/src/mesa/drivers/dri/r300/r300_ioctl.c
@@ -46,8 +46,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/context.h"
 #include "swrast/swrast.h"
 
+#include "radeon_common.h"
+#include "radeon_lock.h"
 #include "r300_context.h"
-#include "radeon_ioctl.h"
 #include "r300_ioctl.h"
 #include "r300_cmdbuf.h"
 #include "r300_state.h"
@@ -55,71 +56,90 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_reg.h"
 #include "r300_emit.h"
 #include "r300_fragprog.h"
+#include "r300_context.h"
 
 #include "vblank.h"
 
+#define R200_3D_DRAW_IMMD_2      0xC0003500
+
 #define CLEARBUFFER_COLOR	0x1
 #define CLEARBUFFER_DEPTH	0x2
 #define CLEARBUFFER_STENCIL	0x4
 
-static void r300ClearBuffer(r300ContextPtr r300, int flags, int buffer)
+static void r300EmitClearState(GLcontext * ctx);
+
+static void r300UserClear(GLcontext *ctx, GLuint mask)
 {
+	radeon_clear_tris(ctx, mask); /* the whole clear is delegated to radeon_clear_tris() */
+}
+
+static void r300ClearBuffer(r300ContextPtr r300, int flags,
+			    struct radeon_renderbuffer *rrb,
+			    struct radeon_renderbuffer *rrbd)
+{
+	BATCH_LOCALS(&r300->radeon);
 	GLcontext *ctx = r300->radeon.glCtx;
-	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
-	GLuint cboffset, cbpitch;
-	drm_r300_cmd_header_t *cmd2;
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
+	GLuint cbpitch = 0;
 	r300ContextPtr rmesa = r300;
 
 	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s: %s buffer (%i,%i %ix%i)\n",
-			__FUNCTION__, buffer ? "back" : "front",
-			dPriv->x, dPriv->y, dPriv->w, dPriv->h);
-
-	if (buffer) {
-		cboffset = r300->radeon.radeonScreen->backOffset;
-		cbpitch = r300->radeon.radeonScreen->backPitch;
-	} else {
-		cboffset = r300->radeon.radeonScreen->frontOffset;
-		cbpitch = r300->radeon.radeonScreen->frontPitch;
+		fprintf(stderr, "%s: buffer %p (%i,%i %ix%i)\n",
+			__FUNCTION__, rrb, dPriv->x, dPriv->y,
+			dPriv->w, dPriv->h);
+
+	if (rrb) {
+		cbpitch = (rrb->pitch / rrb->cpp);
+		if (rrb->cpp == 4)
+			cbpitch |= R300_COLOR_FORMAT_ARGB8888;
+		else
+			cbpitch |= R300_COLOR_FORMAT_RGB565;
+
+		if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE){
+			cbpitch |= R300_COLOR_TILE_ENABLE;
+        }
 	}
 
-	cboffset += r300->radeon.radeonScreen->fbLocation;
-
-	cp_wait(r300, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
-	end_3d(rmesa);
-
-	R300_STATECHANGE(r300, cb);
-	reg_start(R300_RB3D_COLOROFFSET0, 0);
-	e32(cboffset);
-
-	if (r300->radeon.radeonScreen->cpp == 4)
-		cbpitch |= R300_COLOR_FORMAT_ARGB8888;
-	else
-		cbpitch |= R300_COLOR_FORMAT_RGB565;
-
-	if (r300->radeon.sarea->tiling_enabled)
-		cbpitch |= R300_COLOR_TILE_ENABLE;
-
-	reg_start(R300_RB3D_COLORPITCH0, 0);
-	e32(cbpitch);
-
-	R300_STATECHANGE(r300, cmk);
-	reg_start(RB3D_COLOR_CHANNEL_MASK, 0);
+	/* TODO in bufmgr */
+	cp_wait(&r300->radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+	end_3d(&rmesa->radeon);
 
 	if (flags & CLEARBUFFER_COLOR) {
-		e32((ctx->Color.ColorMask[BCOMP] ? RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0 : 0) |
-		    (ctx->Color.ColorMask[GCOMP] ? RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0 : 0) |
-		    (ctx->Color.ColorMask[RCOMP] ? RB3D_COLOR_CHANNEL_MASK_RED_MASK0 : 0) |
-		    (ctx->Color.ColorMask[ACOMP] ? RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0 : 0));
+		assert(rrb != 0);
+		BEGIN_BATCH_NO_AUTOSTATE(6);
+		OUT_BATCH_REGSEQ(R300_RB3D_COLOROFFSET0, 1);
+		OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+		OUT_BATCH_REGVAL(R300_RB3D_COLORPITCH0, cbpitch);
+		END_BATCH();
+	}
+#if 1
+	if (flags & (CLEARBUFFER_DEPTH | CLEARBUFFER_STENCIL)) {
+		assert(rrbd != 0);
+		cbpitch = (rrbd->pitch / rrbd->cpp);
+		if (rrbd->bo->flags & RADEON_BO_FLAGS_MACRO_TILE){
+			cbpitch |= R300_DEPTHMACROTILE_ENABLE;
+        }
+		if (rrbd->bo->flags & RADEON_BO_FLAGS_MICRO_TILE){
+            cbpitch |= R300_DEPTHMICROTILE_TILED;
+        }
+		BEGIN_BATCH_NO_AUTOSTATE(6);
+		OUT_BATCH_REGSEQ(R300_ZB_DEPTHOFFSET, 1);
+		OUT_BATCH_RELOC(0, rrbd->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+		OUT_BATCH_REGVAL(R300_ZB_DEPTHPITCH, cbpitch);
+		END_BATCH();
+	}
+#endif
+	BEGIN_BATCH_NO_AUTOSTATE(6);
+	OUT_BATCH_REGSEQ(RB3D_COLOR_CHANNEL_MASK, 1);
+	if (flags & CLEARBUFFER_COLOR) {
+		OUT_BATCH((ctx->Color.ColorMask[BCOMP] ? RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0 : 0) |
+			  (ctx->Color.ColorMask[GCOMP] ? RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0 : 0) |
+			  (ctx->Color.ColorMask[RCOMP] ? RB3D_COLOR_CHANNEL_MASK_RED_MASK0 : 0) |
+			  (ctx->Color.ColorMask[ACOMP] ? RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0 : 0));
 	} else {
-		e32(0x0);
+		OUT_BATCH(0);
 	}
 
-	R300_STATECHANGE(r300, zs);
-	reg_start(R300_ZB_CNTL, 2);
 
 	{
 		uint32_t t1, t2;
@@ -146,73 +166,92 @@ static void r300ClearBuffer(r300ContextPtr r300, int flags, int buffer)
 			     R300_S_FRONT_ZFAIL_OP_SHIFT);
 		}
 
-		e32(t1);
-		e32(t2);
-		e32(((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) << R300_STENCILWRITEMASK_SHIFT) |
-		    (ctx->Stencil.Clear & R300_STENCILREF_MASK));
+		OUT_BATCH_REGSEQ(R300_ZB_CNTL, 3);
+		OUT_BATCH(t1);
+		OUT_BATCH(t2);
+		OUT_BATCH(((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) <<
+                   R300_STENCILWRITEMASK_SHIFT) |
+			  (ctx->Stencil.Clear & R300_STENCILREF_MASK));
+		END_BATCH();
 	}
 
-	cmd2 = (drm_r300_cmd_header_t *) r300AllocCmdBuf(r300, 9, __FUNCTION__);
-	cmd2[0].packet3.cmd_type = R300_CMD_PACKET3;
-	cmd2[0].packet3.packet = R300_CMD_PACKET3_CLEAR;
-	cmd2[1].u = r300PackFloat32(dPriv->w / 2.0);
-	cmd2[2].u = r300PackFloat32(dPriv->h / 2.0);
-	cmd2[3].u = r300PackFloat32(ctx->Depth.Clear);
-	cmd2[4].u = r300PackFloat32(1.0);
-	cmd2[5].u = r300PackFloat32(ctx->Color.ClearColor[0]);
-	cmd2[6].u = r300PackFloat32(ctx->Color.ClearColor[1]);
-	cmd2[7].u = r300PackFloat32(ctx->Color.ClearColor[2]);
-	cmd2[8].u = r300PackFloat32(ctx->Color.ClearColor[3]);
+	if (!rmesa->radeon.radeonScreen->kernel_mm) {
+		BEGIN_BATCH_NO_AUTOSTATE(9);
+		OUT_BATCH(cmdpacket3(r300->radeon.radeonScreen, R300_CMD_PACKET3_CLEAR));
+		OUT_BATCH_FLOAT32(dPriv->w / 2.0);
+		OUT_BATCH_FLOAT32(dPriv->h / 2.0);
+		OUT_BATCH_FLOAT32(ctx->Depth.Clear);
+		OUT_BATCH_FLOAT32(1.0);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[0]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[1]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[2]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[3]);
+		END_BATCH();
+	} else {
+		OUT_BATCH(CP_PACKET3(R200_3D_DRAW_IMMD_2, 8));
+		OUT_BATCH(R300_PRIM_TYPE_POINT | R300_PRIM_WALK_RING |
+			  (1 << R300_PRIM_NUM_VERTICES_SHIFT));
+		OUT_BATCH_FLOAT32(dPriv->w / 2.0);
+		OUT_BATCH_FLOAT32(dPriv->h / 2.0);
+		OUT_BATCH_FLOAT32(ctx->Depth.Clear);
+		OUT_BATCH_FLOAT32(1.0);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[0]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[1]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[2]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[3]);
+	}
 
 	r300EmitCacheFlush(rmesa);
-	cp_wait(rmesa, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+	cp_wait(&r300->radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+
+	R300_STATECHANGE(r300, cb);
+	R300_STATECHANGE(r300, cmk);
+	R300_STATECHANGE(r300, zs);
 }
 
 static void r300EmitClearState(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	r300ContextPtr rmesa = r300;
-	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+	BATCH_LOCALS(&r300->radeon);
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
 	int i;
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
-	int has_tcl = 1;
+	int has_tcl;
 	int is_r500 = 0;
 	GLuint vap_cntl;
 
-	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
-		has_tcl = 0;
-
-        if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-                is_r500 = 1;
+	has_tcl = r300->options.hw_tcl_enabled;
 
+	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+		is_r500 = 1;
 
-	/* FIXME: the values written to R300_VAP_INPUT_ROUTE_0_0 and
-	 * R300_VAP_INPUT_ROUTE_0_1 are in fact known, however, the values are
-	 * quite complex; see the functions in r300_emit.c.
+	/* State atom dirty tracking is a little subtle here.
+	 *
+	 * On the one hand, we need to make sure base state is emitted
+	 * here if we start with an empty batch buffer, otherwise clear
+	 * works incorrectly with multiple processes. Therefore, the first
+	 * BEGIN_BATCH cannot be a BEGIN_BATCH_NO_AUTOSTATE.
 	 *
-	 * I believe it would be a good idea to extend the functions in
-	 * r300_emit.c so that they can be used to setup the default values for
-	 * these registers, as well as the actual values used for rendering.
+	 * On the other hand, implicit state emission clears the state atom
+	 * dirty bits, so we have to call R300_STATECHANGE later than the
+	 * first BEGIN_BATCH.
+	 *
+	 * The final trickiness is that, because we change state, we need
+	 * to ensure that any stored swtcl primitives are flushed properly
+	 * before we start changing state. See the R300_NEWPRIM in r300Clear
+	 * for this.
 	 */
-	R300_STATECHANGE(r300, vir[0]);
-	reg_start(R300_VAP_PROG_STREAM_CNTL_0, 0);
+	BEGIN_BATCH(31);
+	OUT_BATCH_REGSEQ(R300_VAP_PROG_STREAM_CNTL_0, 1);
 	if (!has_tcl)
-	    e32(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
+		OUT_BATCH(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
 		 ((R300_LAST_VEC | (2 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)));
 	else
-	    e32(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
+		OUT_BATCH(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
 		 ((R300_LAST_VEC | (1 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)));
 
-	/* disable fog */
-	R300_STATECHANGE(r300, fogs);
-	reg_start(R300_FG_FOG_BLEND, 0);
-	e32(0x0);
-
-	R300_STATECHANGE(r300, vir[1]);
-	reg_start(R300_VAP_PROG_STREAM_CNTL_EXT_0, 0);
-	e32(((((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) |
+	OUT_BATCH_REGVAL(R300_FG_FOG_BLEND, 0);
+	OUT_BATCH_REGVAL(R300_VAP_PROG_STREAM_CNTL_EXT_0,
+	   ((((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) |
 	       (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) |
 	       (R300_SWIZZLE_SELECT_Z << R300_SWIZZLE_SELECT_Z_SHIFT) |
 	       (R300_SWIZZLE_SELECT_W << R300_SWIZZLE_SELECT_W_SHIFT) |
@@ -226,619 +265,402 @@ static void r300EmitClearState(GLcontext * ctx)
 	      << R300_SWIZZLE1_SHIFT)));
 
 	/* R300_VAP_INPUT_CNTL_0, R300_VAP_INPUT_CNTL_1 */
-	R300_STATECHANGE(r300, vic);
-	reg_start(R300_VAP_VTX_STATE_CNTL, 1);
-	e32((R300_SEL_USER_COLOR_0 << R300_COLOR_0_ASSEMBLY_SHIFT));
-	e32(R300_INPUT_CNTL_POS | R300_INPUT_CNTL_COLOR | R300_INPUT_CNTL_TC0);
+	OUT_BATCH_REGSEQ(R300_VAP_VTX_STATE_CNTL, 2);
+	OUT_BATCH((R300_SEL_USER_COLOR_0 << R300_COLOR_0_ASSEMBLY_SHIFT));
+	OUT_BATCH(R300_INPUT_CNTL_POS | R300_INPUT_CNTL_COLOR | R300_INPUT_CNTL_TC0);
 
-	R300_STATECHANGE(r300, vte);
 	/* comes from fglrx startup of clear */
-	reg_start(R300_SE_VTE_CNTL, 1);
-	e32(R300_VTX_W0_FMT | R300_VPORT_X_SCALE_ENA |
-	    R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA |
-	    R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA |
-	    R300_VPORT_Z_OFFSET_ENA);
-	e32(0x8);
+	OUT_BATCH_REGSEQ(R300_SE_VTE_CNTL, 2);
+	OUT_BATCH(R300_VTX_W0_FMT | R300_VPORT_X_SCALE_ENA |
+		  R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA |
+		  R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA |
+		  R300_VPORT_Z_OFFSET_ENA);
+	OUT_BATCH(0x8);
 
-	reg_start(R300_VAP_PSC_SGN_NORM_CNTL, 0);
-	e32(0xaaaaaaaa);
+	OUT_BATCH_REGVAL(R300_VAP_PSC_SGN_NORM_CNTL, 0xaaaaaaaa);
 
-	R300_STATECHANGE(r300, vof);
-	reg_start(R300_VAP_OUTPUT_VTX_FMT_0, 1);
-	e32(R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT |
-	    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT);
-	e32(0x0);		/* no textures */
+	OUT_BATCH_REGSEQ(R300_VAP_OUTPUT_VTX_FMT_0, 2);
+	OUT_BATCH(R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT |
+		  R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT);
+	OUT_BATCH(0); /* no textures */
 
-	R300_STATECHANGE(r300, txe);
-	reg_start(R300_TX_ENABLE, 0);
-	e32(0x0);
+	OUT_BATCH_REGVAL(R300_TX_ENABLE, 0);
 
-	R300_STATECHANGE(r300, vpt);
-	reg_start(R300_SE_VPORT_XSCALE, 5);
-	efloat(1.0);
-	efloat(dPriv->x);
-	efloat(1.0);
-	efloat(dPriv->y);
-	efloat(1.0);
-	efloat(0.0);
+	OUT_BATCH_REGSEQ(R300_SE_VPORT_XSCALE, 6);
+	OUT_BATCH_FLOAT32(1.0);
+	OUT_BATCH_FLOAT32(dPriv->x);
+	OUT_BATCH_FLOAT32(1.0);
+	OUT_BATCH_FLOAT32(dPriv->y);
+	OUT_BATCH_FLOAT32(1.0);
+	OUT_BATCH_FLOAT32(0.0);
 
-	R300_STATECHANGE(r300, at);
-	reg_start(R300_FG_ALPHA_FUNC, 0);
-	e32(0x0);
+	OUT_BATCH_REGVAL(R300_FG_ALPHA_FUNC, 0);
+
+	OUT_BATCH_REGSEQ(R300_RB3D_CBLEND, 2);
+	OUT_BATCH(0x0);
+	OUT_BATCH(0x0);
+	END_BATCH();
 
+	R300_STATECHANGE(r300, vir[0]);
+	R300_STATECHANGE(r300, fogs);
+	R300_STATECHANGE(r300, vir[1]);
+	R300_STATECHANGE(r300, vic);
+	R300_STATECHANGE(r300, vte);
+	R300_STATECHANGE(r300, vof);
+	R300_STATECHANGE(r300, txe);
+	R300_STATECHANGE(r300, vpt);
+	R300_STATECHANGE(r300, at);
 	R300_STATECHANGE(r300, bld);
-	reg_start(R300_RB3D_CBLEND, 1);
-	e32(0x0);
-	e32(0x0);
+	R300_STATECHANGE(r300, ps);
 
 	if (has_tcl) {
-	    R300_STATECHANGE(r300, vap_clip_cntl);
-	    reg_start(R300_VAP_CLIP_CNTL, 0);
-	    e32(R300_PS_UCP_MODE_CLIP_AS_TRIFAN | R300_CLIP_DISABLE);
+		R300_STATECHANGE(r300, vap_clip_cntl);
+
+		BEGIN_BATCH_NO_AUTOSTATE(2);
+		OUT_BATCH_REGVAL(R300_VAP_CLIP_CNTL, R300_PS_UCP_MODE_CLIP_AS_TRIFAN | R300_CLIP_DISABLE);
+		END_BATCH();
         }
 
-	R300_STATECHANGE(r300, ps);
-	reg_start(R300_GA_POINT_SIZE, 0);
-	e32(((dPriv->w * 6) << R300_POINTSIZE_X_SHIFT) |
-	    ((dPriv->h * 6) << R300_POINTSIZE_Y_SHIFT));
+	BEGIN_BATCH_NO_AUTOSTATE(2);
+	OUT_BATCH_REGVAL(R300_GA_POINT_SIZE,
+		((dPriv->w * 6) << R300_POINTSIZE_X_SHIFT) |
+		((dPriv->h * 6) << R300_POINTSIZE_Y_SHIFT));
+	END_BATCH();
 
 	if (!is_r500) {
 		R300_STATECHANGE(r300, ri);
-		reg_start(R300_RS_IP_0, 7);
-		for (i = 0; i < 8; ++i) {
-			e32(R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3));
-		}
-
 		R300_STATECHANGE(r300, rc);
-		/* The second constant is needed to get glxgears display anything .. */
-		reg_start(R300_RS_COUNT, 1);
-		e32((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
-		e32(0x0);
-
 		R300_STATECHANGE(r300, rr);
-		reg_start(R300_RS_INST_0, 0);
-		e32(R300_RS_INST_COL_CN_WRITE);
+
+		BEGIN_BATCH(14);
+		OUT_BATCH_REGSEQ(R300_RS_IP_0, 8);
+		for (i = 0; i < 8; ++i)
+			OUT_BATCH(R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3));
+
+		OUT_BATCH_REGSEQ(R300_RS_COUNT, 2);
+		OUT_BATCH((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
+		OUT_BATCH(0x0);
+
+		OUT_BATCH_REGVAL(R300_RS_INST_0, R300_RS_INST_COL_CN_WRITE);
+		END_BATCH();
 	} else {
 		R300_STATECHANGE(r300, ri);
-		reg_start(R500_RS_IP_0, 7);
+		R300_STATECHANGE(r300, rc);
+		R300_STATECHANGE(r300, rr);
+
+		BEGIN_BATCH(14);
+		OUT_BATCH_REGSEQ(R500_RS_IP_0, 8);
 		for (i = 0; i < 8; ++i) {
-			e32((R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
-			    (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
-			    (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
-			    (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT));
+			OUT_BATCH((R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
+				  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
+				  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
+				  (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT));
 		}
 
-		R300_STATECHANGE(r300, rc);
-		/* The second constant is needed to get glxgears display anything .. */
-		reg_start(R300_RS_COUNT, 1);
-		e32((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
-		e32(0x0);
-
-		R300_STATECHANGE(r300, rr);
-		reg_start(R500_RS_INST_0, 0);
-		e32(R500_RS_INST_COL_CN_WRITE);
+		OUT_BATCH_REGSEQ(R300_RS_COUNT, 2);
+		OUT_BATCH((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
+		OUT_BATCH(0x0);
 
+		OUT_BATCH_REGVAL(R500_RS_INST_0, R500_RS_INST_COL_CN_WRITE);
+		END_BATCH();
 	}
 
 	if (!is_r500) {
 		R300_STATECHANGE(r300, fp);
-		reg_start(R300_US_CONFIG, 2);
-		e32(0x0);
-		e32(0x0);
-		e32(0x0);
-		reg_start(R300_US_CODE_ADDR_0, 3);
-		e32(0x0);
-		e32(0x0);
-		e32(0x0);
-		e32(R300_RGBA_OUT);
-
 		R300_STATECHANGE(r300, fpi[0]);
 		R300_STATECHANGE(r300, fpi[1]);
 		R300_STATECHANGE(r300, fpi[2]);
 		R300_STATECHANGE(r300, fpi[3]);
 
-		reg_start(R300_US_ALU_RGB_INST_0, 0);
-		e32(FP_INSTRC(MAD, FP_ARGC(SRC0C_XYZ), FP_ARGC(ONE), FP_ARGC(ZERO)));
-
-		reg_start(R300_US_ALU_RGB_ADDR_0, 0);
-		e32(FP_SELC(0, NO, XYZ, FP_TMP(0), 0, 0));
-
-		reg_start(R300_US_ALU_ALPHA_INST_0, 0);
-		e32(FP_INSTRA(MAD, FP_ARGA(SRC0A), FP_ARGA(ONE), FP_ARGA(ZERO)));
-
-		reg_start(R300_US_ALU_ALPHA_ADDR_0, 0);
-		e32(FP_SELA(0, NO, W, FP_TMP(0), 0, 0));
+		BEGIN_BATCH(17);
+		OUT_BATCH_REGSEQ(R300_US_CONFIG, 3);
+		OUT_BATCH(0x0);
+		OUT_BATCH(0x0);
+		OUT_BATCH(0x0);
+		OUT_BATCH_REGSEQ(R300_US_CODE_ADDR_0, 4);
+		OUT_BATCH(0x0);
+		OUT_BATCH(0x0);
+		OUT_BATCH(0x0);
+		OUT_BATCH(R300_RGBA_OUT);
+
+		OUT_BATCH_REGVAL(R300_US_ALU_RGB_INST_0,
+			FP_INSTRC(MAD, FP_ARGC(SRC0C_XYZ), FP_ARGC(ONE), FP_ARGC(ZERO)));
+		OUT_BATCH_REGVAL(R300_US_ALU_RGB_ADDR_0,
+			FP_SELC(0, NO, XYZ, FP_TMP(0), 0, 0));
+		OUT_BATCH_REGVAL(R300_US_ALU_ALPHA_INST_0,
+			FP_INSTRA(MAD, FP_ARGA(SRC0A), FP_ARGA(ONE), FP_ARGA(ZERO)));
+		OUT_BATCH_REGVAL(R300_US_ALU_ALPHA_ADDR_0,
+			FP_SELA(0, NO, W, FP_TMP(0), 0, 0));
+		END_BATCH();
 	} else {
- 		R300_STATECHANGE(r300, fp);
- 		reg_start(R500_US_CONFIG, 1);
- 		e32(R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
- 		e32(0x0);
- 		reg_start(R500_US_CODE_ADDR, 2);
- 		e32(R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(1));
- 		e32(R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(1));
- 		e32(R500_US_CODE_OFFSET_ADDR(0));
+		struct radeon_state_atom r500fp;
+		uint32_t _cmd[10];
 
+		R300_STATECHANGE(r300, fp);
 		R300_STATECHANGE(r300, r500fp);
-		r500fp_start_fragment(0, 6);
-
-		e32(R500_INST_TYPE_OUT |
-		    R500_INST_TEX_SEM_WAIT |
-		    R500_INST_LAST |
-		    R500_INST_RGB_OMASK_R |
-		    R500_INST_RGB_OMASK_G |
-		    R500_INST_RGB_OMASK_B |
-		    R500_INST_ALPHA_OMASK |
-		    R500_INST_RGB_CLAMP |
-		    R500_INST_ALPHA_CLAMP);
-
-		e32(R500_RGB_ADDR0(0) |
-		    R500_RGB_ADDR1(0) |
-		    R500_RGB_ADDR1_CONST |
-		    R500_RGB_ADDR2(0) |
-		    R500_RGB_ADDR2_CONST);
-
-		e32(R500_ALPHA_ADDR0(0) |
-		    R500_ALPHA_ADDR1(0) |
-		    R500_ALPHA_ADDR1_CONST |
-		    R500_ALPHA_ADDR2(0) |
-		    R500_ALPHA_ADDR2_CONST);
-
-		e32(R500_ALU_RGB_SEL_A_SRC0 |
-		    R500_ALU_RGB_R_SWIZ_A_R |
-		    R500_ALU_RGB_G_SWIZ_A_G |
-		    R500_ALU_RGB_B_SWIZ_A_B |
-		    R500_ALU_RGB_SEL_B_SRC0 |
-		    R500_ALU_RGB_R_SWIZ_B_R |
-		    R500_ALU_RGB_B_SWIZ_B_G |
-		    R500_ALU_RGB_G_SWIZ_B_B);
-
-		e32(R500_ALPHA_OP_CMP |
-		    R500_ALPHA_SWIZ_A_A |
-		    R500_ALPHA_SWIZ_B_A);
-
-		e32(R500_ALU_RGBA_OP_CMP |
-		    R500_ALU_RGBA_R_SWIZ_0 |
-		    R500_ALU_RGBA_G_SWIZ_0 |
-		    R500_ALU_RGBA_B_SWIZ_0 |
-		    R500_ALU_RGBA_A_SWIZ_0);
+
+		BEGIN_BATCH(7);
+		OUT_BATCH_REGSEQ(R500_US_CONFIG, 2);
+		OUT_BATCH(R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
+		OUT_BATCH(0x0);
+		OUT_BATCH_REGSEQ(R500_US_CODE_ADDR, 3);
+		OUT_BATCH(R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(1));
+		OUT_BATCH(R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(1));
+		OUT_BATCH(R500_US_CODE_OFFSET_ADDR(0));
+		END_BATCH();
+
+		r500fp.check = check_r500fp;
+		r500fp.cmd = _cmd;
+		r500fp.cmd[0] = cmdr500fp(r300->radeon.radeonScreen, 0, 1, 0, 0);
+		r500fp.cmd[1] = R500_INST_TYPE_OUT |
+			R500_INST_TEX_SEM_WAIT |
+			R500_INST_LAST |
+			R500_INST_RGB_OMASK_R |
+			R500_INST_RGB_OMASK_G |
+			R500_INST_RGB_OMASK_B |
+			R500_INST_ALPHA_OMASK |
+			R500_INST_RGB_CLAMP |
+			R500_INST_ALPHA_CLAMP;
+		r500fp.cmd[2] = R500_RGB_ADDR0(0) |
+			R500_RGB_ADDR1(0) |
+			R500_RGB_ADDR1_CONST |
+			R500_RGB_ADDR2(0) |
+			R500_RGB_ADDR2_CONST;
+		r500fp.cmd[3] = R500_ALPHA_ADDR0(0) |
+			R500_ALPHA_ADDR1(0) |
+			R500_ALPHA_ADDR1_CONST |
+			R500_ALPHA_ADDR2(0) |
+			R500_ALPHA_ADDR2_CONST;
+		r500fp.cmd[4] = R500_ALU_RGB_SEL_A_SRC0 |
+			R500_ALU_RGB_R_SWIZ_A_R |
+			R500_ALU_RGB_G_SWIZ_A_G |
+			R500_ALU_RGB_B_SWIZ_A_B |
+			R500_ALU_RGB_SEL_B_SRC0 |
+			R500_ALU_RGB_R_SWIZ_B_R |
+			R500_ALU_RGB_B_SWIZ_B_G |
+			R500_ALU_RGB_G_SWIZ_B_B;
+		r500fp.cmd[5] = R500_ALPHA_OP_CMP |
+			R500_ALPHA_SWIZ_A_A |
+			R500_ALPHA_SWIZ_B_A;
+		r500fp.cmd[6] = R500_ALU_RGBA_OP_CMP |
+			R500_ALU_RGBA_R_SWIZ_0 |
+			R500_ALU_RGBA_G_SWIZ_0 |
+			R500_ALU_RGBA_B_SWIZ_0 |
+			R500_ALU_RGBA_A_SWIZ_0;
+
+		r500fp.cmd[7] = 0;
+		emit_r500fp(ctx, &r500fp);
 	}
 
-	reg_start(R300_VAP_PVS_STATE_FLUSH_REG, 0);
-	e32(0x00000000);
+	BEGIN_BATCH(2);
+	OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
+	END_BATCH();
+
 	if (has_tcl) {
-	    vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
+		vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
 			(5 << R300_PVS_NUM_CNTLRS_SHIFT) |
 			(12 << R300_VF_MAX_VTX_NUM_SHIFT));
-	    if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-		vap_cntl |= R500_TCL_STATE_OPTIMIZATION;
-	} else
-	    vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
+		if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+			vap_cntl |= R500_TCL_STATE_OPTIMIZATION;
+	} else {
+		vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
 			(5 << R300_PVS_NUM_CNTLRS_SHIFT) |
 			(5 << R300_VF_MAX_VTX_NUM_SHIFT));
+	}
 
 	if (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV515)
-	    vap_cntl |= (2 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (2 << R300_PVS_NUM_FPUS_SHIFT);
 	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV530) ||
 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560) ||
 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570))
-	    vap_cntl |= (5 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (5 << R300_PVS_NUM_FPUS_SHIFT);
 	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV410) ||
 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420))
-	    vap_cntl |= (6 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (6 << R300_PVS_NUM_FPUS_SHIFT);
 	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R520) ||
 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580))
-	    vap_cntl |= (8 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (8 << R300_PVS_NUM_FPUS_SHIFT);
 	else
-	    vap_cntl |= (4 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (4 << R300_PVS_NUM_FPUS_SHIFT);
+
+	R300_STATECHANGE(r300, vap_cntl);
 
-	R300_STATECHANGE(rmesa, vap_cntl);
-	reg_start(R300_VAP_CNTL, 0);
-	e32(vap_cntl);
+	BEGIN_BATCH(2);
+	OUT_BATCH_REGVAL(R300_VAP_CNTL, vap_cntl);
+	END_BATCH();
 
 	if (has_tcl) {
+        struct radeon_state_atom vpu;
+        uint32_t _cmd[10];
 		R300_STATECHANGE(r300, pvs);
-		reg_start(R300_VAP_PVS_CODE_CNTL_0, 2);
-
-		e32((0 << R300_PVS_FIRST_INST_SHIFT) |
-		    (0 << R300_PVS_XYZW_VALID_INST_SHIFT) |
-		    (1 << R300_PVS_LAST_INST_SHIFT));
-		e32((0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
-		    (0 << R300_PVS_MAX_CONST_ADDR_SHIFT));
-		e32(1 << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
-
 		R300_STATECHANGE(r300, vpi);
-		vsf_start_fragment(0x0, 8);
 
-		e32(PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 0, 0xf, PVS_DST_REG_OUT));
-		e32(PVS_SRC_OPERAND(0, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
-		e32(PVS_SRC_OPERAND(0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
-		e32(0x0);
-
-		e32(PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 1, 0xf, PVS_DST_REG_OUT));
-		e32(PVS_SRC_OPERAND(1, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
-		e32(PVS_SRC_OPERAND(1, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
-		e32(0x0);
+		BEGIN_BATCH(4);
+		OUT_BATCH_REGSEQ(R300_VAP_PVS_CODE_CNTL_0, 3);
+		OUT_BATCH((0 << R300_PVS_FIRST_INST_SHIFT) |
+			  (0 << R300_PVS_XYZW_VALID_INST_SHIFT) |
+			  (1 << R300_PVS_LAST_INST_SHIFT));
+		OUT_BATCH((0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
+			  (0 << R300_PVS_MAX_CONST_ADDR_SHIFT));
+		OUT_BATCH(1 << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
+		END_BATCH();
+
+		vpu.check = check_vpu;
+		vpu.cmd = _cmd;
+		vpu.cmd[0] = cmdvpu(r300->radeon.radeonScreen, 0, 2);
+
+		vpu.cmd[1] = PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE,
+                                         0, 0xf, PVS_DST_REG_OUT);
+		vpu.cmd[2] = PVS_SRC_OPERAND(0, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y,
+                                      PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W,
+                                      PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
+		vpu.cmd[3] = PVS_SRC_OPERAND(0, PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
+		vpu.cmd[4] = 0x0;
+
+		vpu.cmd[5] = PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 1, 0xf,
+                                         PVS_DST_REG_OUT);
+		vpu.cmd[6] = PVS_SRC_OPERAND(1, PVS_SRC_SELECT_X,
+                                      PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z,
+                                      PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT,
+
+                                      VSF_FLAG_NONE);
+		vpu.cmd[7] = PVS_SRC_OPERAND(1, PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
+		vpu.cmd[8] = 0x0;
+
+		r300->vap_flush_needed = GL_TRUE;
+		emit_vpu(ctx, &vpu);
 	}
 }
 
-/**
- * Buffer clear
- */
-static void r300Clear(GLcontext * ctx, GLbitfield mask)
+static void r300KernelClear(GLcontext *ctx, GLuint flags)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
-	int flags = 0;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
+	struct radeon_framebuffer *rfb = dPriv->driverPrivate;
+	struct radeon_renderbuffer *rrb;
+	struct radeon_renderbuffer *rrbd;
 	int bits = 0;
-	int swapped;
-
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "r300Clear\n");
-
-	{
-		LOCK_HARDWARE(&r300->radeon);
-		UNLOCK_HARDWARE(&r300->radeon);
-		if (dPriv->numClipRects == 0)
-			return;
-	}
 
-	if (mask & BUFFER_BIT_FRONT_LEFT) {
-		flags |= BUFFER_BIT_FRONT_LEFT;
-		mask &= ~BUFFER_BIT_FRONT_LEFT;
-	}
-
-	if (mask & BUFFER_BIT_BACK_LEFT) {
-		flags |= BUFFER_BIT_BACK_LEFT;
-		mask &= ~BUFFER_BIT_BACK_LEFT;
-	}
-
-	if (mask & BUFFER_BIT_DEPTH) {
+	/* Make sure it fits there. */
+	rcommonEnsureCmdBufSpace(&r300->radeon, 421 * 3, __FUNCTION__);
+	if (flags || bits)
+		r300EmitClearState(ctx);
+	rrbd = radeon_get_renderbuffer(&rfb->base, BUFFER_DEPTH);
+	if (rrbd && (flags & BUFFER_BIT_DEPTH))
 		bits |= CLEARBUFFER_DEPTH;
-		mask &= ~BUFFER_BIT_DEPTH;
-	}
 
-	if ((mask & BUFFER_BIT_STENCIL) && r300->state.stencil.hw_stencil) {
+	if (rrbd && (flags & BUFFER_BIT_STENCIL))
 		bits |= CLEARBUFFER_STENCIL;
-		mask &= ~BUFFER_BIT_STENCIL;
-	}
 
-	if (mask) {
-		if (RADEON_DEBUG & DEBUG_FALLBACKS)
-			fprintf(stderr, "%s: swrast clear, mask: %x\n",
-				__FUNCTION__, mask);
-		_swrast_Clear(ctx, mask);
+	if (flags & BUFFER_BIT_COLOR0) {
+		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_COLOR0);
+		r300ClearBuffer(r300, CLEARBUFFER_COLOR, rrb, NULL);
+		bits = 0;
 	}
 
-	swapped = r300->radeon.sarea->pfCurrentPage == 1;
-
-	/* Make sure it fits there. */
-	r300EnsureCmdBufSpace(r300, 421 * 3, __FUNCTION__);
-	if (flags || bits)
-		r300EmitClearState(ctx);
-
 	if (flags & BUFFER_BIT_FRONT_LEFT) {
-		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, swapped);
+		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_FRONT_LEFT);
+		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, rrb, rrbd);
 		bits = 0;
 	}
 
 	if (flags & BUFFER_BIT_BACK_LEFT) {
-		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, swapped ^ 1);
+		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_BACK_LEFT);
+		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, rrb, rrbd);
 		bits = 0;
 	}
 
 	if (bits)
-		r300ClearBuffer(r300, bits, 0);
+		r300ClearBuffer(r300, bits, NULL, rrbd);
 
+	COMMIT_BATCH();
 }
 
-void r300Flush(GLcontext * ctx)
+/**
+ * Buffer clear
+ */
+static void r300Clear(GLcontext * ctx, GLbitfield mask)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
+	const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask);
+	GLbitfield swrast_mask = 0, tri_mask = 0;
+	int i;
+	struct gl_framebuffer *fb = ctx->DrawBuffer;
 
 	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush( rmesa );
-
-	if (rmesa->cmdbuf.count_used > rmesa->cmdbuf.count_reemit)
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-}
-
-#ifdef USER_BUFFERS
-#include "r300_mem.h"
-
-void r300RefillCurrentDmaRegion(r300ContextPtr rmesa, int size)
-{
-	struct r300_dma_buffer *dmabuf;
-	size = MAX2(size, RADEON_BUFFER_SIZE * 16);
-
-	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	if (rmesa->dma.flush) {
-		rmesa->dma.flush(rmesa);
-	}
+		fprintf(stderr, "r300Clear\n");
 
-	if (rmesa->dma.current.buf) {
-#ifdef USER_BUFFERS
-		r300_mem_use(rmesa, rmesa->dma.current.buf->id);
-#endif
-		r300ReleaseDmaRegion(rmesa, &rmesa->dma.current, __FUNCTION__);
+	if (!r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+		LOCK_HARDWARE(&r300->radeon);
+		UNLOCK_HARDWARE(&r300->radeon);
+		if (dPriv->numClipRects == 0)
+			return;
 	}
-	if (rmesa->dma.nr_released_bufs > 4)
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-
-	dmabuf = CALLOC_STRUCT(r300_dma_buffer);
-	dmabuf->buf = (void *)1;	/* hack */
-	dmabuf->refcount = 1;
 
-	dmabuf->id = r300_mem_alloc(rmesa, 4, size);
-	if (dmabuf->id == 0) {
-		LOCK_HARDWARE(&rmesa->radeon);	/* no need to validate */
-
-		r300FlushCmdBufLocked(rmesa, __FUNCTION__);
-		radeonWaitForIdleLocked(&rmesa->radeon);
+	/* Flush swtcl vertices if necessary, because we will change hardware
+	 * state during clear. See also the state-related comment in
+	 * r300EmitClearState.
+	 */
+	R300_NEWPRIM(r300);
 
-		dmabuf->id = r300_mem_alloc(rmesa, 4, size);
+	if (colorMask == ~0)
+	  tri_mask |= (mask & BUFFER_BITS_COLOR);
 
-		UNLOCK_HARDWARE(&rmesa->radeon);
 
-		if (dmabuf->id == 0) {
-			fprintf(stderr,
-				"Error: Could not get dma buffer... exiting\n");
-			_mesa_exit(-1);
-		}
+	/* HW stencil */
+	if (mask & BUFFER_BIT_STENCIL) {
+		tri_mask |= BUFFER_BIT_STENCIL;
 	}
 
-	rmesa->dma.current.buf = dmabuf;
-	rmesa->dma.current.address = r300_mem_ptr(rmesa, dmabuf->id);
-	rmesa->dma.current.end = size;
-	rmesa->dma.current.start = 0;
-	rmesa->dma.current.ptr = 0;
-}
-
-void r300ReleaseDmaRegion(r300ContextPtr rmesa,
-			  struct r300_dma_region *region, const char *caller)
-{
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s from %s\n", __FUNCTION__, caller);
-
-	if (!region->buf)
-		return;
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush(rmesa);
-
-	if (--region->buf->refcount == 0) {
-		r300_mem_free(rmesa, region->buf->id);
-		FREE(region->buf);
-		rmesa->dma.nr_released_bufs++;
+	/* HW depth */
+	if (mask & BUFFER_BIT_DEPTH) {
+    	        tri_mask |= BUFFER_BIT_DEPTH;
 	}
 
-	region->buf = 0;
-	region->start = 0;
-}
-
-/* Allocates a region from rmesa->dma.current.  If there isn't enough
- * space in current, grab a new buffer (and discard what was left of current)
- */
-void r300AllocDmaRegion(r300ContextPtr rmesa,
-			struct r300_dma_region *region,
-			int bytes, int alignment)
-{
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush(rmesa);
-
-	if (region->buf)
-		r300ReleaseDmaRegion(rmesa, region, __FUNCTION__);
-
-	alignment--;
-	rmesa->dma.current.start = rmesa->dma.current.ptr =
-	    (rmesa->dma.current.ptr + alignment) & ~alignment;
-
-	if (rmesa->dma.current.ptr + bytes > rmesa->dma.current.end)
-		r300RefillCurrentDmaRegion(rmesa, (bytes + 0x7) & ~0x7);
-
-	region->start = rmesa->dma.current.start;
-	region->ptr = rmesa->dma.current.start;
-	region->end = rmesa->dma.current.start + bytes;
-	region->address = rmesa->dma.current.address;
-	region->buf = rmesa->dma.current.buf;
-	region->buf->refcount++;
-
-	rmesa->dma.current.ptr += bytes;	/* bug - if alignment > 7 */
-	rmesa->dma.current.start =
-	    rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;
-
-	assert(rmesa->dma.current.ptr <= rmesa->dma.current.end);
-}
+	/* Only buffers whose renderbuffer has a non-zero ClassID stay in the
+	 * tri pass; the others are dropped from tri_mask and cleared by swrast.
+	 */
 
-#else
-static void r300RefillCurrentDmaRegion(r300ContextPtr rmesa)
-{
-	struct r300_dma_buffer *dmabuf;
-	int fd = rmesa->radeon.dri.fd;
-	int index = 0;
-	int size = 0;
-	drmDMAReq dma;
-	int ret;
-
-	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	if (rmesa->dma.flush) {
-		rmesa->dma.flush(rmesa);
+	for (i = 0; i < BUFFER_COUNT; i++) {
+	  GLuint bufBit = 1 << i;
+	  if ((tri_mask) & bufBit) {
+	    if (!fb->Attachment[i].Renderbuffer->ClassID) {
+	      tri_mask &= ~bufBit;
+	      swrast_mask |= bufBit;
+	    }
+	  }
 	}
 
-	if (rmesa->dma.current.buf)
-		r300ReleaseDmaRegion(rmesa, &rmesa->dma.current, __FUNCTION__);
-
-	if (rmesa->dma.nr_released_bufs > 4)
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-
-	dma.context = rmesa->radeon.dri.hwContext;
-	dma.send_count = 0;
-	dma.send_list = NULL;
-	dma.send_sizes = NULL;
-	dma.flags = 0;
-	dma.request_count = 1;
-	dma.request_size = RADEON_BUFFER_SIZE;
-	dma.request_list = &index;
-	dma.request_sizes = &size;
-	dma.granted_count = 0;
-
-	LOCK_HARDWARE(&rmesa->radeon);	/* no need to validate */
-
-	ret = drmDMA(fd, &dma);
-
-	if (ret != 0) {
-		/* Try to release some buffers and wait until we can't get any more */
-		if (rmesa->dma.nr_released_bufs) {
-			r300FlushCmdBufLocked(rmesa, __FUNCTION__);
-		}
-
-		if (RADEON_DEBUG & DEBUG_DMA)
-			fprintf(stderr, "Waiting for buffers\n");
-
-		radeonWaitForIdleLocked(&rmesa->radeon);
-		ret = drmDMA(fd, &dma);
+	/* SW fallback clearing */
+	swrast_mask = mask & ~tri_mask;
 
-		if (ret != 0) {
-			UNLOCK_HARDWARE(&rmesa->radeon);
-			fprintf(stderr,
-				"Error: Could not get dma buffer... exiting\n");
-			_mesa_exit(-1);
-		}
+	if (tri_mask) {
+		if (r300->radeon.radeonScreen->kernel_mm)
+			r300UserClear(ctx, tri_mask);
+		else
+			r300KernelClear(ctx, tri_mask);
 	}
-
-	UNLOCK_HARDWARE(&rmesa->radeon);
-
-	if (RADEON_DEBUG & DEBUG_DMA)
-		fprintf(stderr, "Allocated buffer %d\n", index);
-
-	dmabuf = CALLOC_STRUCT(r300_dma_buffer);
-	dmabuf->buf = &rmesa->radeon.radeonScreen->buffers->list[index];
-	dmabuf->refcount = 1;
-
-	rmesa->dma.current.buf = dmabuf;
-	rmesa->dma.current.address = dmabuf->buf->address;
-	rmesa->dma.current.end = dmabuf->buf->total;
-	rmesa->dma.current.start = 0;
-	rmesa->dma.current.ptr = 0;
-}
-
-void r300ReleaseDmaRegion(r300ContextPtr rmesa,
-			  struct r300_dma_region *region, const char *caller)
-{
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s from %s\n", __FUNCTION__, caller);
-
-	if (!region->buf)
-		return;
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush(rmesa);
-
-	if (--region->buf->refcount == 0) {
-		drm_radeon_cmd_header_t *cmd;
-
-		if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
-			fprintf(stderr, "%s -- DISCARD BUF %d\n",
-				__FUNCTION__, region->buf->buf->idx);
-		cmd =
-		    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa,
-								sizeof
-								(*cmd) / 4,
-								__FUNCTION__);
-		cmd->dma.cmd_type = R300_CMD_DMA_DISCARD;
-		cmd->dma.buf_idx = region->buf->buf->idx;
-
-		FREE(region->buf);
-		rmesa->dma.nr_released_bufs++;
+	if (swrast_mask) {
+		if (RADEON_DEBUG & DEBUG_FALLBACKS)
+			fprintf(stderr, "%s: swrast clear, mask: %x\n",
+				__FUNCTION__, swrast_mask);
+		_swrast_Clear(ctx, swrast_mask);
 	}
-
-	region->buf = 0;
-	region->start = 0;
-}
-
-/* Allocates a region from rmesa->dma.current.  If there isn't enough
- * space in current, grab a new buffer (and discard what was left of current)
- */
-void r300AllocDmaRegion(r300ContextPtr rmesa,
-			struct r300_dma_region *region,
-			int bytes, int alignment)
-{
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush(rmesa);
-
-	if (region->buf)
-		r300ReleaseDmaRegion(rmesa, region, __FUNCTION__);
-
-	alignment--;
-	rmesa->dma.current.start = rmesa->dma.current.ptr =
-	    (rmesa->dma.current.ptr + alignment) & ~alignment;
-
-	if (rmesa->dma.current.ptr + bytes > rmesa->dma.current.end)
-		r300RefillCurrentDmaRegion(rmesa);
-
-	region->start = rmesa->dma.current.start;
-	region->ptr = rmesa->dma.current.start;
-	region->end = rmesa->dma.current.start + bytes;
-	region->address = rmesa->dma.current.address;
-	region->buf = rmesa->dma.current.buf;
-	region->buf->refcount++;
-
-	rmesa->dma.current.ptr += bytes;	/* bug - if alignment > 7 */
-	rmesa->dma.current.start =
-	    rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;
-
-	assert(rmesa->dma.current.ptr <= rmesa->dma.current.end);
 }
 
-#endif
-
-GLboolean r300IsGartMemory(r300ContextPtr rmesa, const GLvoid * pointer,
-			   GLint size)
-{
-	int offset =
-	    (char *)pointer -
-	    (char *)rmesa->radeon.radeonScreen->gartTextures.map;
-	int valid = (size >= 0 && offset >= 0
-		     && offset + size <
-		     rmesa->radeon.radeonScreen->gartTextures.size);
-
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "r300IsGartMemory( %p ) : %d\n", pointer,
-			valid);
-
-	return valid;
-}
-
-GLuint r300GartOffsetFromVirtual(r300ContextPtr rmesa, const GLvoid * pointer)
-{
-	int offset =
-	    (char *)pointer -
-	    (char *)rmesa->radeon.radeonScreen->gartTextures.map;
-
-	//fprintf(stderr, "offset=%08x\n", offset);
-
-	if (offset < 0
-	    || offset > rmesa->radeon.radeonScreen->gartTextures.size)
-		return ~0;
-	else
-		return rmesa->radeon.radeonScreen->gart_texture_offset + offset;
-}
 
 void r300InitIoctlFuncs(struct dd_function_table *functions)
 {
 	functions->Clear = r300Clear;
 	functions->Finish = radeonFinish;
-	functions->Flush = r300Flush;
+	functions->Flush = radeonFlush;
 }
diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.h b/src/mesa/drivers/dri/r300/r300_ioctl.h
index e1143fb6c3..3abfa71a6e 100644
--- a/src/mesa/drivers/dri/r300/r300_ioctl.h
+++ b/src/mesa/drivers/dri/r300/r300_ioctl.h
@@ -39,22 +39,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_context.h"
 #include "radeon_drm.h"
 
-extern GLboolean r300IsGartMemory(r300ContextPtr rmesa,
-				  const GLvoid * pointer, GLint size);
-
-extern GLuint r300GartOffsetFromVirtual(r300ContextPtr rmesa,
-					const GLvoid * pointer);
-
-extern void r300Flush(GLcontext * ctx);
-
-extern void r300ReleaseDmaRegion(r300ContextPtr rmesa,
-				 struct r300_dma_region *region,
-				 const char *caller);
-extern void r300AllocDmaRegion(r300ContextPtr rmesa,
-			       struct r300_dma_region *region, int bytes,
-			       int alignment);
-
 extern void r300InitIoctlFuncs(struct dd_function_table *functions);
 
-extern void r300RefillCurrentDmaRegion(r300ContextPtr rmesa, int size);
 #endif				/* __R300_IOCTL_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_mem.c b/src/mesa/drivers/dri/r300/r300_mem.c
deleted file mode 100644
index f8f9d4fcdf..0000000000
--- a/src/mesa/drivers/dri/r300/r300_mem.c
+++ /dev/null
@@ -1,385 +0,0 @@
-/*
- * Copyright (C) 2005 Aapo Tahkola.
- *
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial
- * portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-/**
- * \file
- *
- * \author Aapo Tahkola <aet@rasterburn.org>
- */
-
-#include <unistd.h>
-
-#include "r300_context.h"
-#include "r300_cmdbuf.h"
-#include "r300_ioctl.h"
-#include "r300_mem.h"
-#include "radeon_ioctl.h"
-
-#ifdef USER_BUFFERS
-
-static void resize_u_list(r300ContextPtr rmesa)
-{
-	void *temp;
-	int nsize;
-
-	temp = rmesa->rmm->u_list;
-	nsize = rmesa->rmm->u_size * 2;
-
-	rmesa->rmm->u_list = _mesa_malloc(nsize * sizeof(*rmesa->rmm->u_list));
-	_mesa_memset(rmesa->rmm->u_list, 0,
-		     nsize * sizeof(*rmesa->rmm->u_list));
-
-	if (temp) {
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-
-		_mesa_memcpy(rmesa->rmm->u_list, temp,
-			     rmesa->rmm->u_size * sizeof(*rmesa->rmm->u_list));
-		_mesa_free(temp);
-	}
-
-	rmesa->rmm->u_size = nsize;
-}
-
-void r300_mem_init(r300ContextPtr rmesa)
-{
-	rmesa->rmm = malloc(sizeof(struct r300_memory_manager));
-	memset(rmesa->rmm, 0, sizeof(struct r300_memory_manager));
-
-	rmesa->rmm->u_size = 128;
-	resize_u_list(rmesa);
-}
-
-void r300_mem_destroy(r300ContextPtr rmesa)
-{
-	_mesa_free(rmesa->rmm->u_list);
-	rmesa->rmm->u_list = NULL;
-
-	_mesa_free(rmesa->rmm);
-	rmesa->rmm = NULL;
-}
-
-void *r300_mem_ptr(r300ContextPtr rmesa, int id)
-{
-	assert(id <= rmesa->rmm->u_last);
-	return rmesa->rmm->u_list[id].ptr;
-}
-
-int r300_mem_find(r300ContextPtr rmesa, void *ptr)
-{
-	int i;
-
-	for (i = 1; i < rmesa->rmm->u_size + 1; i++)
-		if (rmesa->rmm->u_list[i].ptr &&
-		    ptr >= rmesa->rmm->u_list[i].ptr &&
-		    ptr <
-		    rmesa->rmm->u_list[i].ptr + rmesa->rmm->u_list[i].size)
-			break;
-
-	if (i < rmesa->rmm->u_size + 1)
-		return i;
-
-	fprintf(stderr, "%p failed\n", ptr);
-	return 0;
-}
-
-//#define MM_DEBUG
-int r300_mem_alloc(r300ContextPtr rmesa, int alignment, int size)
-{
-	drm_radeon_mem_alloc_t alloc;
-	int offset = 0, ret;
-	int i, free = -1;
-	int done_age;
-	drm_radeon_mem_free_t memfree;
-	int tries = 0;
-	static int bytes_wasted = 0, allocated = 0;
-
-	if (size < 4096)
-		bytes_wasted += 4096 - size;
-
-	allocated += size;
-
-#if 0
-	static int t = 0;
-	if (t != time(NULL)) {
-		t = time(NULL);
-		fprintf(stderr, "slots used %d, wasted %d kb, allocated %d\n",
-			rmesa->rmm->u_last, bytes_wasted / 1024,
-			allocated / 1024);
-	}
-#endif
-
-	memfree.region = RADEON_MEM_REGION_GART;
-
-      again:
-
-	done_age = radeonGetAge((radeonContextPtr) rmesa);
-
-	if (rmesa->rmm->u_last + 1 >= rmesa->rmm->u_size)
-		resize_u_list(rmesa);
-
-	for (i = rmesa->rmm->u_last + 1; i > 0; i--) {
-		if (rmesa->rmm->u_list[i].ptr == NULL) {
-			free = i;
-			continue;
-		}
-
-		if (rmesa->rmm->u_list[i].h_pending == 0 &&
-		    rmesa->rmm->u_list[i].pending
-		    && rmesa->rmm->u_list[i].age <= done_age) {
-			memfree.region_offset =
-			    (char *)rmesa->rmm->u_list[i].ptr -
-			    (char *)rmesa->radeon.radeonScreen->gartTextures.
-			    map;
-
-			ret =
-			    drmCommandWrite(rmesa->radeon.radeonScreen->
-					    driScreen->fd, DRM_RADEON_FREE,
-					    &memfree, sizeof(memfree));
-
-			if (ret) {
-				fprintf(stderr, "Failed to free at %p\n",
-					rmesa->rmm->u_list[i].ptr);
-				fprintf(stderr, "ret = %s\n", strerror(-ret));
-				exit(1);
-			} else {
-#ifdef MM_DEBUG
-				fprintf(stderr, "really freed %d at age %x\n",
-					i,
-					radeonGetAge((radeonContextPtr) rmesa));
-#endif
-				if (i == rmesa->rmm->u_last)
-					rmesa->rmm->u_last--;
-
-				if (rmesa->rmm->u_list[i].size < 4096)
-					bytes_wasted -=
-					    4096 - rmesa->rmm->u_list[i].size;
-
-				allocated -= rmesa->rmm->u_list[i].size;
-				rmesa->rmm->u_list[i].pending = 0;
-				rmesa->rmm->u_list[i].ptr = NULL;
-				free = i;
-			}
-		}
-	}
-	rmesa->rmm->u_head = i;
-
-	if (free == -1) {
-		WARN_ONCE("Ran out of slots!\n");
-		//usleep(100);
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-		tries++;
-		if (tries > 100) {
-			WARN_ONCE("Ran out of slots!\n");
-			exit(1);
-		}
-		goto again;
-	}
-
-	alloc.region = RADEON_MEM_REGION_GART;
-	alloc.alignment = alignment;
-	alloc.size = size;
-	alloc.region_offset = &offset;
-
-	ret =
-	    drmCommandWriteRead(rmesa->radeon.dri.fd, DRM_RADEON_ALLOC, &alloc,
-				sizeof(alloc));
-	if (ret) {
-#if 0
-		WARN_ONCE("Ran out of mem!\n");
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-		//usleep(100);
-		tries2++;
-		tries = 0;
-		if (tries2 > 100) {
-			WARN_ONCE("Ran out of GART memory!\n");
-			exit(1);
-		}
-		goto again;
-#else
-		WARN_ONCE
-		    ("Ran out of GART memory (for %d)!\nPlease consider adjusting GARTSize option.\n",
-		     size);
-		return 0;
-#endif
-	}
-
-	i = free;
-
-	if (i > rmesa->rmm->u_last)
-		rmesa->rmm->u_last = i;
-
-	rmesa->rmm->u_list[i].ptr =
-	    ((GLubyte *) rmesa->radeon.radeonScreen->gartTextures.map) + offset;
-	rmesa->rmm->u_list[i].size = size;
-	rmesa->rmm->u_list[i].age = 0;
-	//fprintf(stderr, "alloc %p at id %d\n", rmesa->rmm->u_list[i].ptr, i);
-
-#ifdef MM_DEBUG
-	fprintf(stderr, "allocated %d at age %x\n", i,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-
-	return i;
-}
-
-void r300_mem_use(r300ContextPtr rmesa, int id)
-{
-	uint64_t ull;
-#ifdef MM_DEBUG
-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-	drm_r300_cmd_header_t *cmd;
-
-	assert(id <= rmesa->rmm->u_last);
-
-	if (id == 0)
-		return;
-
-	cmd =
-	    (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa,
-						      2 + sizeof(ull) / 4,
-						      __FUNCTION__);
-	cmd[0].scratch.cmd_type = R300_CMD_SCRATCH;
-	cmd[0].scratch.reg = R300_MEM_SCRATCH;
-	cmd[0].scratch.n_bufs = 1;
-	cmd[0].scratch.flags = 0;
-	cmd++;
-
-	ull = (uint64_t) (intptr_t) & rmesa->rmm->u_list[id].age;
-	_mesa_memcpy(cmd, &ull, sizeof(ull));
-	cmd += sizeof(ull) / 4;
-
-	cmd[0].u = /*id */ 0;
-
-	LOCK_HARDWARE(&rmesa->radeon);	/* Protect from DRM. */
-	rmesa->rmm->u_list[id].h_pending++;
-	UNLOCK_HARDWARE(&rmesa->radeon);
-}
-
-unsigned long r300_mem_offset(r300ContextPtr rmesa, int id)
-{
-	unsigned long offset;
-
-	assert(id <= rmesa->rmm->u_last);
-
-	offset = (char *)rmesa->rmm->u_list[id].ptr -
-	    (char *)rmesa->radeon.radeonScreen->gartTextures.map;
-	offset += rmesa->radeon.radeonScreen->gart_texture_offset;
-
-	return offset;
-}
-
-void *r300_mem_map(r300ContextPtr rmesa, int id, int access)
-{
-#ifdef MM_DEBUG
-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-	void *ptr;
-	int tries = 0;
-
-	assert(id <= rmesa->rmm->u_last);
-
-	if (access == R300_MEM_R) {
-
-		if (rmesa->rmm->u_list[id].mapped == 1)
-			WARN_ONCE("buffer %d already mapped\n", id);
-
-		rmesa->rmm->u_list[id].mapped = 1;
-		ptr = r300_mem_ptr(rmesa, id);
-
-		return ptr;
-	}
-
-	if (rmesa->rmm->u_list[id].h_pending)
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-
-	if (rmesa->rmm->u_list[id].h_pending) {
-		return NULL;
-	}
-
-	while (rmesa->rmm->u_list[id].age >
-	       radeonGetAge((radeonContextPtr) rmesa) && tries++ < 1000)
-		usleep(10);
-
-	if (tries >= 1000) {
-		fprintf(stderr, "Idling failed (%x vs %x)\n",
-			rmesa->rmm->u_list[id].age,
-			radeonGetAge((radeonContextPtr) rmesa));
-		return NULL;
-	}
-
-	if (rmesa->rmm->u_list[id].mapped == 1)
-		WARN_ONCE("buffer %d already mapped\n", id);
-
-	rmesa->rmm->u_list[id].mapped = 1;
-	ptr = r300_mem_ptr(rmesa, id);
-
-	return ptr;
-}
-
-void r300_mem_unmap(r300ContextPtr rmesa, int id)
-{
-#ifdef MM_DEBUG
-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-
-	assert(id <= rmesa->rmm->u_last);
-
-	if (rmesa->rmm->u_list[id].mapped == 0)
-		WARN_ONCE("buffer %d not mapped\n", id);
-
-	rmesa->rmm->u_list[id].mapped = 0;
-}
-
-void r300_mem_free(r300ContextPtr rmesa, int id)
-{
-#ifdef MM_DEBUG
-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-
-	assert(id <= rmesa->rmm->u_last);
-
-	if (id == 0)
-		return;
-
-	if (rmesa->rmm->u_list[id].ptr == NULL) {
-		WARN_ONCE("Not allocated!\n");
-		return;
-	}
-
-	if (rmesa->rmm->u_list[id].pending) {
-		WARN_ONCE("%p already pended!\n", rmesa->rmm->u_list[id].ptr);
-		return;
-	}
-
-	rmesa->rmm->u_list[id].pending = 1;
-}
-#endif
diff --git a/src/mesa/drivers/dri/r300/r300_mem.h b/src/mesa/drivers/dri/r300/r300_mem.h
deleted file mode 100644
index 625a7f6d8d..0000000000
--- a/src/mesa/drivers/dri/r300/r300_mem.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef __R300_MEM_H__
-#define __R300_MEM_H__
-
-//#define R300_MEM_PDL 0
-#define R300_MEM_UL 1
-
-#define R300_MEM_R 1
-#define R300_MEM_W 2
-#define R300_MEM_RW (R300_MEM_R | R300_MEM_W)
-
-#define R300_MEM_SCRATCH 2
-
-struct r300_memory_manager {
-	struct {
-		void *ptr;
-		uint32_t size;
-		uint32_t age;
-		uint32_t h_pending;
-		int pending;
-		int mapped;
-	} *u_list;
-	int u_head, u_size, u_last;
-
-};
-
-extern void r300_mem_init(r300ContextPtr rmesa);
-extern void r300_mem_destroy(r300ContextPtr rmesa);
-extern void *r300_mem_ptr(r300ContextPtr rmesa, int id);
-extern int r300_mem_find(r300ContextPtr rmesa, void *ptr);
-extern int r300_mem_alloc(r300ContextPtr rmesa, int alignment, int size);
-extern void r300_mem_use(r300ContextPtr rmesa, int id);
-extern unsigned long r300_mem_offset(r300ContextPtr rmesa, int id);
-extern void *r300_mem_map(r300ContextPtr rmesa, int id, int access);
-extern void r300_mem_unmap(r300ContextPtr rmesa, int id);
-extern void r300_mem_free(r300ContextPtr rmesa, int id);
-
-#endif
diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h
index 8f1a6630d5..357c600af9 100644
--- a/src/mesa/drivers/dri/r300/r300_reg.h
+++ b/src/mesa/drivers/dri/r300/r300_reg.h
@@ -1467,6 +1467,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #	define R300_TX_FORMAT_3D		   (1 << 25)
 #	define R300_TX_FORMAT_CUBIC_MAP		   (2 << 25)
 
+#	define R300_TX_FORMAT_GAMMA			(1 << 21)
+
 	/* gap */
 	/* Floating point formats */
 	/* Note - hardware supports both 16 and 32 bit floating point */
@@ -1531,6 +1533,13 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #	define R500_SEL_FILTER4_TC3		 (3 << 18)
 
 #define R300_TX_OFFSET_0                    0x4540
+#define R300_TX_OFFSET_1                    0x4544
+#define R300_TX_OFFSET_2                    0x4548
+#define R300_TX_OFFSET_3                    0x454C
+#define R300_TX_OFFSET_4                    0x4550
+#define R300_TX_OFFSET_5                    0x4554
+#define R300_TX_OFFSET_6                    0x4558
+#define R300_TX_OFFSET_7                    0x455C
 	/* BEGIN: Guess from R200 */
 #       define R300_TXO_ENDIAN_NO_SWAP           (0 << 0)
 #       define R300_TXO_ENDIAN_BYTE_SWAP         (1 << 0)
@@ -2425,6 +2434,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /* Z Buffer Clear Value */
 #define R300_ZB_DEPTHCLEARVALUE                  0x4f28
 
+#define R300_ZB_ZMASK_OFFSET                     0x4f30
+#define R300_ZB_ZMASK_PITCH                      0x4f34
+#define R300_ZB_ZMASK_WRINDEX                    0x4f38
+#define R300_ZB_ZMASK_DWORD                      0x4f3c
+#define R300_ZB_ZMASK_RDINDEX                    0x4f40
+
 /* Hierarchical Z Memory Offset */
 #define R300_ZB_HIZ_OFFSET                       0x4f44
 
@@ -3165,6 +3180,9 @@ enum {
 #   define R300_W_SRC_RAS				(1 << 2)
 
 
+/* Packet0 field ordering to write all values to the same reg */
+#define RADEON_ONE_REG_WR        (1 << 15)
+
 /* Draw a primitive from vertex data in arrays loaded via 3D_LOAD_VBPNTR.
  * Two parameter dwords:
  * 0. VAP_VTX_FMT: The first parameter is not written to hardware
diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c
index 16ce4a1199..bf50b062f6 100644
--- a/src/mesa/drivers/dri/r300/r300_render.c
+++ b/src/mesa/drivers/dri/r300/r300_render.c
@@ -50,6 +50,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
  * no bugs...
  */
 
+#include "r300_render.h"
+
 #include "main/glheader.h"
 #include "main/state.h"
 #include "main/imports.h"
@@ -66,16 +68,14 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/t_vp_build.h"
 #include "radeon_reg.h"
 #include "radeon_macros.h"
-#include "radeon_ioctl.h"
-#include "radeon_state.h"
 #include "r300_context.h"
 #include "r300_ioctl.h"
 #include "r300_state.h"
 #include "r300_reg.h"
 #include "r300_tex.h"
 #include "r300_emit.h"
-#include "r300_fragprog.h"
-extern int future_hw_tcl_on;
+#include "r300_fragprog_common.h"
+#include "r300_swtcl.h"
 
 /**
  * \brief Convert a OpenGL primitive type into a R300 primitive type.
@@ -172,96 +172,186 @@ int r300NumVerts(r300ContextPtr rmesa, int num_verts, int prim)
 	return num_verts - verts_off;
 }
 
-static void r300EmitElts(GLcontext * ctx, void *elts, unsigned long n_elts)
+static void r300EmitElts(GLcontext * ctx, unsigned long n_elts)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct r300_dma_region *rvb = &rmesa->state.elt_dma;
 	void *out;
+	GLuint size;
 
-	if (r300IsGartMemory(rmesa, elts, n_elts * 4)) {
-		rvb->address = rmesa->radeon.radeonScreen->gartTextures.map;
-		rvb->start = ((char *)elts) - rvb->address;
-		rvb->aos_offset =
-		    rmesa->radeon.radeonScreen->gart_texture_offset +
-		    rvb->start;
-		return;
-	} else if (r300IsGartMemory(rmesa, elts, 1)) {
-		WARN_ONCE("Pointer not within GART memory!\n");
-		_mesa_exit(-1);
-	}
-
-	r300AllocDmaRegion(rmesa, rvb, n_elts * 4, 4);
-	rvb->aos_offset = GET_START(rvb);
+	size = ((rmesa->ind_buf.is_32bit ? 4 : 2) * n_elts + 3) & ~3;
 
-	out = rvb->address + rvb->start;
-	memcpy(out, elts, n_elts * 4);
+	radeonAllocDmaRegion(&rmesa->radeon, &rmesa->radeon.tcl.elt_dma_bo,
+			     &rmesa->radeon.tcl.elt_dma_offset, size, 4);
+	radeon_bo_map(rmesa->radeon.tcl.elt_dma_bo, 1);
+	out = rmesa->radeon.tcl.elt_dma_bo->ptr + rmesa->radeon.tcl.elt_dma_offset;
+	memcpy(out, rmesa->ind_buf.ptr, size);
+	radeon_bo_unmap(rmesa->radeon.tcl.elt_dma_bo);
 }
 
-static void r300FireEB(r300ContextPtr rmesa, unsigned long addr,
-		       int vertex_count, int type)
+static void r300FireEB(r300ContextPtr rmesa, int vertex_count, int type)
 {
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
-
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_DRAW_INDX_2, 0), 0);
-	e32(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (vertex_count << 16) | type | R300_VAP_VF_CNTL__INDEX_SIZE_32bit);
-
-	start_packet3(CP_PACKET3(R300_PACKET3_INDX_BUFFER, 2), 2);
-	e32(R300_INDX_BUFFER_ONE_REG_WR | (0 << R300_INDX_BUFFER_SKIP_SHIFT) |
-	    (R300_VAP_PORT_IDX0 >> 2));
-	e32(addr);
-	e32(vertex_count);
+	BATCH_LOCALS(&rmesa->radeon);
+
+    r300_emit_scissor(rmesa->radeon.glCtx);
+	if (vertex_count > 0) {
+		int size;
+
+		BEGIN_BATCH(10);
+		OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_INDX_2, 0);
+		if (rmesa->ind_buf.is_32bit) {
+			size = vertex_count;
+			OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_INDICES |
+			  ((vertex_count + 0) << 16) | type |
+			  R300_VAP_VF_CNTL__INDEX_SIZE_32bit);
+		} else {
+			size = (vertex_count + 1) >> 1;
+			OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_INDICES |
+			   ((vertex_count + 0) << 16) | type);
+		}
+
+		if (!rmesa->radeon.radeonScreen->kernel_mm) {
+			OUT_BATCH_PACKET3(R300_PACKET3_INDX_BUFFER, 2);
+			OUT_BATCH(R300_INDX_BUFFER_ONE_REG_WR | (0 << R300_INDX_BUFFER_SKIP_SHIFT) |
+	    			 (R300_VAP_PORT_IDX0 >> 2));
+			OUT_BATCH_RELOC(rmesa->radeon.tcl.elt_dma_offset,
+					rmesa->radeon.tcl.elt_dma_bo,
+					rmesa->radeon.tcl.elt_dma_offset,
+					RADEON_GEM_DOMAIN_GTT, 0, 0);
+			OUT_BATCH(size);
+		} else {
+			OUT_BATCH_PACKET3(R300_PACKET3_INDX_BUFFER, 2);
+			OUT_BATCH(R300_INDX_BUFFER_ONE_REG_WR | (0 << R300_INDX_BUFFER_SKIP_SHIFT) |
+	    			 (R300_VAP_PORT_IDX0 >> 2));
+			OUT_BATCH(rmesa->radeon.tcl.elt_dma_offset);
+			OUT_BATCH(size);
+			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+					      rmesa->radeon.tcl.elt_dma_bo,
+					      RADEON_GEM_DOMAIN_GTT, 0, 0);
+		}
+		END_BATCH();
+	}
 }
 
 static void r300EmitAOS(r300ContextPtr rmesa, GLuint nr, GLuint offset)
 {
+	BATCH_LOCALS(&rmesa->radeon);
+	uint32_t voffset;
 	int sz = 1 + (nr >> 1) * 3 + (nr & 1) * 2;
 	int i;
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
 
 	if (RADEON_DEBUG & DEBUG_VERTS)
 		fprintf(stderr, "%s: nr=%d, ofs=0x%08x\n", __FUNCTION__, nr,
 			offset);
 
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1), sz - 1);
-	e32(nr);
+	if (!rmesa->radeon.radeonScreen->kernel_mm) {
+		BEGIN_BATCH(sz+2+(nr * 2));
+		OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1);
+		OUT_BATCH(nr);
+
+		for (i = 0; i + 1 < nr; i += 2) {
+			OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
+				  (rmesa->radeon.tcl.aos[i].stride << 8) |
+				  (rmesa->radeon.tcl.aos[i + 1].components << 16) |
+				  (rmesa->radeon.tcl.aos[i + 1].stride << 24));
+
+			voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+			OUT_BATCH_RELOC(voffset,
+					rmesa->radeon.tcl.aos[i].bo,
+					voffset,
+					RADEON_GEM_DOMAIN_GTT,
+					0, 0);
+			voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+			  offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+			OUT_BATCH_RELOC(voffset,
+					rmesa->radeon.tcl.aos[i+1].bo,
+					voffset,
+					RADEON_GEM_DOMAIN_GTT,
+					0, 0);
+		}
 
-	for (i = 0; i + 1 < nr; i += 2) {
-		e32((rmesa->state.aos[i].aos_size << 0) |
-		    (rmesa->state.aos[i].aos_stride << 8) |
-		    (rmesa->state.aos[i + 1].aos_size << 16) |
-		    (rmesa->state.aos[i + 1].aos_stride << 24));
+		if (nr & 1) {
+			OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
+				  (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
+			voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+			OUT_BATCH_RELOC(voffset,
+					rmesa->radeon.tcl.aos[nr - 1].bo,
+					voffset,
+					RADEON_GEM_DOMAIN_GTT,
+					0, 0);
+		}
+		END_BATCH();
+	} else {
 
-		e32(rmesa->state.aos[i].aos_offset + offset * 4 * rmesa->state.aos[i].aos_stride);
-		e32(rmesa->state.aos[i + 1].aos_offset + offset * 4 * rmesa->state.aos[i + 1].aos_stride);
-	}
+		BEGIN_BATCH(sz+2+(nr * 2));
+		OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1);
+		OUT_BATCH(nr);
+
+		for (i = 0; i + 1 < nr; i += 2) {
+			OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
+				  (rmesa->radeon.tcl.aos[i].stride << 8) |
+				  (rmesa->radeon.tcl.aos[i + 1].components << 16) |
+				  (rmesa->radeon.tcl.aos[i + 1].stride << 24));
+
+			voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+			OUT_BATCH(voffset);
+			voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+			OUT_BATCH(voffset);
+		}
 
-	if (nr & 1) {
-		e32((rmesa->state.aos[nr - 1].aos_size << 0) |
-		    (rmesa->state.aos[nr - 1].aos_stride << 8));
-		e32(rmesa->state.aos[nr - 1].aos_offset + offset * 4 * rmesa->state.aos[nr - 1].aos_stride);
+		if (nr & 1) {
+			OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
+			  (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
+			voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+			OUT_BATCH(voffset);
+		}
+		for (i = 0; i + 1 < nr; i += 2) {
+			voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+					      rmesa->radeon.tcl.aos[i+0].bo,
+					      RADEON_GEM_DOMAIN_GTT,
+					      0, 0);
+			voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+					      rmesa->radeon.tcl.aos[i+1].bo,
+					      RADEON_GEM_DOMAIN_GTT,
+					      0, 0);
+		}
+		if (nr & 1) {
+			voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+					      rmesa->radeon.tcl.aos[nr-1].bo,
+					      RADEON_GEM_DOMAIN_GTT,
+					      0, 0);
+		}
+		END_BATCH();
 	}
+
 }
 
 static void r300FireAOS(r300ContextPtr rmesa, int vertex_count, int type)
 {
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
+	BATCH_LOCALS(&rmesa->radeon);
 
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0), 0);
-	e32(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (vertex_count << 16) | type);
+    r300_emit_scissor(rmesa->radeon.glCtx);
+	BEGIN_BATCH(3);
+	OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
+	OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (vertex_count << 16) | type);
+	END_BATCH();
 }
 
-static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
-				   int start, int end, int prim)
+void r300RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim)
 {
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&rmesa->radeon);
 	int type, num_verts;
-	TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *vb = &tnl->vb;
 
 	type = r300PrimitiveType(rmesa, prim);
 	num_verts = r300NumVerts(rmesa, end - start, prim);
@@ -269,7 +359,13 @@ static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
 	if (type < 0 || num_verts <= 0)
 		return;
 
-	if (vb->Elts) {
+	/* Make space for at least 128 dwords.
+	 * This is supposed to ensure that we can get all rendering
+	 * commands into a single command buffer.
+	 */
+	rcommonEnsureCmdBufSpace(&rmesa->radeon, 128, __FUNCTION__);
+
+	if (rmesa->ind_buf.ptr) {
 		if (num_verts > 65535) {
 			/* not implemented yet */
 			WARN_ONCE("Too many elts\n");
@@ -286,114 +382,143 @@ static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
 		 * allocating the index array might actually evict the vertex
 		 * arrays. *sigh*
 		 */
-		r300EmitElts(ctx, vb->Elts, num_verts);
-		r300EmitAOS(rmesa, rmesa->state.aos_count, start);
-		r300FireEB(rmesa, rmesa->state.elt_dma.aos_offset, num_verts, type);
+		r300EmitElts(ctx, num_verts);
+		/* don't pass start if we are split up */
+		r300EmitAOS(rmesa, rmesa->radeon.tcl.aos_count, 0);
+		if (rmesa->radeon.radeonScreen->kernel_mm) {
+			BEGIN_BATCH_NO_AUTOSTATE(2);
+			OUT_BATCH_REGSEQ(R300_VAP_VF_MAX_VTX_INDX, 1);
+			OUT_BATCH(rmesa->radeon.tcl.aos[0].count);
+			END_BATCH();
+		}
+		r300FireEB(rmesa, num_verts, type);
 	} else {
-		r300EmitAOS(rmesa, rmesa->state.aos_count, start);
+		r300EmitAOS(rmesa, rmesa->radeon.tcl.aos_count, start);
 		r300FireAOS(rmesa, num_verts, type);
 	}
+	COMMIT_BATCH();
 }
 
-static GLboolean r300RunRender(GLcontext * ctx,
-			       struct tnl_pipeline_stage *stage)
+static void r300RunRender(GLcontext * ctx, struct tnl_pipeline_stage *stage)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	int i;
 	TNLcontext *tnl = TNL_CONTEXT(ctx);
 	struct vertex_buffer *vb = &tnl->vb;
 
-
 	if (RADEON_DEBUG & DEBUG_PRIMS)
 		fprintf(stderr, "%s\n", __FUNCTION__);
 
 	r300UpdateShaders(rmesa);
-	if (r300EmitArrays(ctx))
-		return GL_TRUE;
+	r300EmitArrays(ctx);
 
 	r300UpdateShaderStates(rmesa);
 
 	r300EmitCacheFlush(rmesa);
-	r300EmitState(rmesa);
+	radeonEmitState(&rmesa->radeon);
 
 	for (i = 0; i < vb->PrimitiveCount; i++) {
 		GLuint prim = _tnl_translate_prim(&vb->Primitive[i]);
 		GLuint start = vb->Primitive[i].start;
 		GLuint end = vb->Primitive[i].start + vb->Primitive[i].count;
-		r300RunRenderPrimitive(rmesa, ctx, start, end, prim);
+		r300RunRenderPrimitive(ctx, start, end, prim);
 	}
 
 	r300EmitCacheFlush(rmesa);
 
-#ifdef USER_BUFFERS
-	r300UseArrays(ctx);
-#endif
+	radeonReleaseArrays(ctx, ~0);
+}
 
-	r300ReleaseArrays(ctx);
 
-	return GL_FALSE;
+static const char *getFallbackString(uint32_t bit)
+{
+	switch (bit) {
+		case R300_FALLBACK_VERTEX_PROGRAM :
+			return "vertex program";
+		case R300_FALLBACK_LINE_SMOOTH:
+			return "smooth lines";
+		case R300_FALLBACK_POINT_SMOOTH:
+			return "smooth points";
+		case R300_FALLBACK_POLYGON_SMOOTH:
+			return "smooth polygons";
+		case R300_FALLBACK_LINE_STIPPLE:
+			return "line stipple";
+		case R300_FALLBACK_POLYGON_STIPPLE:
+			return "polygon stipple";
+		case R300_FALLBACK_STENCIL_TWOSIDE:
+			return "two-sided stencil";
+		case R300_FALLBACK_RENDER_MODE:
+			return "render mode != GL_RENDER";
+		case R300_FALLBACK_FRAGMENT_PROGRAM:
+			return "fragment program";
+		case R300_FALLBACK_AOS_LIMIT:
+			return "aos limit";
+		case R300_FALLBACK_INVALID_BUFFERS:
+			return "invalid buffers";
+		default:
+			return "unknown";
+	}
 }
 
-#define FALLBACK_IF(expr)						\
-	do {								\
-		if (expr) {						\
-			if (1 || RADEON_DEBUG & DEBUG_FALLBACKS)	\
-				WARN_ONCE("Software fallback:%s\n",	\
-					  #expr);			\
-			return R300_FALLBACK_RAST;			\
-		}							\
-	} while(0)
-
-static int r300Fallback(GLcontext * ctx)
+void r300SwitchFallback(GLcontext *ctx, uint32_t bit, GLboolean mode)
 {
-	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	const unsigned back = ctx->Stencil._BackFace;
-
-	/* Do we need to use new-style shaders?
-	 * Also is there a better way to do this? */
-	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-		struct r500_fragment_program *fp = (struct r500_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
-		if (fp) {
-			if (!fp->translated) {
-				r500TranslateFragmentShader(r300, fp);
-				FALLBACK_IF(!fp->translated);
+	TNLcontext *tnl = TNL_CONTEXT(ctx);
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	uint32_t old_fallback = rmesa->fallback;
+	static uint32_t fallback_warn = 0;
+	
+	if (mode) {
+		if ((fallback_warn & bit) == 0) {
+			_mesa_fprintf(stderr, "WARNING! Falling back to software for %s\n", getFallbackString(bit));
+			fallback_warn |= bit;
+		}
+		rmesa->fallback |= bit;
+
+		/* update only if we change from no tcl fallbacks to some tcl fallbacks */
+		if (rmesa->options.hw_tcl_enabled) {
+			if (((old_fallback & R300_TCL_FALLBACK_MASK) == 0) &&
+				((bit & R300_TCL_FALLBACK_MASK) > 0)) {
+				R300_STATECHANGE(rmesa, vap_cntl_status);
+				rmesa->hw.vap_cntl_status.cmd[1] |= R300_VAP_TCL_BYPASS;
 			}
 		}
+
+		/* update only if we change from no raster fallbacks to some raster fallbacks */
+		if (((old_fallback & R300_RASTER_FALLBACK_MASK) == 0) &&
+			((bit & R300_RASTER_FALLBACK_MASK) > 0)) {
+			
+			radeon_firevertices(&rmesa->radeon);
+			rmesa->radeon.swtcl.RenderIndex = ~0;
+			_swsetup_Wakeup( ctx );
+		}
 	} else {
-		struct r300_fragment_program *fp = (struct r300_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
-		if (fp) {
-			if (!fp->translated) {
-				r300TranslateFragmentShader(r300, fp);
-				FALLBACK_IF(!fp->translated);
+		rmesa->fallback &= ~bit;
+
+		/* update only if we have disabled all tcl fallbacks */
+		if (rmesa->options.hw_tcl_enabled) {
+			if ((old_fallback & R300_TCL_FALLBACK_MASK) == bit) {
+				R300_STATECHANGE(rmesa, vap_cntl_status);
+				rmesa->hw.vap_cntl_status.cmd[1] &= ~R300_VAP_TCL_BYPASS;
 			}
 		}
-	}
 
-	FALLBACK_IF(ctx->RenderMode != GL_RENDER);
-
-	/* If GL_EXT_stencil_two_side is disabled, this fallback check can
-	 * be removed.
-	 */
-	FALLBACK_IF(ctx->Stencil.Ref[0] != ctx->Stencil.Ref[back]
-		    || ctx->Stencil.ValueMask[0] !=
-		    ctx->Stencil.ValueMask[back]
-		    || ctx->Stencil.WriteMask[0] !=
-		    ctx->Stencil.WriteMask[back]);
-
-	if (ctx->Extensions.NV_point_sprite || ctx->Extensions.ARB_point_sprite)
-		FALLBACK_IF(ctx->Point.PointSprite);
-
-	if (!r300->disable_lowimpact_fallback) {
-		FALLBACK_IF(ctx->Polygon.StippleFlag);
-		FALLBACK_IF(ctx->Multisample._Enabled);
-		FALLBACK_IF(ctx->Line.StippleFlag);
-		FALLBACK_IF(ctx->Line.SmoothFlag);
-		FALLBACK_IF(ctx->Point.SmoothFlag);
+		/* update only if we have disabled all raster fallbacks */
+		if ((old_fallback & R300_RASTER_FALLBACK_MASK) == bit) {
+			_swrast_flush( ctx );
+			
+			tnl->Driver.Render.Start = r300RenderStart;
+			tnl->Driver.Render.Finish = r300RenderFinish;
+			tnl->Driver.Render.PrimitiveNotify = r300RenderPrimitive;
+			tnl->Driver.Render.ResetLineStipple = r300ResetLineStipple;
+			tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
+			tnl->Driver.Render.CopyPV = _tnl_copy_pv;
+			tnl->Driver.Render.Interp = _tnl_interp;
+			
+			_tnl_invalidate_vertex_state( ctx, ~0 );
+			_tnl_invalidate_vertices( ctx, ~0 );
+		}
 	}
-
-	return R300_FALLBACK_NONE;
+	
 }
 
 static GLboolean r300RunNonTCLRender(GLcontext * ctx,
@@ -404,43 +529,15 @@ static GLboolean r300RunNonTCLRender(GLcontext * ctx,
 	if (RADEON_DEBUG & DEBUG_PRIMS)
 		fprintf(stderr, "%s\n", __FUNCTION__);
 
-	if (r300Fallback(ctx) >= R300_FALLBACK_RAST)
-		return GL_TRUE;
-
-	if (!(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
- 	        return GL_TRUE;
-
-	return r300RunRender(ctx, stage);
-}
-
-static GLboolean r300RunTCLRender(GLcontext * ctx,
-				  struct tnl_pipeline_stage *stage)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct r300_vertex_program *vp;
-
-	hw_tcl_on = future_hw_tcl_on;
-
-	if (RADEON_DEBUG & DEBUG_PRIMS)
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	if (hw_tcl_on == GL_FALSE)
+	if (rmesa->fallback & R300_RASTER_FALLBACK_MASK)
 		return GL_TRUE;
 
-	if (r300Fallback(ctx) >= R300_FALLBACK_TCL) {
-		hw_tcl_on = GL_FALSE;
+	if (rmesa->options.hw_tcl_enabled == GL_FALSE)
 		return GL_TRUE;
-	}
-
-	r300UpdateShaders(rmesa);
 
-	vp = (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
-	if (vp->native == GL_FALSE) {
-		hw_tcl_on = GL_FALSE;
-		return GL_TRUE;
-	}
+	r300RunRender(ctx, stage);
 
-	return r300RunRender(ctx, stage);
+	return GL_FALSE;
 }
 
 const struct tnl_pipeline_stage _r300_render_stage = {
@@ -451,12 +548,3 @@ const struct tnl_pipeline_stage _r300_render_stage = {
 	NULL,
 	r300RunNonTCLRender
 };
-
-const struct tnl_pipeline_stage _r300_tcl_stage = {
-	"r300 Hardware Transform, Clipping and Lighting",
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	r300RunTCLRender
-};
diff --git a/src/mesa/drivers/dri/r300/r300_render.h b/src/mesa/drivers/dri/r300/r300_render.h
new file mode 100644
index 0000000000..ec785474a6
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_render.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __R300_RENDER_H__
+#define __R300_RENDER_H__
+
+#include "main/mtypes.h"
+
+#define R300_FALLBACK_VERTEX_PROGRAM    (1 << 0)
+#define R300_TCL_FALLBACK_MASK           0x0000ffff	/* bits 0-15 */
+
+#define R300_FALLBACK_LINE_SMOOTH       (1 << 16)
+#define R300_FALLBACK_POINT_SMOOTH      (1 << 17)
+#define R300_FALLBACK_POLYGON_SMOOTH    (1 << 18)
+#define R300_FALLBACK_LINE_STIPPLE      (1 << 19)
+#define R300_FALLBACK_POLYGON_STIPPLE   (1 << 20)
+#define R300_FALLBACK_STENCIL_TWOSIDE   (1 << 21)
+#define R300_FALLBACK_RENDER_MODE       (1 << 22)
+#define R300_FALLBACK_FRAGMENT_PROGRAM  (1 << 23)
+#define R300_FALLBACK_AOS_LIMIT         (1 << 30)
+#define R300_FALLBACK_INVALID_BUFFERS   (1u << 31)	/* unsigned: 1 << 31 overflows a signed int */
+#define R300_RASTER_FALLBACK_MASK        0xffff0000	/* bits 16-31 */
+
+#define MASK_XYZW (R300_WRITE_ENA_X | R300_WRITE_ENA_Y | R300_WRITE_ENA_Z | R300_WRITE_ENA_W)
+#define MASK_X R300_WRITE_ENA_X
+#define MASK_Y R300_WRITE_ENA_Y
+#define MASK_Z R300_WRITE_ENA_Z
+#define MASK_W R300_WRITE_ENA_W
+
+#if SWIZZLE_X != R300_INPUT_ROUTE_SELECT_X || \
+    SWIZZLE_Y != R300_INPUT_ROUTE_SELECT_Y || \
+    SWIZZLE_Z != R300_INPUT_ROUTE_SELECT_Z || \
+    SWIZZLE_W != R300_INPUT_ROUTE_SELECT_W || \
+    SWIZZLE_ZERO != R300_INPUT_ROUTE_SELECT_ZERO || \
+    SWIZZLE_ONE != R300_INPUT_ROUTE_SELECT_ONE
+#error Cannot change these!
+#endif
+
+extern const struct tnl_pipeline_stage _r300_render_stage;
+
+extern void r300SwitchFallback(GLcontext *ctx, uint32_t bit, GLboolean mode);
+
+extern void r300RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim);
+
+#endif
diff --git a/src/mesa/drivers/dri/r300/r300_shader.c b/src/mesa/drivers/dri/r300/r300_shader.c
index f30fd986e0..0133b83796 100644
--- a/src/mesa/drivers/dri/r300/r300_shader.c
+++ b/src/mesa/drivers/dri/r300/r300_shader.c
@@ -1,18 +1,42 @@
+/*
+ * Copyright 2009 Maciej Cencora <m.cencora@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
 
 #include "main/glheader.h"
 
 #include "shader/program.h"
 #include "tnl/tnl.h"
 #include "r300_context.h"
-#include "r300_fragprog.h"
+#include "r300_fragprog_common.h"
 
 static struct gl_program *r300NewProgram(GLcontext * ctx, GLenum target,
 					 GLuint id)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	struct r300_vertex_program_cont *vp;
-	struct r300_fragment_program *r300_fp;
-	struct r500_fragment_program *r500_fp;
+	struct r300_fragment_program *fp;
 
 	switch (target) {
 	case GL_VERTEX_STATE_PROGRAM_NV:
@@ -20,28 +44,12 @@ static struct gl_program *r300NewProgram(GLcontext * ctx, GLenum target,
 		vp = CALLOC_STRUCT(r300_vertex_program_cont);
 		return _mesa_init_vertex_program(ctx, &vp->mesa_program,
 						 target, id);
-	case GL_FRAGMENT_PROGRAM_ARB:
-		if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-			r500_fp = CALLOC_STRUCT(r500_fragment_program);
-			r500_fp->ctx = ctx;
-			return _mesa_init_fragment_program(ctx, &r500_fp->mesa_program,
-							   target, id);
-		} else {
-			r300_fp = CALLOC_STRUCT(r300_fragment_program);
-			return _mesa_init_fragment_program(ctx, &r300_fp->mesa_program,
-							   target, id);
-		}
 
 	case GL_FRAGMENT_PROGRAM_NV:
-		if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-			r500_fp = CALLOC_STRUCT(r500_fragment_program);
-			return _mesa_init_fragment_program(ctx, &r500_fp->mesa_program,
-							   target, id);
-		} else {
-			r300_fp = CALLOC_STRUCT(r300_fragment_program);
-			return _mesa_init_fragment_program(ctx, &r300_fp->mesa_program,
-							   target, id);
-		}
+	case GL_FRAGMENT_PROGRAM_ARB:
+		fp = CALLOC_STRUCT(r300_fragment_program);
+		return _mesa_init_fragment_program(ctx, &fp->Base, target, id);
+
 	default:
 		_mesa_problem(ctx, "Bad target in r300NewProgram");
 	}
@@ -57,20 +65,15 @@ static void r300DeleteProgram(GLcontext * ctx, struct gl_program *prog)
 static void
 r300ProgramStringNotify(GLcontext * ctx, GLenum target, struct gl_program *prog)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	struct r300_vertex_program_cont *vp = (void *)prog;
 	struct r300_fragment_program *r300_fp = (struct r300_fragment_program *)prog;
-	struct r500_fragment_program *r500_fp = (struct r500_fragment_program *)prog;
 
 	switch (target) {
 	case GL_VERTEX_PROGRAM_ARB:
 		vp->progs = NULL;
 		break;
 	case GL_FRAGMENT_PROGRAM_ARB:
-		if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-			r500_fp->translated = GL_FALSE;
-		else
-			r300_fp->translated = GL_FALSE;
+		r300_fp->translated = GL_FALSE;
 		break;
 	}
 
@@ -81,7 +84,14 @@ r300ProgramStringNotify(GLcontext * ctx, GLenum target, struct gl_program *prog)
 static GLboolean
 r300IsProgramNative(GLcontext * ctx, GLenum target, struct gl_program *prog)
 {
-	return GL_TRUE;
+	struct r300_fragment_program *frag = (struct r300_fragment_program *)prog;
+
+	/* Every target other than ARB fragment programs is reported as native. */
+	if (target != GL_FRAGMENT_PROGRAM_ARB)
+		return GL_TRUE;
+	if (!frag->translated)	/* translate first if that has not happened yet */
+		r300TranslateFragmentShader(ctx, &frag->Base);
+	return !frag->error;
 }
 
 void r300InitShaderFuncs(struct dd_function_table *functions)
diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c
index 79f0b3625c..c0eda977db 100644
--- a/src/mesa/drivers/dri/r300/r300_state.c
+++ b/src/mesa/drivers/dri/r300/r300_state.c
@@ -42,6 +42,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/macros.h"
 #include "main/context.h"
 #include "main/dd.h"
+#include "main/framebuffer.h"
 #include "main/simple_list.h"
 #include "main/api_arrayelt.h"
 #include "main/texformat.h"
@@ -52,22 +53,22 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "shader/prog_statevars.h"
 #include "vbo/vbo.h"
 #include "tnl/tnl.h"
+#include "tnl/t_vp_build.h"
 
-#include "radeon_ioctl.h"
-#include "radeon_state.h"
 #include "r300_context.h"
 #include "r300_ioctl.h"
 #include "r300_state.h"
 #include "r300_reg.h"
 #include "r300_emit.h"
-#include "r300_fragprog.h"
 #include "r300_tex.h"
+#include "r300_fragprog_common.h"
+#include "r300_fragprog.h"
+#include "r500_fragprog.h"
+#include "r300_render.h"
+#include "r300_vertprog.h"
 
 #include "drirenderbuffer.h"
 
-extern int future_hw_tcl_on;
-extern void _tnl_UpdateFixedFunctionProgram(GLcontext * ctx);
-
 static void r300BlendColor(GLcontext * ctx, const GLfloat cf[4])
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
@@ -366,7 +367,7 @@ static void r300ClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
 	GLint *ip;
 
 	/* no VAP UCP on non-TCL chipsets */
-	if (!(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+	if (!rmesa->options.hw_tcl_enabled)
 			return;
 
 	p = (GLint) plane - (GLint) GL_CLIP_PLANE0;
@@ -385,7 +386,7 @@ static void r300SetClipPlaneState(GLcontext * ctx, GLenum cap, GLboolean state)
 	GLuint p;
 
 	/* no VAP UCP on non-TCL chipsets */
-	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+	if (!r300->options.hw_tcl_enabled)
 		return;
 
 	p = cap - GL_CLIP_PLANE0;
@@ -451,24 +452,16 @@ static void r300SetPolygonOffsetState(GLcontext * ctx, GLboolean state)
 
 static GLboolean current_fragment_program_writes_depth(GLcontext* ctx)
 {
-	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	const struct r300_fragment_program *current = (const struct r300_fragment_program *) ctx->FragmentProgram._Current;
 
-	if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
-		struct r300_fragment_program *fp = (struct r300_fragment_program *)
-			(char *)ctx->FragmentProgram._Current;
-		return (fp && fp->WritesDepth);
-	} else {
-		struct r500_fragment_program* fp =
-			(struct r500_fragment_program*)(char*)
-			ctx->FragmentProgram._Current;
-		return (fp && fp->writes_depth);
-	}
+	return (current && current->writes_depth);	/* false when there is no current program */
 }
 
 static void r300SetEarlyZState(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	GLuint topZ = R300_ZTOP_ENABLE;
+	GLuint w_fmt, fgdepthsrc;
 
 	if (ctx->Color.AlphaEnabled && ctx->Color.AlphaFunc != GL_ALWAYS)
 		topZ = R300_ZTOP_DISABLE;
@@ -485,6 +478,26 @@ static void r300SetEarlyZState(GLcontext * ctx)
 		R300_STATECHANGE(r300, zstencil_format);
 		r300->hw.zstencil_format.cmd[2] = topZ;
 	}
+
+	/* w_fmt value is set to get best performance
+	* see p.130 R5xx 3D acceleration guide v1.3 */
+	if (current_fragment_program_writes_depth(ctx)) {
+		fgdepthsrc = R300_FG_DEPTH_SRC_SHADER;
+		w_fmt = R300_W_FMT_W24 | R300_W_SRC_US;
+	} else {
+		fgdepthsrc = R300_FG_DEPTH_SRC_SCAN;
+		w_fmt = R300_W_FMT_W0 | R300_W_SRC_US;
+	}
+
+	if (w_fmt != r300->hw.us_out_fmt.cmd[5]) {
+		R300_STATECHANGE(r300, us_out_fmt);
+		r300->hw.us_out_fmt.cmd[5] = w_fmt;
+	}
+
+	if (fgdepthsrc != r300->hw.fg_depth_src.cmd[1]) {
+		R300_STATECHANGE(r300, fg_depth_src);
+		r300->hw.fg_depth_src.cmd[1] = fgdepthsrc;
+	}
 }
 
 static void r300SetAlphaState(GLcontext * ctx)
@@ -535,8 +548,6 @@ static void r300SetAlphaState(GLcontext * ctx)
 	R300_STATECHANGE(r300, at);
 	r300->hw.at.cmd[R300_AT_ALPHA_TEST] = pp_misc;
 	r300->hw.at.cmd[R300_AT_UNKNOWN] = 0;
-
-	r300SetEarlyZState(ctx);
 }
 
 static void r300AlphaFunc(GLcontext * ctx, GLenum func, GLfloat ref)
@@ -584,15 +595,35 @@ static void r300SetDepthState(GLcontext * ctx)
 		r300->hw.zs.cmd[R300_ZS_CNTL_1] |=
 		    translate_func(ctx->Depth.Func) << R300_Z_FUNC_SHIFT;
 	}
+}
 
-	r300SetEarlyZState(ctx);
+static void r300CatchStencilFallback(GLcontext *ctx)
+{
+	/* Raise the two-sided stencil fallback while stencilling is enabled with
+	 * front/back reference values or masks that differ; clear it otherwise. */
+	const unsigned back = ctx->Stencil._BackFace;
+	const GLboolean faces_differ =
+		ctx->Stencil.Ref[0] != ctx->Stencil.Ref[back]
+		|| ctx->Stencil.ValueMask[0] != ctx->Stencil.ValueMask[back]
+		|| ctx->Stencil.WriteMask[0] != ctx->Stencil.WriteMask[back];
+
+	r300SwitchFallback(ctx, R300_FALLBACK_STENCIL_TWOSIDE, ctx->Stencil._Enabled && faces_differ);
+}
 
 static void r300SetStencilState(GLcontext * ctx, GLboolean state)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	GLboolean hw_stencil = GL_FALSE;
 
-	if (r300->state.stencil.hw_stencil) {
+	r300CatchStencilFallback(ctx);
+
+	if (ctx->DrawBuffer) {
+		struct radeon_renderbuffer *rrbStencil
+			= radeon_get_renderbuffer(ctx->DrawBuffer, BUFFER_STENCIL);
+		hw_stencil = (rrbStencil && rrbStencil->bo);
+	}
+
+	if (hw_stencil) {
 		R300_STATECHANGE(r300, zs);
 		if (state) {
 			r300->hw.zs.cmd[R300_ZS_CNTL_0] |=
@@ -601,10 +632,6 @@ static void r300SetStencilState(GLcontext * ctx, GLboolean state)
 			r300->hw.zs.cmd[R300_ZS_CNTL_0] &=
 			    ~R300_STENCIL_ENABLE;
 		}
-	} else {
-#if R200_MERGED
-		FALLBACK(&r300->radeon, RADEON_FALLBACK_STENCIL, state);
-#endif
 	}
 }
 
@@ -737,7 +764,12 @@ static void r300ColorMask(GLcontext * ctx,
 static void r300PointSize(GLcontext * ctx, GLfloat size)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-        /* same size limits for AA, non-AA points */
+
+	/* We need to clamp to user defined range here, because
+	 * the HW clamping happens only for per vertex point size. */
+	size = CLAMP(size, ctx->Point.MinSize, ctx->Point.MaxSize);
+
+	/* same size limits for AA, non-AA points */
 	size = CLAMP(size, ctx->Const.MinPointSize, ctx->Const.MaxPointSize);
 
 	R300_STATECHANGE(r300, ps);
@@ -830,29 +862,33 @@ static void r300ShadeModel(GLcontext * ctx, GLenum mode)
 
 	R300_STATECHANGE(rmesa, shade);
 	rmesa->hw.shade.cmd[1] = 0x00000002;
+	R300_STATECHANGE(rmesa, shade2);
 	switch (mode) {
 	case GL_FLAT:
-		rmesa->hw.shade.cmd[2] = R300_RE_SHADE_MODEL_FLAT;
+		rmesa->hw.shade2.cmd[1] = R300_RE_SHADE_MODEL_FLAT;
 		break;
 	case GL_SMOOTH:
-		rmesa->hw.shade.cmd[2] = R300_RE_SHADE_MODEL_SMOOTH;
+		rmesa->hw.shade2.cmd[1] = R300_RE_SHADE_MODEL_SMOOTH;
 		break;
 	default:
 		return;
 	}
-	rmesa->hw.shade.cmd[3] = 0x00000000;
-	rmesa->hw.shade.cmd[4] = 0x00000000;
+	rmesa->hw.shade2.cmd[2] = 0x00000000;
+	rmesa->hw.shade2.cmd[3] = 0x00000000;
 }
 
 static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
 				    GLenum func, GLint ref, GLuint mask)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	GLuint refmask =
-	    ((ctx->Stencil.Ref[0] & 0xff) << R300_STENCILREF_SHIFT)
-	     | ((ctx->Stencil.ValueMask[0] & 0xff) << R300_STENCILMASK_SHIFT);
-	const unsigned back = ctx->Stencil._BackFace;
+	GLuint refmask;
 	GLuint flag;
+	const unsigned back = ctx->Stencil._BackFace;
+
+	r300CatchStencilFallback(ctx);
+
+	refmask = ((ctx->Stencil.Ref[0] & 0xff) << R300_STENCILREF_SHIFT)
+	     | ((ctx->Stencil.ValueMask[0] & 0xff) << R300_STENCILMASK_SHIFT);
 
 	R300_STATECHANGE(rmesa, zs);
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_STENCIL_FRONT_BACK;
@@ -880,6 +916,8 @@ static void r300StencilMaskSeparate(GLcontext * ctx, GLenum face, GLuint mask)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 
+	r300CatchStencilFallback(ctx);
+
 	R300_STATECHANGE(rmesa, zs);
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_2] &=
 	    ~(R300_STENCILREF_MASK <<
@@ -896,6 +934,8 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	const unsigned back = ctx->Stencil._BackFace;
 
+	r300CatchStencilFallback(ctx);
+
 	R300_STATECHANGE(rmesa, zs);
 	/* It is easier to mask what's left.. */
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] &=
@@ -924,28 +964,32 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
  * Window position and viewport transformation
  */
 
-/*
- * To correctly position primitives:
- */
-#define SUBPIXEL_X 0.125
-#define SUBPIXEL_Y 0.125
-
 static void r300UpdateWindow(GLcontext * ctx)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
 	GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
 	GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
 	const GLfloat *v = ctx->Viewport._WindowMap.m;
+	const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+	const GLboolean render_to_fbo = (ctx->DrawBuffer->Name != 0);
+	GLfloat y_scale, y_bias;
+
+	if (render_to_fbo) {
+		y_scale = 1.0;
+		y_bias = 0;
+	} else {
+		y_scale = -1.0;
+		y_bias = yoffset;
+	}
 
 	GLfloat sx = v[MAT_SX];
-	GLfloat tx = v[MAT_TX] + xoffset + SUBPIXEL_X;
-	GLfloat sy = -v[MAT_SY];
-	GLfloat ty = (-v[MAT_TY]) + yoffset + SUBPIXEL_Y;
-	GLfloat sz = v[MAT_SZ] * rmesa->state.depth.scale;
-	GLfloat tz = v[MAT_TZ] * rmesa->state.depth.scale;
+	GLfloat tx = v[MAT_TX] + xoffset;
+	GLfloat sy = v[MAT_SY] * y_scale;
+	GLfloat ty = (v[MAT_TY] * y_scale) + y_bias;
+	GLfloat sz = v[MAT_SZ] * depthScale;
+	GLfloat tz = v[MAT_TZ] * depthScale;
 
-	R300_FIREVERTICES(rmesa);
 	R300_STATECHANGE(rmesa, vpt);
 
 	rmesa->hw.vpt.cmd[R300_VPT_XSCALE] = r300PackFloat32(sx);
@@ -964,6 +1008,8 @@ static void r300Viewport(GLcontext * ctx, GLint x, GLint y,
 	 * values, or keep the originals hanging around.
 	 */
 	r300UpdateWindow(ctx);
+
+	radeon_viewport(ctx, x, y, width, height);
 }
 
 static void r300DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
@@ -974,13 +1020,13 @@ static void r300DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
 void r300UpdateViewportOffset(GLcontext * ctx)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = ((radeonContextPtr) rmesa)->dri.drawable;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
 	GLfloat xoffset = (GLfloat) dPriv->x;
 	GLfloat yoffset = (GLfloat) dPriv->y + dPriv->h;
 	const GLfloat *v = ctx->Viewport._WindowMap.m;
 
-	GLfloat tx = v[MAT_TX] + xoffset + SUBPIXEL_X;
-	GLfloat ty = (-v[MAT_TY]) + yoffset + SUBPIXEL_Y;
+	GLfloat tx = v[MAT_TX] + xoffset;
+	GLfloat ty = (-v[MAT_TY]) + yoffset;
 
 	if (rmesa->hw.vpt.cmd[R300_VPT_XOFFSET] != r300PackFloat32(tx) ||
 	    rmesa->hw.vpt.cmd[R300_VPT_YOFFSET] != r300PackFloat32(ty)) {
@@ -996,64 +1042,6 @@ void r300UpdateViewportOffset(GLcontext * ctx)
 	radeonUpdateScissor(ctx);
 }
 
-/**
- * Tell the card where to render (offset, pitch).
- * Effected by glDrawBuffer, etc
- */
-void r300UpdateDrawBuffer(GLcontext * ctx)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	r300ContextPtr r300 = rmesa;
-	struct gl_framebuffer *fb = ctx->DrawBuffer;
-	driRenderbuffer *drb;
-
-	if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
-		/* draw to front */
-		drb =
-		    (driRenderbuffer *) fb->Attachment[BUFFER_FRONT_LEFT].
-		    Renderbuffer;
-	} else if (fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
-		/* draw to back */
-		drb =
-		    (driRenderbuffer *) fb->Attachment[BUFFER_BACK_LEFT].
-		    Renderbuffer;
-	} else {
-		/* drawing to multiple buffers, or none */
-		return;
-	}
-
-	assert(drb);
-	assert(drb->flippedPitch);
-
-	R300_STATECHANGE(rmesa, cb);
-
-	r300->hw.cb.cmd[R300_CB_OFFSET] = drb->flippedOffset +	//r300->radeon.state.color.drawOffset +
-	    r300->radeon.radeonScreen->fbLocation;
-	r300->hw.cb.cmd[R300_CB_PITCH] = drb->flippedPitch;	//r300->radeon.state.color.drawPitch;
-
-	if (r300->radeon.radeonScreen->cpp == 4)
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
-	else
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
-
-	if (r300->radeon.sarea->tiling_enabled)
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
-#if 0
-	R200_STATECHANGE(rmesa, ctx);
-
-	/* Note: we used the (possibly) page-flipped values */
-	rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET]
-	    = ((drb->flippedOffset + rmesa->r200Screen->fbLocation)
-	       & R200_COLOROFFSET_MASK);
-	rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = drb->flippedPitch;
-
-	if (rmesa->sarea->tiling_enabled) {
-		rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |=
-		    R200_COLOR_TILE_ENABLE;
-	}
-#endif
-}
-
 static void
 r300FetchStateParameter(GLcontext * ctx,
 			const gl_state_index state[STATE_LENGTH],
@@ -1064,12 +1052,14 @@ r300FetchStateParameter(GLcontext * ctx,
 	switch (state[0]) {
 	case STATE_INTERNAL:
 		switch (state[1]) {
-		case STATE_R300_WINDOW_DIMENSION:
-			value[0] = r300->radeon.dri.drawable->w * 0.5f;	/* width*0.5 */
-			value[1] = r300->radeon.dri.drawable->h * 0.5f;	/* height*0.5 */
-			value[2] = 0.5F;	/* for moving range [-1 1] -> [0 1] */
-			value[3] = 1.0F;	/* not used */
-			break;
+		case STATE_R300_WINDOW_DIMENSION: {
+				__DRIdrawablePrivate * drawable = radeon_get_drawable(&r300->radeon);
+				value[0] = drawable->w * 0.5f;	/* width*0.5 */
+				value[1] = drawable->h * 0.5f;	/* height*0.5 */
+				value[2] = 0.5F;	/* for moving range [-1 1] -> [0 1] */
+				value[3] = 1.0F;	/* not used */
+				break;
+			}
 
 		case STATE_R300_TEXRECT_FACTOR:{
 				struct gl_texture_object *t =
@@ -1109,14 +1099,14 @@ void r300UpdateStateParameters(GLcontext * ctx, GLuint new_state)
 	struct gl_program_parameter_list *paramList;
 	GLuint i;
 
-	if (!(new_state & (_NEW_BUFFERS | _NEW_PROGRAM)))
+	if (!(new_state & (_NEW_BUFFERS | _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS)))
 		return;
 
 	fp = (struct r300_fragment_program *)ctx->FragmentProgram._Current;
 	if (!fp)
 		return;
 
-	paramList = fp->mesa_program.Base.Parameters;
+	paramList = fp->Base.Base.Parameters;
 
 	if (!paramList)
 		return;
@@ -1235,9 +1225,8 @@ static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	int i;
-	struct r300_fragment_program *fp = (struct r300_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
-	struct r300_fragment_program_code *code = &fp->code;
+	struct r300_fragment_program *fp = (struct r300_fragment_program *) ctx->FragmentProgram._Current;
+	struct r300_fragment_program_code *code = &fp->code.r300;
 
 	R300_STATECHANGE(r300, fpt);
 
@@ -1271,15 +1260,15 @@ static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
 	}
 
 	r300->hw.fpt.cmd[R300_FPT_CMD_0] =
-		cmdpacket0(R300_US_TEX_INST_0, code->tex.length);
+		cmdpacket0(r300->radeon.radeonScreen,
+                   R300_US_TEX_INST_0, code->tex.length);
 }
 
 static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
 {
 	int i;
-	struct r500_fragment_program *fp = (struct r500_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
-	struct r500_fragment_program_code *code = &fp->code;
+	struct r300_fragment_program *fp = (struct r300_fragment_program *) ctx->FragmentProgram._Current;
+	struct r500_fragment_program_code *code = &fp->code.r500;
 
 	/* find all the texture instructions and relocate the texture units */
 	for (i = 0; i < code->inst_end + 1; i++) {
@@ -1322,7 +1311,7 @@ static GLuint translate_lod_bias(GLfloat bias)
 static void r300SetupTextures(GLcontext * ctx)
 {
 	int i, mtu;
-	struct r300_tex_obj *t;
+	struct radeon_tex_obj *t;
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	int hw_tmu = 0;
 	int last_hw_tmu = -1;	/* -1 translates into no setup costs for fields */
@@ -1356,21 +1345,16 @@ static void r300SetupTextures(GLcontext * ctx)
 	/* We cannot let disabled tmu offsets pass DRM */
 	for (i = 0; i < mtu; i++) {
 		if (ctx->Texture.Unit[i]._ReallyEnabled) {
-
-#if 0				/* Enables old behaviour */
-			hw_tmu = i;
-#endif
 			tmu_mappings[i] = hw_tmu;
 
-			t = (r300TexObjPtr) r300->state.texture.unit[i].texobj->DriverData;
-			/* XXX questionable fix for bug 9170: */
+			t = radeon_tex_obj(ctx->Texture.Unit[i]._Current);
 			if (!t)
 				continue;
 
-			if ((t->format & 0xffffff00) == 0xffffff00) {
+			if ((t->pp_txformat & 0xffffff00) == 0xffffff00) {
 				WARN_ONCE
 				    ("unknown texture format (entry %x) encountered. Help me !\n",
-				     t->format & 0xff);
+				     t->pp_txformat & 0xff);
 			}
 
 			if (RADEON_DEBUG & DEBUG_STATE)
@@ -1381,29 +1365,28 @@ static void r300SetupTextures(GLcontext * ctx)
 
 			r300->hw.tex.filter.cmd[R300_TEX_VALUE_0 +
 						hw_tmu] =
-			    gen_fixed_filter(t->filter) | (hw_tmu << 28);
+			    gen_fixed_filter(t->pp_txfilter) | (hw_tmu << 28);
 			/* Note: There is a LOD bias per texture unit and a LOD bias
 			 * per texture object. We add them here to get the correct behaviour.
 			 * (The per-texture object LOD bias was introduced in OpenGL 1.4
 			 * and is not present in the EXT_texture_object extension).
 			 */
 			r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] =
-				t->filter_1 |
-				translate_lod_bias(ctx->Texture.Unit[i].LodBias + t->base.tObj->LodBias);
+				t->pp_txfilter_1 |
+				translate_lod_bias(ctx->Texture.Unit[i].LodBias + t->base.LodBias);
 			r300->hw.tex.size.cmd[R300_TEX_VALUE_0 + hw_tmu] =
-			    t->size;
+			    t->pp_txsize;
 			r300->hw.tex.format.cmd[R300_TEX_VALUE_0 +
-						hw_tmu] = t->format;
+						hw_tmu] = t->pp_txformat;
 			r300->hw.tex.pitch.cmd[R300_TEX_VALUE_0 + hw_tmu] =
-			    t->pitch_reg;
-			r300->hw.tex.offset.cmd[R300_TEX_VALUE_0 +
-						hw_tmu] = t->offset;
+			  t->pp_txpitch;
+			r300->hw.textures[hw_tmu] = t;
 
-			if (t->offset & R300_TXO_MACRO_TILE) {
+			if (t->tile_bits & R300_TXO_MACRO_TILE) {
 				WARN_ONCE("macro tiling enabled!\n");
 			}
 
-			if (t->offset & R300_TXO_MICRO_TILE) {
+			if (t->tile_bits & R300_TXO_MICRO_TILE) {
 				WARN_ONCE("micro tiling enabled!\n");
 			}
 
@@ -1420,37 +1403,33 @@ static void r300SetupTextures(GLcontext * ctx)
 	}
 
 	r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FILTER0_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER0_0, last_hw_tmu + 1);
 	r300->hw.tex.filter_1.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FILTER1_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER1_0, last_hw_tmu + 1);
 	r300->hw.tex.size.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_SIZE_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_SIZE_0, last_hw_tmu + 1);
 	r300->hw.tex.format.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FORMAT_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT_0, last_hw_tmu + 1);
 	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FORMAT2_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT2_0, last_hw_tmu + 1);
 	r300->hw.tex.offset.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_OFFSET_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_OFFSET_0, last_hw_tmu + 1);
 	r300->hw.tex.chroma_key.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_CHROMA_KEY_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_CHROMA_KEY_0, last_hw_tmu + 1);
 	r300->hw.tex.border_color.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_BORDER_COLOR_0, last_hw_tmu + 1);
-
-	if (!fp)		/* should only happenen once, just after context is created */
-		return;
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_BORDER_COLOR_0, last_hw_tmu + 1);
 
 	if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
-		if (fp->mesa_program.UsesKill && last_hw_tmu < 0) {
+		if (fp->Base.UsesKill && last_hw_tmu < 0) {
 			// The KILL operation requires the first texture unit
 			// to be enabled.
 			r300->hw.txe.cmd[R300_TXE_ENABLE] |= 1;
 			r300->hw.tex.filter.cmd[R300_TEX_VALUE_0] = 0;
 			r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
-				cmdpacket0(R300_TX_FILTER0_0, 1);
+				cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER0_0, 1);
 		}
-		r300SetupFragmentShaderTextures(ctx, tmu_mappings);
-	} else
-		r500SetupFragmentShaderTextures(ctx, tmu_mappings);
+	}
+	r300->vtbl.SetupFragmentShaderTextures(ctx, tmu_mappings);
 
 	if (RADEON_DEBUG & DEBUG_STATE)
 		fprintf(stderr, "TX_ENABLE: %08x  last_hw_tmu=%d\n",
@@ -1469,26 +1448,21 @@ union r300_outputs_written {
 static void r300SetupRSUnit(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-        TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *VB = &tnl->vb;
 	union r300_outputs_written OutputsWritten;
 	GLuint InputsRead;
 	int fp_reg, high_rr;
 	int col_ip, tex_ip;
 	int rs_tex_count = 0;
-	int i, count, col_fmt;
+	int i, col_fmt, hw_tcl_on;
+
+	hw_tcl_on = r300->options.hw_tcl_enabled;
 
 	if (hw_tcl_on)
-		OutputsWritten.vp_outputs = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
+		OutputsWritten.vp_outputs = r300->selected_vp->key.OutputsWritten;
 	else
-		RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->state.render_inputs_bitset);
+		RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->render_inputs_bitset);
 
-	if (ctx->FragmentProgram._Current)
-		InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
-	else {
-		fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
-		return;		/* This should only ever happen once.. */
-	}
+	InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
 
 	R300_STATECHANGE(r300, ri);
 	R300_STATECHANGE(r300, rc);
@@ -1507,15 +1481,7 @@ static void r300SetupRSUnit(GLcontext * ctx)
 
 	if (InputsRead & FRAG_BIT_COL0) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
-			count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
-			if (count == 4)
-			    col_fmt = R300_RS_COL_FMT_RGBA;
-			else if (count == 3)
-			    col_fmt = R300_RS_COL_FMT_RGB1;
-			else
-			    col_fmt = R300_RS_COL_FMT_0001;
-
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(col_fmt);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
 			r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R300_RS_INST_COL_ID(col_ip) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_COL0;
 			++col_ip;
@@ -1527,15 +1493,7 @@ static void r300SetupRSUnit(GLcontext * ctx)
 
 	if (InputsRead & FRAG_BIT_COL1) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
-			count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
-			if (count == 4)
-			    col_fmt = R300_RS_COL_FMT_RGBA;
-			else if (count == 3)
-			    col_fmt = R300_RS_COL_FMT_RGB1;
-			else
-			    col_fmt = R300_RS_COL_FMT_0001;
-
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(col_fmt);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
 			r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R300_RS_INST_COL_ID(col_ip) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_COL1;
 			++col_ip;
@@ -1545,6 +1503,7 @@ static void r300SetupRSUnit(GLcontext * ctx)
 		}
 	}
 
+	/* We always route 4 texcoord components */
 	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
 		if (! ( InputsRead & FRAG_BIT_TEX(i) ) )
 		    continue;
@@ -1554,33 +1513,27 @@ static void r300SetupRSUnit(GLcontext * ctx)
 		    continue;
 		}
 
-		int swiz;
-
-		/* with TCL we always seem to route 4 components */
-		if (hw_tcl_on)
-		  count = 4;
-		else
-		  count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
-
-		switch(count) {
-		case 4: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3); break;
-		case 3: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(R300_RS_SEL_K1); break;
-		default:
-		case 1:
-		case 2: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(R300_RS_SEL_K0) | R300_RS_SEL_Q(R300_RS_SEL_K1); break;
-		};
-
-		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= swiz | R300_RS_TEX_PTR(rs_tex_count);
+		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3) | R300_RS_TEX_PTR(rs_tex_count);
 		r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
 		InputsRead &= ~(FRAG_BIT_TEX0 << i);
-		rs_tex_count += count;
+		rs_tex_count += 4;
+		++tex_ip;
+		++fp_reg;
+	}
+
+	if (InputsRead & FRAG_BIT_WPOS) {
+		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3) | R300_RS_TEX_PTR(rs_tex_count);
+		r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
+		InputsRead &= ~FRAG_BIT_WPOS;
+		rs_tex_count += 4;
 		++tex_ip;
 		++fp_reg;
 	}
 
 	if (InputsRead & FRAG_BIT_FOGC) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_FOGC, _TNL_ATTRIB_FOG)) {
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |=  R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3) |  R300_RS_TEX_PTR(rs_tex_count);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= R300_RS_SEL_S(0) | R300_RS_SEL_T(R300_RS_SEL_K0) | R300_RS_SEL_R(R300_RS_SEL_K0);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= R300_RS_SEL_Q(R300_RS_SEL_K1) | R300_RS_TEX_PTR(rs_tex_count);
 			r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_FOGC;
 			rs_tex_count += 4;
@@ -1591,27 +1544,19 @@ static void r300SetupRSUnit(GLcontext * ctx)
 		}
 	}
 
-	if (InputsRead & FRAG_BIT_WPOS) {
-		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |=  R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3) |  R300_RS_TEX_PTR(rs_tex_count);
-		r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
-		InputsRead &= ~FRAG_BIT_WPOS;
-		rs_tex_count += 4;
-		++tex_ip;
-		++fp_reg;
-	}
-	InputsRead &= ~FRAG_BIT_WPOS;
-
 	/* Setup default color if no color or tex was set */
 	if (rs_tex_count == 0 && col_ip == 0) {
-		r300->hw.rr.cmd[R300_RR_INST_0] = R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(0) | R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
+		r300->hw.rr.cmd[R300_RR_INST_0] = R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_ADDR(0);
+		r300->hw.ri.cmd[R300_RI_INTERP_0] = R300_RS_COL_PTR(0) | R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
 		++col_ip;
 	}
 
 	high_rr = (col_ip > tex_ip) ? col_ip : tex_ip;
-	r300->hw.rc.cmd[1] |= (rs_tex_count << R300_IT_COUNT_SHIFT)  | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
+	r300->hw.rc.cmd[1] |= (rs_tex_count << R300_IT_COUNT_SHIFT) | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
 	r300->hw.rc.cmd[2] |= high_rr - 1;
 
-	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R300_RS_INST_0, high_rr);
+	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_INST_0, high_rr);
+	r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_IP_0, high_rr);
 
 	if (InputsRead)
 		WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
@@ -1620,26 +1565,21 @@ static void r300SetupRSUnit(GLcontext * ctx)
 static void r500SetupRSUnit(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-        TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *VB = &tnl->vb;
 	union r300_outputs_written OutputsWritten;
 	GLuint InputsRead;
 	int fp_reg, high_rr;
 	int col_ip, tex_ip;
 	int rs_tex_count = 0;
-	int i, count, col_fmt;
+	int i, col_fmt, hw_tcl_on;
+
+	hw_tcl_on = r300->options.hw_tcl_enabled;
 
 	if (hw_tcl_on)
-		OutputsWritten.vp_outputs = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
+		OutputsWritten.vp_outputs = r300->selected_vp->key.OutputsWritten;
 	else
-		RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->state.render_inputs_bitset);
+		RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->render_inputs_bitset);
 
-	if (ctx->FragmentProgram._Current)
-		InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
-	else {
-		fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
-		return;		/* This should only ever happen once.. */
-	}
+	InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
 
 	R300_STATECHANGE(r300, ri);
 	R300_STATECHANGE(r300, rc);
@@ -1658,15 +1598,7 @@ static void r500SetupRSUnit(GLcontext * ctx)
 
 	if (InputsRead & FRAG_BIT_COL0) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
-			count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
-			if (count == 4)
-			    col_fmt = R300_RS_COL_FMT_RGBA;
-			else if (count == 3)
-			    col_fmt = R300_RS_COL_FMT_RGB1;
-			else
-			    col_fmt = R300_RS_COL_FMT_0001;
-
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(col_fmt);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
 			r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R500_RS_INST_COL_ID(col_ip) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_COL0;
 			++col_ip;
@@ -1678,15 +1610,7 @@ static void r500SetupRSUnit(GLcontext * ctx)
 
 	if (InputsRead & FRAG_BIT_COL1) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
-			count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
-			if (count == 4)
-			    col_fmt = R300_RS_COL_FMT_RGBA;
-			else if (count == 3)
-			    col_fmt = R300_RS_COL_FMT_RGB1;
-			else
-			    col_fmt = R300_RS_COL_FMT_0001;
-
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(col_fmt);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
 			r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R500_RS_INST_COL_ID(col_ip) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_COL1;
 			++col_ip;
@@ -1696,7 +1620,7 @@ static void r500SetupRSUnit(GLcontext * ctx)
 		}
 	}
 
-
+	/* We always route 4 texcoord components */
 	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
 		if (! ( InputsRead & FRAG_BIT_TEX(i) ) )
 		    continue;
@@ -1706,55 +1630,37 @@ static void r500SetupRSUnit(GLcontext * ctx)
 		    continue;
 		}
 
-		int swiz = 0;
-
-		/* with TCL we always seem to route 4 components */
-		if (hw_tcl_on)
-		  count = 4;
-		else
-		  count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
-
-		if (count == 4) {
-			swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
-			swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
-			swiz |= (rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT;
-			swiz |= (rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT;
-		} else if (count == 3) {
-			swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
-			swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
-			swiz |= (rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT;
-			swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
-		} else if (count == 2) {
-			swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
-			swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
-			swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
-		} else if (count == 1) {
-			swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT;
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
-			swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
-		} else {
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT;
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT;
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
-			swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
-		}
+		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
+			((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
+			((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
+			((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
 
-		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= swiz;
 		r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
 		InputsRead &= ~(FRAG_BIT_TEX0 << i);
-		rs_tex_count += count;
+		rs_tex_count += 4;
+		++tex_ip;
+		++fp_reg;
+	}
+
+	if (InputsRead & FRAG_BIT_WPOS) {
+		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
+			((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
+			((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
+			((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
+
+		r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
+		InputsRead &= ~FRAG_BIT_WPOS;
+		rs_tex_count += 4;
 		++tex_ip;
 		++fp_reg;
 	}
 
 	if (InputsRead & FRAG_BIT_FOGC) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_FOGC, _TNL_ATTRIB_FOG)) {
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
-				((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
-				((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
-				((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= (rs_tex_count << R500_RS_IP_TEX_PTR_S_SHIFT) |
+				(R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
+				(R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
+				(R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT);
 
 			r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_FOGC;
@@ -1766,87 +1672,27 @@ static void r500SetupRSUnit(GLcontext * ctx)
 		}
 	}
 
-	if (InputsRead & FRAG_BIT_WPOS) {
-		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
-				((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
-				((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
-				((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
-
-		r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
-		InputsRead &= ~FRAG_BIT_WPOS;
-		rs_tex_count += 4;
-		++tex_ip;
-		++fp_reg;
-	}
-
 	/* Setup default color if no color or tex was set */
 	if (rs_tex_count == 0 && col_ip == 0) {
-		r300->hw.rr.cmd[R300_RR_INST_0] |= R500_RS_INST_COL_ID(0) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(0) | R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
+		r300->hw.rr.cmd[R300_RR_INST_0] = R500_RS_INST_COL_ID(0) | R500_RS_INST_COL_ADDR(0);
+		r300->hw.ri.cmd[R300_RI_INTERP_0] = R500_RS_COL_PTR(0) | R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
 		++col_ip;
 	}
 
 	high_rr = (col_ip > tex_ip) ? col_ip : tex_ip;
-	r300->hw.rc.cmd[1] |= (rs_tex_count << R300_IT_COUNT_SHIFT)  | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
-	r300->hw.rc.cmd[2] |= 0xC0 | (high_rr - 1);
+	r300->hw.rc.cmd[1] = (rs_tex_count << R300_IT_COUNT_SHIFT) | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
+	r300->hw.rc.cmd[2] = 0xC0 | (high_rr - 1);
 
-	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R500_RS_INST_0, high_rr);
+	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_INST_0, high_rr);
+	r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_IP_0, high_rr);
 
 	if (InputsRead)
 		WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
 }
 
-
-
-
-#define bump_vpu_count(ptr, new_count)   do{\
-	drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr));\
-	int _nc=(new_count)/4; \
-	assert(_nc < 256); \
-	if(_nc>_p->vpu.count)_p->vpu.count=_nc;\
-	}while(0)
-
-static INLINE void r300SetupVertexProgramFragment(r300ContextPtr r300, int dest, struct r300_vertex_shader_fragment *vsf)
-{
-	int i;
-
-	if (vsf->length == 0)
-		return;
-
-	if (vsf->length & 0x3) {
-		fprintf(stderr, "VERTEX_SHADER_FRAGMENT must have length divisible by 4\n");
-		_mesa_exit(-1);
-	}
-
-	switch ((dest >> 8) & 0xf) {
-	case 0:
-		R300_STATECHANGE(r300, vpi);
-		for (i = 0; i < vsf->length; i++)
-			r300->hw.vpi.cmd[R300_VPI_INSTR_0 + i + 4 * (dest & 0xff)] = (vsf->body.d[i]);
-		bump_vpu_count(r300->hw.vpi.cmd, vsf->length + 4 * (dest & 0xff));
-		break;
-
-	case 2:
-		R300_STATECHANGE(r300, vpp);
-		for (i = 0; i < vsf->length; i++)
-			r300->hw.vpp.cmd[R300_VPP_PARAM_0 + i + 4 * (dest & 0xff)] = (vsf->body.d[i]);
-		bump_vpu_count(r300->hw.vpp.cmd, vsf->length + 4 * (dest & 0xff));
-		break;
-	case 4:
-		R300_STATECHANGE(r300, vps);
-		for (i = 0; i < vsf->length; i++)
-			r300->hw.vps.cmd[1 + i + 4 * (dest & 0xff)] = (vsf->body.d[i]);
-		bump_vpu_count(r300->hw.vps.cmd, vsf->length + 4 * (dest & 0xff));
-		break;
-	default:
-		fprintf(stderr, "%s:%s don't know how to handle dest %04x\n", __FILE__, __FUNCTION__, dest);
-		_mesa_exit(-1);
-	}
-}
-
 #define MIN3(a, b, c)	((a) < (b) ? MIN2(a, c) : MIN2(b, c))
 
-
-static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
+void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
 			GLuint output_count, GLuint temp_count)
 {
     int vtx_mem_size;
@@ -1870,7 +1716,7 @@ static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
     pvs_num_cntrls = MIN2(6, vtx_mem_size/temp_count);
 
     R300_STATECHANGE(rmesa, vap_cntl);
-    if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+    if (rmesa->options.hw_tcl_enabled) {
 	rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] =
 	    (pvs_num_slots << R300_PVS_NUM_SLOTS_SHIFT) |
 	    (pvs_num_cntrls << R300_PVS_NUM_CNTLRS_SHIFT) |
@@ -1900,114 +1746,6 @@ static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
 
 }
 
-static void r300SetupDefaultVertexProgram(r300ContextPtr rmesa)
-{
-	struct r300_vertex_shader_state *prog = &(rmesa->state.vertex_shader);
-	GLuint o_reg = 0;
-	GLuint i_reg = 0;
-	int i;
-	int inst_count = 0;
-	int param_count = 0;
-	int program_end = 0;
-
-	for (i = VERT_ATTRIB_POS; i < VERT_ATTRIB_MAX; i++) {
-		if (rmesa->state.sw_tcl_inputs[i] != -1) {
-			prog->program.body.i[program_end + 0] = PVS_OP_DST_OPERAND(VE_MULTIPLY, GL_FALSE, GL_FALSE, o_reg++, VSF_FLAG_ALL, PVS_DST_REG_OUT);
-			prog->program.body.i[program_end + 1] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
-			prog->program.body.i[program_end + 2] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
-			prog->program.body.i[program_end + 3] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
-			program_end += 4;
-			i_reg++;
-		}
-	}
-
-	prog->program.length = program_end;
-
-	r300SetupVertexProgramFragment(rmesa, R300_PVS_CODE_START,
-				       &(prog->program));
-	inst_count = (prog->program.length / 4) - 1;
-
-	r300VapCntl(rmesa, i_reg, o_reg, 0);
-
-	R300_STATECHANGE(rmesa, pvs);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] =
-	    (0 << R300_PVS_FIRST_INST_SHIFT) |
-	    (inst_count << R300_PVS_XYZW_VALID_INST_SHIFT) |
-	    (inst_count << R300_PVS_LAST_INST_SHIFT);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] =
-	    (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
-	    (param_count << R300_PVS_MAX_CONST_ADDR_SHIFT);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] =
-	    (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
-}
-
-static int bit_count (int x)
-{
-    x = ((x & 0xaaaaaaaaU) >> 1) + (x & 0x55555555U);
-    x = ((x & 0xccccccccU) >> 2) + (x & 0x33333333U);
-    x = (x >> 16) + (x & 0xffff);
-    x = ((x & 0xf0f0) >> 4) + (x & 0x0f0f);
-    return (x >> 8) + (x & 0x00ff);
-}
-
-static void r300SetupRealVertexProgram(r300ContextPtr rmesa)
-{
-	GLcontext *ctx = rmesa->radeon.glCtx;
-	struct r300_vertex_program *prog = (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
-	int inst_count = 0;
-	int param_count = 0;
-
-	/* FIXME: r300SetupVertexProgramFragment */
-	R300_STATECHANGE(rmesa, vpp);
-	param_count =
-	    r300VertexProgUpdateParams(ctx,
-				       (struct r300_vertex_program_cont *)
-				       ctx->VertexProgram._Current,
-				       (float *)&rmesa->hw.vpp.
-				       cmd[R300_VPP_PARAM_0]);
-	bump_vpu_count(rmesa->hw.vpp.cmd, param_count);
-	param_count /= 4;
-
-	r300SetupVertexProgramFragment(rmesa, R300_PVS_CODE_START, &(prog->program));
-	inst_count = (prog->program.length / 4) - 1;
-
-	r300VapCntl(rmesa, bit_count(prog->key.InputsRead),
-		    bit_count(prog->key.OutputsWritten), prog->num_temporaries);
-
-	R300_STATECHANGE(rmesa, pvs);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] =
-	  (0 << R300_PVS_FIRST_INST_SHIFT) |
-	  (inst_count << R300_PVS_XYZW_VALID_INST_SHIFT) |
-	  (inst_count << R300_PVS_LAST_INST_SHIFT);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] =
-	  (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
-	  (param_count << R300_PVS_MAX_CONST_ADDR_SHIFT);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] =
-	  (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
-}
-
-static void r300SetupVertexProgram(r300ContextPtr rmesa)
-{
-	GLcontext *ctx = rmesa->radeon.glCtx;
-
-	/* Reset state, in case we don't use something */
-	((drm_r300_cmd_header_t *) rmesa->hw.vpp.cmd)->vpu.count = 0;
-	((drm_r300_cmd_header_t *) rmesa->hw.vpi.cmd)->vpu.count = 0;
-	((drm_r300_cmd_header_t *) rmesa->hw.vps.cmd)->vpu.count = 0;
-
-	/* Not sure why this doesnt work...
-	   0x400 area might have something to do with pixel shaders as it appears right after pfs programming.
-	   0x406 is set to { 0.0, 0.0, 1.0, 0.0 } most of the time but should change with smooth points and in other rare cases. */
-	//setup_vertex_shader_fragment(rmesa, 0x406, &unk4);
-	if (hw_tcl_on && ((struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx))->translated) {
-		r300SetupRealVertexProgram(rmesa);
-	} else {
-		/* FIXME: This needs to be replaced by vertex shader generation code. */
-		r300SetupDefaultVertexProgram(rmesa);
-	}
-
-}
-
 /**
  * Enable/Disable states.
  *
@@ -2015,20 +1753,13 @@ static void r300SetupVertexProgram(r300ContextPtr rmesa)
  */
 static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
 {
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	if (RADEON_DEBUG & DEBUG_STATE)
 		fprintf(stderr, "%s( %s = %s )\n", __FUNCTION__,
 			_mesa_lookup_enum_by_nr(cap),
 			state ? "GL_TRUE" : "GL_FALSE");
 
 	switch (cap) {
-	case GL_TEXTURE_1D:
-	case GL_TEXTURE_2D:
-	case GL_TEXTURE_3D:
-		/* empty */
-		break;
-	case GL_FOG:
-		/* empty */
-		break;
 	case GL_ALPHA_TEST:
 		r300SetAlphaState(ctx);
 		break;
@@ -2046,22 +1777,46 @@ static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
 	case GL_CLIP_PLANE5:
 		r300SetClipPlaneState(ctx, cap, state);
 		break;
+	case GL_CULL_FACE:
+		r300UpdateCulling(ctx);
+		break;
 	case GL_DEPTH_TEST:
 		r300SetDepthState(ctx);
 		break;
-	case GL_STENCIL_TEST:
-		r300SetStencilState(ctx, state);
+	case GL_LINE_SMOOTH:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_LINE_SMOOTH, ctx->Line.SmoothFlag);
 		break;
-	case GL_CULL_FACE:
-		r300UpdateCulling(ctx);
+	case GL_LINE_STIPPLE:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_LINE_STIPPLE, ctx->Line.StippleFlag);
+		break;
+	case GL_POINT_SMOOTH:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_POINT_SMOOTH, ctx->Point.SmoothFlag);
+		break;
+	case GL_POLYGON_SMOOTH:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_POLYGON_SMOOTH, ctx->Polygon.SmoothFlag);
+		break;
+	case GL_POLYGON_STIPPLE:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_POLYGON_STIPPLE, ctx->Polygon.StippleFlag);
 		break;
 	case GL_POLYGON_OFFSET_POINT:
 	case GL_POLYGON_OFFSET_LINE:
 	case GL_POLYGON_OFFSET_FILL:
 		r300SetPolygonOffsetState(ctx, state);
 		break;
+	case GL_SCISSOR_TEST:
+		radeon_firevertices(&rmesa->radeon);
+		rmesa->radeon.state.scissor.enabled = state;
+		radeonUpdateScissor( ctx );
+		break;
+	case GL_STENCIL_TEST:
+		r300SetStencilState(ctx, state);
+		break;
 	default:
-		radeonEnable(ctx, cap, state);
 		break;
 	}
 }
@@ -2072,15 +1827,14 @@ static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
 static void r300ResetHwState(r300ContextPtr r300)
 {
 	GLcontext *ctx = r300->radeon.glCtx;
-	int has_tcl = 1;
+	int has_tcl;
 
-	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
-		has_tcl = 0;
+	has_tcl = r300->options.hw_tcl_enabled;
 
 	if (RADEON_DEBUG & DEBUG_STATE)
 		fprintf(stderr, "%s\n", __FUNCTION__);
 
-	r300UpdateWindow(ctx);
+	radeon_firevertices(&r300->radeon);
 
 	r300ColorMask(ctx,
 		      ctx->Color.ColorMask[RCOMP],
@@ -2102,8 +1856,6 @@ static void r300ResetHwState(r300ContextPtr r300)
 
 	r300UpdateCulling(ctx);
 
-	r300UpdateTextureState(ctx);
-
 	r300SetBlendState(ctx);
 	r300SetLogicOpState(ctx);
 
@@ -2182,8 +1934,8 @@ static void r300ResetHwState(r300ContextPtr r300)
 	}
 
 	/* XXX: Enable anti-aliasing? */
-	r300->hw.gb_misc.cmd[R300_GB_MISC_AA_CONFIG] = GB_AA_CONFIG_AA_DISABLE;
-	r300->hw.gb_misc.cmd[R300_GB_MISC_SELECT] = 0;
+	r300->hw.gb_misc2.cmd[R300_GB_MISC2_AA_CONFIG] = GB_AA_CONFIG_AA_DISABLE;
+	r300->hw.gb_misc2.cmd[R300_GB_MISC2_SELECT] = 0;
 
 	r300->hw.ga_point_s0.cmd[1] = r300PackFloat32(0.0);
 	r300->hw.ga_point_s0.cmd[2] = r300PackFloat32(0.0);
@@ -2242,20 +1994,6 @@ static void r300ResetHwState(r300ContextPtr r300)
 
 	r300BlendColor(ctx, ctx->Color.BlendColor);
 
-	/* Again, r300ClearBuffer uses this */
-	r300->hw.cb.cmd[R300_CB_OFFSET] =
-	    r300->radeon.state.color.drawOffset +
-	    r300->radeon.radeonScreen->fbLocation;
-	r300->hw.cb.cmd[R300_CB_PITCH] = r300->radeon.state.color.drawPitch;
-
-	if (r300->radeon.radeonScreen->cpp == 4)
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
-	else
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
-
-	if (r300->radeon.sarea->tiling_enabled)
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
-
 	r300->hw.rb3d_dither_ctl.cmd[1] = 0;
 	r300->hw.rb3d_dither_ctl.cmd[2] = 0;
 	r300->hw.rb3d_dither_ctl.cmd[3] = 0;
@@ -2268,44 +2006,18 @@ static void r300ResetHwState(r300ContextPtr r300)
 
 	r300->hw.rb3d_aaresolve_ctl.cmd[1] = 0;
 
-	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[1] = 0x00000000;
-	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[2] = 0xffffffff;
-
-	r300->hw.zb.cmd[R300_ZB_OFFSET] =
-	    r300->radeon.radeonScreen->depthOffset +
-	    r300->radeon.radeonScreen->fbLocation;
-	r300->hw.zb.cmd[R300_ZB_PITCH] = r300->radeon.radeonScreen->depthPitch;
-
-	if (r300->radeon.sarea->tiling_enabled) {
-		/* XXX: Turn off when clearing buffers ? */
-		r300->hw.zb.cmd[R300_ZB_PITCH] |= R300_DEPTHMACROTILE_ENABLE;
-
-		if (ctx->Visual.depthBits == 24)
-			r300->hw.zb.cmd[R300_ZB_PITCH] |=
-			    R300_DEPTHMICROTILE_TILED;
-	}
+    r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[1] = 0x00000000;
+    r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[2] = 0xffffffff;
 
 	r300->hw.zb_depthclearvalue.cmd[1] = 0;
 
-	switch (ctx->Visual.depthBits) {
-	case 16:
-		r300->hw.zstencil_format.cmd[1] = R300_DEPTHFORMAT_16BIT_INT_Z;
-		break;
-	case 24:
-		r300->hw.zstencil_format.cmd[1] = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
-		break;
-	default:
-		fprintf(stderr, "Error: Unsupported depth %d... exiting\n", ctx->Visual.depthBits);
-		_mesa_exit(-1);
-	}
-
 	r300->hw.zstencil_format.cmd[2] = R300_ZTOP_DISABLE;
 	r300->hw.zstencil_format.cmd[3] = 0x00000003;
 	r300->hw.zstencil_format.cmd[4] = 0x00000000;
 	r300SetEarlyZState(ctx);
 
-	r300->hw.unk4F30.cmd[1] = 0;
-	r300->hw.unk4F30.cmd[2] = 0;
+	r300->hw.zb_zmask.cmd[1] = 0;
+	r300->hw.zb_zmask.cmd[2] = 0;
 
 	r300->hw.zb_hiz_offset.cmd[1] = 0;
 
@@ -2319,20 +2031,26 @@ static void r300ResetHwState(r300ContextPtr r300)
 		r300->hw.vps.cmd[R300_VPS_ZERO_3] = 0;
 	}
 
-	r300->hw.all_dirty = GL_TRUE;
+	r300->radeon.hw.all_dirty = GL_TRUE;
 }
 
 void r300UpdateShaders(r300ContextPtr rmesa)
 {
 	GLcontext *ctx;
-	struct r300_vertex_program *vp;
+	struct r300_fragment_program *fp;
 	int i;
 
 	ctx = rmesa->radeon.glCtx;
+	fp = (struct r300_fragment_program *) ctx->FragmentProgram._Current;
 
-	if (rmesa->NewGLState && hw_tcl_on) {
-		rmesa->NewGLState = 0;
+	/* should only happen once, just after context is created */
+	/* TODO: shouldn't we fall back to sw here? */
+	if (!fp) {
+		_mesa_fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
+		return;
+	}
 
+	if (rmesa->radeon.NewGLState && rmesa->options.hw_tcl_enabled) {
 		for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
 			rmesa->temp_attrib[i] =
 			    TNL_CONTEXT(ctx)->vb.AttribPtr[i];
@@ -2348,20 +2066,16 @@ void r300UpdateShaders(r300ContextPtr rmesa)
 		}
 
 		r300SelectVertexShader(rmesa);
-		vp = (struct r300_vertex_program *)
-		    CURRENT_VERTEX_SHADER(ctx);
-		/*if (vp->translated == GL_FALSE)
-		   r300TranslateVertexShader(vp); */
-		if (vp->translated == GL_FALSE) {
-			fprintf(stderr, "Failing back to sw-tcl\n");
-			hw_tcl_on = future_hw_tcl_on = 0;
-			r300ResetHwState(rmesa);
-
-			r300UpdateStateParameters(ctx, _NEW_PROGRAM);
-			return;
-		}
+		r300SwitchFallback(ctx, R300_FALLBACK_VERTEX_PROGRAM, rmesa->selected_vp->error);
 	}
-	r300UpdateStateParameters(ctx, _NEW_PROGRAM);
+
+	if (!fp->translated || rmesa->radeon.NewGLState)
+		r300TranslateFragmentShader(ctx, ctx->FragmentProgram._Current);
+
+	r300SwitchFallback(ctx, R300_FALLBACK_FRAGMENT_PROGRAM, fp->error);
+
+	r300UpdateStateParameters(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+	rmesa->radeon.NewGLState = 0;
 }
 
 static const GLfloat *get_fragmentprogram_constant(GLcontext *ctx,
@@ -2385,35 +2099,23 @@ static const GLfloat *get_fragmentprogram_constant(GLcontext *ctx,
 }
 
 
-static void r300SetupPixelShader(r300ContextPtr rmesa)
+static void r300SetupPixelShader(GLcontext *ctx)
 {
-	GLcontext *ctx = rmesa->radeon.glCtx;
-	struct r300_fragment_program *fp = (struct r300_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct r300_fragment_program *fp = (struct r300_fragment_program *) ctx->FragmentProgram._Current;
 	struct r300_fragment_program_code *code;
 	int i, k;
 
-	if (!fp)		/* should only happenen once, just after context is created */
-		return;
-
-	r300TranslateFragmentShader(rmesa, fp);
-	if (!fp->translated) {
-		fprintf(stderr, "%s: No valid fragment shader, exiting\n",
-			__FUNCTION__);
-		return;
-	}
-	code = &fp->code;
-
-	r300SetupTextures(ctx);
+	code = &fp->code.r300;
 
 	R300_STATECHANGE(rmesa, fpi[0]);
 	R300_STATECHANGE(rmesa, fpi[1]);
 	R300_STATECHANGE(rmesa, fpi[2]);
 	R300_STATECHANGE(rmesa, fpi[3]);
-	rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, code->alu.length);
-	rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, code->alu.length);
-	rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, code->alu.length);
-	rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, code->alu.length);
+	rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_RGB_INST_0, code->alu.length);
+	rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_RGB_ADDR_0, code->alu.length);
+	rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_ALPHA_INST_0, code->alu.length);
+	rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_ALPHA_ADDR_0, code->alu.length);
 	for (i = 0; i < code->alu.length; i++) {
 		rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst0;
 		rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst1;
@@ -2444,10 +2146,10 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
 	}
 
 	R300_STATECHANGE(rmesa, fpp);
-	rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, code->const_nr * 4);
+	rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_PFS_PARAM_0_X, code->const_nr * 4);
 	for (i = 0; i < code->const_nr; i++) {
 		const GLfloat *constant = get_fragmentprogram_constant(ctx,
-			&fp->mesa_program.Base, code->constant[i]);
+			&fp->Base.Base, code->constant[i]);
 		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(constant[0]);
 		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(constant[1]);
 		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(constant[2]);
@@ -2469,29 +2171,17 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
 	if(_nc>_p->r500fp.count)_p->r500fp.count=_nc;\
 } while(0)
 
-static void r500SetupPixelShader(r300ContextPtr rmesa)
+static void r500SetupPixelShader(GLcontext *ctx)
 {
-	GLcontext *ctx = rmesa->radeon.glCtx;
-	struct r500_fragment_program *fp = (struct r500_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct r300_fragment_program *fp = (struct r300_fragment_program *) ctx->FragmentProgram._Current;
 	int i;
 	struct r500_fragment_program_code *code;
 
-	if (!fp)		/* should only happenen once, just after context is created */
-		return;
-
 	((drm_r300_cmd_header_t *) rmesa->hw.r500fp.cmd)->r500fp.count = 0;
 	((drm_r300_cmd_header_t *) rmesa->hw.r500fp_const.cmd)->r500fp.count = 0;
 
-	r500TranslateFragmentShader(rmesa, fp);
-	if (!fp->translated) {
-		fprintf(stderr, "%s: No valid fragment shader, exiting\n",
-			__FUNCTION__);
-		return;
-	}
-	code = &fp->code;
-
-	r300SetupTextures(ctx);
+	code = &fp->code.r500;
 
 	R300_STATECHANGE(rmesa, fp);
 	rmesa->hw.fp.cmd[R500_FP_PIXSIZE] = code->max_temp_idx;
@@ -2521,58 +2211,96 @@ static void r500SetupPixelShader(r300ContextPtr rmesa)
 	R300_STATECHANGE(rmesa, r500fp_const);
 	for (i = 0; i < code->const_nr; i++) {
 		const GLfloat *constant = get_fragmentprogram_constant(ctx,
-			&fp->mesa_program.Base, code->constant[i]);
+			&fp->Base.Base, code->constant[i]);
 		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(constant[0]);
 		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(constant[1]);
 		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(constant[2]);
 		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(constant[3]);
 	}
 	bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, code->const_nr * 4);
+}
+
+void r300SetupVAP(GLcontext *ctx, GLuint InputsRead, GLuint OutputsWritten)
+{
+	r300ContextPtr rmesa = R300_CONTEXT( ctx );
+	struct vertex_attribute *attrs = rmesa->vbuf.attribs;
+	int i, j, reg_count;
+	uint32_t *vir0 = &rmesa->hw.vir[0].cmd[1];
+	uint32_t *vir1 = &rmesa->hw.vir[1].cmd[1];
+
+	for (i = 0; i < R300_VIR_CMDSIZE-1; ++i)
+		vir0[i] = vir1[i] = 0;
+
+	for (i = 0, j = 0; i < rmesa->vbuf.num_attribs; ++i) {
+		int tmp;
+
+		tmp = attrs[i].data_type | (attrs[i].dst_loc << R300_DST_VEC_LOC_SHIFT);
+		if (attrs[i]._signed)
+			tmp |= R300_SIGNED;
+		if (attrs[i].normalize)
+			tmp |= R300_NORMALIZE;
+
+		if (i % 2 == 0) {
+			vir0[j] = tmp << R300_DATA_TYPE_0_SHIFT;
+			vir1[j] = attrs[i].swizzle | (attrs[i].write_mask << R300_WRITE_ENA_SHIFT);
+		} else {
+			vir0[j] |= tmp << R300_DATA_TYPE_1_SHIFT;
+			vir1[j] |= (attrs[i].swizzle | (attrs[i].write_mask << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE1_SHIFT;
+			++j;
+		}
+	}
+
+	reg_count = (rmesa->vbuf.num_attribs + 1) >> 1;
+	if (rmesa->vbuf.num_attribs % 2 != 0) {
+		vir0[reg_count-1] |= R300_LAST_VEC << R300_DATA_TYPE_0_SHIFT;
+	} else {
+		vir0[reg_count-1] |= R300_LAST_VEC << R300_DATA_TYPE_1_SHIFT;
+	}
+
+	R300_STATECHANGE(rmesa, vir[0]);
+	R300_STATECHANGE(rmesa, vir[1]);
+	R300_STATECHANGE(rmesa, vof);
+	R300_STATECHANGE(rmesa, vic);
+
+	if (rmesa->radeon.radeonScreen->kernel_mm) {
+		rmesa->hw.vir[0].cmd[0] &= 0xC000FFFF;
+		rmesa->hw.vir[1].cmd[0] &= 0xC000FFFF;
+		rmesa->hw.vir[0].cmd[0] |= (reg_count & 0x3FFF) << 16;
+		rmesa->hw.vir[1].cmd[0] |= (reg_count & 0x3FFF) << 16;
+	} else {
+		((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count = reg_count;
+		((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count = reg_count;
+	}
 
+	rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
+	rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
+	rmesa->hw.vof.cmd[R300_VOF_CNTL_0] = r300VAPOutputCntl0(ctx, OutputsWritten, ctx->FragmentProgram._Current->Base.InputsRead);
+	rmesa->hw.vof.cmd[R300_VOF_CNTL_1] = r300VAPOutputCntl1(ctx, OutputsWritten, ctx->FragmentProgram._Current->Base.InputsRead);
 }
 
 void r300UpdateShaderStates(r300ContextPtr rmesa)
 {
 	GLcontext *ctx;
 	ctx = rmesa->radeon.glCtx;
+	struct r300_fragment_program *r300_fp;
 
-	r300UpdateTextureState(ctx);
-	r300SetEarlyZState(ctx);
+	r300_fp = (struct r300_fragment_program *) ctx->FragmentProgram._Current;
 
-	/* w_fmt value is set to get best performance
-	 * see p.130 R5xx 3D acceleration guide v1.3 */
-	GLuint w_fmt, fgdepthsrc;
-	if (current_fragment_program_writes_depth(ctx)) {
-		fgdepthsrc = R300_FG_DEPTH_SRC_SHADER;
-		w_fmt = R300_W_FMT_W24 | R300_W_SRC_US;
-	} else {
-		fgdepthsrc = R300_FG_DEPTH_SRC_SCAN;
-		w_fmt = R300_W_FMT_W0 | R300_W_SRC_US;
-	}
+	/* should only happen once, just after context is created */
+	if (!r300_fp)
+		return;
 
-	if (w_fmt != rmesa->hw.us_out_fmt.cmd[5]) {
-		R300_STATECHANGE(rmesa, us_out_fmt);
-		rmesa->hw.us_out_fmt.cmd[5] = w_fmt;
-	}
+	r300SetEarlyZState(ctx);
 
-	if (fgdepthsrc != rmesa->hw.fg_depth_src.cmd[1]) {
-		R300_STATECHANGE(rmesa, fg_depth_src);
-		rmesa->hw.fg_depth_src.cmd[1] = fgdepthsrc;
-	}
+	r300SetupTextures(ctx);
 
-	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-		r500SetupPixelShader(rmesa);
-	else
-		r300SetupPixelShader(rmesa);
+	rmesa->vtbl.SetupPixelShader(ctx);
 
-	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-		r500SetupRSUnit(ctx);
-	else
-		r300SetupRSUnit(ctx);
+	rmesa->vtbl.SetupRSUnit(ctx);
 
-	if ((rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+	if (rmesa->options.hw_tcl_enabled) {
 		r300SetupVertexProgram(rmesa);
-
+	}
 }
 
 /**
@@ -2586,15 +2314,18 @@ static void r300InvalidateState(GLcontext * ctx, GLuint new_state)
 	_swsetup_InvalidateState(ctx, new_state);
 	_vbo_InvalidateState(ctx, new_state);
 	_tnl_InvalidateState(ctx, new_state);
-	_ae_invalidate_state(ctx, new_state);
 
-	if (new_state & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
-		r300UpdateDrawBuffer(ctx);
+	if (new_state & _NEW_BUFFERS) {
+		_mesa_update_framebuffer(ctx);
+		/* this updates the DrawBuffer's Width/Height if it's an FBO */
+		_mesa_update_draw_buffer_bounds(ctx);
+
+		R300_STATECHANGE(r300, cb);
 	}
 
 	r300UpdateStateParameters(ctx, new_state);
 
-	r300->NewGLState |= new_state;
+	r300->radeon.NewGLState |= new_state;
 }
 
 /**
@@ -2604,58 +2335,12 @@ static void r300InvalidateState(GLcontext * ctx, GLuint new_state)
  */
 void r300InitState(r300ContextPtr r300)
 {
-	GLcontext *ctx = r300->radeon.glCtx;
-	GLuint depth_fmt;
-
-	radeonInitState(&r300->radeon);
-
-	switch (ctx->Visual.depthBits) {
-	case 16:
-		r300->state.depth.scale = 1.0 / (GLfloat) 0xffff;
-		depth_fmt = R300_DEPTHFORMAT_16BIT_INT_Z;
-		break;
-	case 24:
-		r300->state.depth.scale = 1.0 / (GLfloat) 0xffffff;
-		depth_fmt = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
-		break;
-	default:
-		fprintf(stderr, "Error: Unsupported depth %d... exiting\n",
-			ctx->Visual.depthBits);
-		_mesa_exit(-1);
-	}
-
-	/* Only have hw stencil when depth buffer is 24 bits deep */
-	r300->state.stencil.hw_stencil = (ctx->Visual.stencilBits > 0 &&
-					  ctx->Visual.depthBits == 24);
-
-	memset(&(r300->state.texture), 0, sizeof(r300->state.texture));
-
 	r300ResetHwState(r300);
 }
 
 static void r300RenderMode(GLcontext * ctx, GLenum mode)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	(void)rmesa;
-	(void)mode;
-}
-
-void r300UpdateClipPlanes( GLcontext *ctx )
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	GLuint p;
-
-	for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
-		if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
-			GLint *ip = (GLint *)ctx->Transform._ClipUserPlane[p];
-
-			R300_STATECHANGE( rmesa, vpucp[p] );
-			rmesa->hw.vpucp[p].cmd[R300_VPUCP_X] = ip[0];
-			rmesa->hw.vpucp[p].cmd[R300_VPUCP_Y] = ip[1];
-			rmesa->hw.vpucp[p].cmd[R300_VPUCP_Z] = ip[2];
-			rmesa->hw.vpucp[p].cmd[R300_VPUCP_W] = ip[3];
-		}
-	}
+	r300SwitchFallback(ctx, R300_FALLBACK_RENDER_MODE, ctx->RenderMode != GL_RENDER);
 }
 
 /**
@@ -2663,7 +2348,6 @@ void r300UpdateClipPlanes( GLcontext *ctx )
  */
 void r300InitStateFuncs(struct dd_function_table *functions)
 {
-	radeonInitStateFuncs(functions);
 
 	functions->UpdateState = r300InvalidateState;
 	functions->AlphaFunc = r300AlphaFunc;
@@ -2699,4 +2383,25 @@ void r300InitStateFuncs(struct dd_function_table *functions)
 	functions->RenderMode = r300RenderMode;
 
 	functions->ClipPlane = r300ClipPlane;
+	functions->Scissor = radeonScissor;
+
+	functions->DrawBuffer		= radeonDrawBuffer;
+	functions->ReadBuffer		= radeonReadBuffer;
+}
+
+void r300InitShaderFunctions(r300ContextPtr r300)
+{
+	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+		r300->vtbl.SetupRSUnit = r500SetupRSUnit;
+		r300->vtbl.SetupPixelShader = r500SetupPixelShader;
+		r300->vtbl.SetupFragmentShaderTextures = r500SetupFragmentShaderTextures;
+		r300->vtbl.BuildFragmentProgramHwCode = r500BuildFragmentProgramHwCode;
+		r300->vtbl.FragmentProgramDump = r500FragmentProgramDump;
+	} else {
+		r300->vtbl.SetupRSUnit = r300SetupRSUnit;
+		r300->vtbl.SetupPixelShader = r300SetupPixelShader;
+		r300->vtbl.SetupFragmentShaderTextures = r300SetupFragmentShaderTextures;
+		r300->vtbl.BuildFragmentProgramHwCode = r300BuildFragmentProgramHwCode;
+		r300->vtbl.FragmentProgramDump = r300FragmentProgramDump;
+	}
 }
diff --git a/src/mesa/drivers/dri/r300/r300_state.h b/src/mesa/drivers/dri/r300/r300_state.h
index 0589ab7cad..2328289420 100644
--- a/src/mesa/drivers/dri/r300/r300_state.h
+++ b/src/mesa/drivers/dri/r300/r300_state.h
@@ -39,42 +39,25 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define R300_NEWPRIM( rmesa )			\
   do {						\
-    if ( rmesa->dma.flush )			\
-      rmesa->dma.flush( rmesa );		\
+  if ( rmesa->radeon.dma.flush )			\
+    rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	\
   } while (0)
 
 #define R300_STATECHANGE(r300, atom) \
 	do {						\
 	  R300_NEWPRIM(r300);				\
 		r300->hw.atom.dirty = GL_TRUE;		\
-		r300->hw.is_dirty = GL_TRUE;		\
+		r300->radeon.hw.is_dirty = GL_TRUE;		\
 	} while(0)
 
-#define R300_PRINT_STATE(r300, atom) \
-		r300PrintStateAtom(r300, &r300->hw.atom)
-
-/* Fire the buffered vertices no matter what.
-   TODO: This has not been implemented yet
- */
-#define R300_FIREVERTICES( r300 )			\
-do {							\
-    \
-   if ( (r300)->cmdbuf.count_used || (r300)->dma.flush ) {	\
-      r300Flush( (r300)->radeon.glCtx );		\
-   }							\
-    \
-} while (0)
-
-// r300_state.c
-extern int future_hw_tcl_on;
-void _tnl_UpdateFixedFunctionProgram (GLcontext * ctx);
 void r300UpdateViewportOffset (GLcontext * ctx);
 void r300UpdateDrawBuffer (GLcontext * ctx);
 void r300UpdateStateParameters (GLcontext * ctx, GLuint new_state);
 void r300UpdateShaders (r300ContextPtr rmesa);
 void r300UpdateShaderStates (r300ContextPtr rmesa);
 void r300InitState (r300ContextPtr r300);
-void r300UpdateClipPlanes (GLcontext * ctx);
 void r300InitStateFuncs (struct dd_function_table *functions);
+void r300VapCntl(r300ContextPtr rmesa, GLuint input_count, GLuint output_count, GLuint temp_count);
+void r300SetupVAP(GLcontext *ctx, GLuint InputsRead, GLuint OutputsWritten);
 
 #endif				/* __R300_STATE_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_swtcl.c b/src/mesa/drivers/dri/r300/r300_swtcl.c
index ba3621b16b..ce4179208e 100644
--- a/src/mesa/drivers/dri/r300/r300_swtcl.c
+++ b/src/mesa/drivers/dri/r300/r300_swtcl.c
@@ -28,362 +28,237 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /*
  * Authors:
  *   Dave Airlie <airlied@linux.ie>
+ *   Maciej Cencora <m.cencora@gmail.com>
  */
 
-/* derived from r200 swtcl path */
-
-
-
-#include "main/glheader.h"
-#include "main/mtypes.h"
-#include "main/colormac.h"
-#include "main/enums.h"
-#include "main/image.h"
-#include "main/imports.h"
-#include "main/light.h"
-#include "main/macros.h"
-
-#include "swrast/s_context.h"
-#include "swrast/s_fog.h"
-#include "swrast_setup/swrast_setup.h"
-#include "math/m_translate.h"
 #include "tnl/tnl.h"
-#include "tnl/t_context.h"
 #include "tnl/t_pipeline.h"
 
-#include "r300_context.h"
-#include "r300_swtcl.h"
 #include "r300_state.h"
-#include "r300_ioctl.h"
+#include "r300_swtcl.h"
 #include "r300_emit.h"
-#include "r300_mem.h"
-
-static void flush_last_swtcl_prim( r300ContextPtr rmesa  );
+#include "r300_tex.h"
+#include "r300_render.h"
 
-
-void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, GLuint offset);
-void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vertex_nr);
 #define EMIT_ATTR( ATTR, STYLE )					\
 do {									\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = (ATTR);	\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = (STYLE);	\
-   rmesa->swtcl.vertex_attr_count++;					\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = (ATTR);	\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = (STYLE);	\
+   rmesa->radeon.swtcl.vertex_attr_count++;					\
 } while (0)
 
 #define EMIT_PAD( N )							\
 do {									\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = 0;		\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = EMIT_PAD;	\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].offset = (N);		\
-   rmesa->swtcl.vertex_attr_count++;					\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = 0;		\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = EMIT_PAD;	\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].offset = (N);		\
+   rmesa->radeon.swtcl.vertex_attr_count++;					\
+} while (0)
+
+#define ADD_ATTR(_attr, _format, _dst_loc, _swizzle, _write_mask, _normalize) \
+do { \
+	attrs[num_attrs].element = (_attr); \
+	attrs[num_attrs].data_type = (_format); \
+	attrs[num_attrs].dst_loc = (_dst_loc); \
+	attrs[num_attrs].swizzle = (_swizzle); \
+	attrs[num_attrs].write_mask = (_write_mask); \
+	attrs[num_attrs]._signed = 0; \
+	attrs[num_attrs].normalize = (_normalize); \
+	++num_attrs; \
 } while (0)
 
-static void r300SetVertexFormat( GLcontext *ctx )
+void r300ChooseSwtclVertexFormat(GLcontext *ctx, GLuint *_InputsRead,  GLuint *_OutputsWritten)
 {
 	r300ContextPtr rmesa = R300_CONTEXT( ctx );
 	TNLcontext *tnl = TNL_CONTEXT(ctx);
 	struct vertex_buffer *VB = &tnl->vb;
-	DECLARE_RENDERINPUTS(index_bitset);
-	GLuint InputsRead = 0, OutputsWritten = 0;
-	int vap_fmt_1 = 0;
-	int offset = 0;
-	int vte = 0;
-	int fog_id;
-	GLint inputs[VERT_ATTRIB_MAX];
-	GLint tab[VERT_ATTRIB_MAX];
-	int swizzle[VERT_ATTRIB_MAX][4];
-	GLuint i, nr;
-	GLuint sz;
-
-	DECLARE_RENDERINPUTS(render_inputs_bitset);
-	RENDERINPUTS_COPY(render_inputs_bitset, tnl->render_inputs_bitset);
-	RENDERINPUTS_COPY( index_bitset, tnl->render_inputs_bitset );
-	RENDERINPUTS_COPY(rmesa->state.render_inputs_bitset, render_inputs_bitset);
-
-	vte = rmesa->hw.vte.cmd[1];
-	vte &= ~(R300_VTX_XY_FMT | R300_VTX_Z_FMT | R300_VTX_W0_FMT);
-	/* Important:
-	 */
-	if ( VB->NdcPtr != NULL ) {
-		VB->AttribPtr[VERT_ATTRIB_POS] = VB->NdcPtr;
-		vte |= R300_VTX_XY_FMT | R300_VTX_Z_FMT;
-	}
-	else {
-		VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
-		vte |= R300_VTX_W0_FMT;
-	}
-
-	assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
-	rmesa->swtcl.vertex_attr_count = 0;
-
-	/* EMIT_ATTR's must be in order as they tell t_vertex.c how to
-	 * build up a hardware vertex.
-	 */
-	if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_POS)) {
-		sz = VB->AttribPtr[VERT_ATTRIB_POS]->size;
-		InputsRead |= 1 << VERT_ATTRIB_POS;
-		OutputsWritten |= 1 << VERT_RESULT_HPOS;
-		EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_1F + sz - 1 );
-		offset = sz;
-	} else {
-		offset = 4;
-		EMIT_PAD(4 * sizeof(float));
-	}
-/*
-	if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_POINTSIZE )) {
-		EMIT_ATTR( _TNL_ATTRIB_POINTSIZE, EMIT_1F );
-		offset += 1;
-	}
-*/
-	if (RENDERINPUTS_TEST(index_bitset, _TNL_ATTRIB_COLOR0)) {
-		sz = VB->AttribPtr[VERT_ATTRIB_COLOR0]->size;
-	        rmesa->swtcl.coloroffset = offset;
+	int first_free_tex = 0;
+	GLuint InputsRead = 0;
+	GLuint OutputsWritten = 0;
+	int num_attrs = 0;
+	GLuint fp_reads = ctx->FragmentProgram._Current->Base.InputsRead;
+	struct vertex_attribute *attrs = rmesa->vbuf.attribs;
+
+	rmesa->swtcl.coloroffset = rmesa->swtcl.specoffset = 0;
+	rmesa->radeon.swtcl.vertex_attr_count = 0;
+
+	/* Always use clip-space (non-NDC) position coordinates */
+	VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
+
+	/* Always write position vector */
+	InputsRead |= 1 << VERT_ATTRIB_POS;
+	OutputsWritten |= 1 << VERT_RESULT_HPOS;
+	EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F );
+	ADD_ATTR(VERT_ATTRIB_POS, R300_DATA_TYPE_FLOAT_4, SWTCL_OVM_POS, SWIZZLE_XYZW, MASK_XYZW, 0);
+	rmesa->swtcl.coloroffset = 4;
+
+	if (fp_reads & FRAG_BIT_COL0) {
 		InputsRead |= 1 << VERT_ATTRIB_COLOR0;
 		OutputsWritten |= 1 << VERT_RESULT_COL0;
-		EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_1F + sz - 1 );
-		offset += sz;
+#if MESA_LITTLE_ENDIAN
+		EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA );
+		ADD_ATTR(VERT_ATTRIB_COLOR0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR0, SWIZZLE_XYZW, MASK_XYZW, 1);
+#else
+		EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR );
+		ADD_ATTR(VERT_ATTRIB_COLOR0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR0, SWIZZLE_XYZW, MASK_XYZW, 1);
+#endif
 	}
 
-	rmesa->swtcl.specoffset = 0;
-	if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
-		sz = VB->AttribPtr[VERT_ATTRIB_COLOR1]->size;
-		rmesa->swtcl.specoffset = offset;
-		EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_1F + sz - 1 );
+	if (fp_reads & FRAG_BIT_COL1) {
+		GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
 		InputsRead |= 1 << VERT_ATTRIB_COLOR1;
 		OutputsWritten |= 1 << VERT_RESULT_COL1;
+#if MESA_LITTLE_ENDIAN
+		EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_4UB_4F_RGBA );
+		ADD_ATTR(VERT_ATTRIB_COLOR1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR1, swiz, MASK_XYZW, 1);
+#else
+		EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_4UB_4F_ABGR );
+		ADD_ATTR(VERT_ATTRIB_COLOR1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR1, swiz, MASK_XYZW, 1);
+#endif
+		rmesa->swtcl.specoffset = rmesa->swtcl.coloroffset + 1;
 	}
 
-	fog_id = -1;
-	if (RENDERINPUTS_TEST(index_bitset, _TNL_ATTRIB_FOG)) {
-		/* find first free tex coord slot */
-		if (RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
-			int i;
-			for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-				if (!RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(i) )) {
-					fog_id = i;
-					break;
-				}
-			}
-		} else {
-			fog_id = 0;
-		}
-
-		if (fog_id == -1) {
-			fprintf(stderr, "\tout of free texcoords to do fog\n");
-			_mesa_exit(-1);
+	if (ctx->Light.Enabled && ctx->Light.Model.TwoSide) {
+		VB->AttribPtr[VERT_ATTRIB_GENERIC0] = VB->ColorPtr[1];
+		OutputsWritten |= 1 << VERT_RESULT_BFC0;
+#if MESA_LITTLE_ENDIAN
+		EMIT_ATTR( _TNL_ATTRIB_GENERIC0, EMIT_4UB_4F_RGBA );
+		ADD_ATTR(VERT_ATTRIB_GENERIC0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR2, SWIZZLE_XYZW, MASK_XYZW, 1);
+#else
+		EMIT_ATTR( _TNL_ATTRIB_GENERIC0, EMIT_4UB_4F_ABGR );
+		ADD_ATTR(VERT_ATTRIB_GENERIC0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR2, SWIZZLE_XYZW, MASK_XYZW, 1);
+#endif
+		if (fp_reads & FRAG_BIT_COL1) {
+			VB->AttribPtr[VERT_ATTRIB_GENERIC1] = VB->SecondaryColorPtr[1];
+			GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
+			OutputsWritten |= 1 << VERT_RESULT_BFC1;
+#if MESA_LITTLE_ENDIAN
+			EMIT_ATTR( _TNL_ATTRIB_GENERIC1, EMIT_4UB_4F_RGBA );
+			ADD_ATTR(VERT_ATTRIB_GENERIC1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR3, swiz, MASK_XYZW, 1);
+#else
+			EMIT_ATTR( _TNL_ATTRIB_GENERIC1, EMIT_4UB_4F_ABGR );
+			ADD_ATTR(VERT_ATTRIB_GENERIC1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR3, swiz, MASK_XYZW, 1);
+#endif
 		}
+	}
 
-		sz = VB->AttribPtr[VERT_ATTRIB_FOG]->size;
-		EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1F + sz - 1);
-		InputsRead |= 1 << VERT_ATTRIB_FOG;
-		OutputsWritten |= 1 << VERT_RESULT_FOGC;
-		vap_fmt_1 |= sz << (3 * fog_id);
+	if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_POINTSIZE )) {
+		GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO);
+		InputsRead |= 1 << VERT_ATTRIB_POINT_SIZE;
+		OutputsWritten |= 1 << VERT_RESULT_PSIZ;
+		EMIT_ATTR( _TNL_ATTRIB_POINTSIZE, EMIT_1F );
+		ADD_ATTR(VERT_ATTRIB_POINT_SIZE, R300_DATA_TYPE_FLOAT_1, SWTCL_OVM_POINT_SIZE, swiz, MASK_X, 0);
 	}
 
-	if (RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
+	/**
+	 *  Sending only one texcoord component may lead to lock up,
+	 *  so for all textures always output 4 texcoord components to RS.
+	 */
+	{
 		int i;
-
+		GLuint swiz, format, hw_format;
 		for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-			if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(i) )) {
-				sz = VB->TexCoordPtr[i]->size;
+			if (fp_reads & FRAG_BIT_TEX(i)) {
+				switch (VB->TexCoordPtr[i]->size) {
+					case 1:
+						format = EMIT_1F;
+						hw_format = R300_DATA_TYPE_FLOAT_1;
+						swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ONE);
+						break;
+					case 2:
+						format = EMIT_2F;
+						hw_format = R300_DATA_TYPE_FLOAT_2;
+						swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_ZERO, SWIZZLE_ONE);
+						break;
+					case 3:
+						format = EMIT_3F;
+						hw_format = R300_DATA_TYPE_FLOAT_3;
+						swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
+						break;
+					case 4:
+						format = EMIT_4F;
+						hw_format = R300_DATA_TYPE_FLOAT_4;
+						swiz = SWIZZLE_XYZW;
+						break;
+					default:
+						continue;
+				}
 				InputsRead |= 1 << (VERT_ATTRIB_TEX0 + i);
 				OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
-				EMIT_ATTR( _TNL_ATTRIB_TEX0+i, EMIT_1F + sz - 1 );
-				vap_fmt_1 |= sz << (3 * i);
+				EMIT_ATTR(_TNL_ATTRIB_TEX(i), format);
+				ADD_ATTR(VERT_ATTRIB_TEX0 + i, hw_format, SWTCL_OVM_TEX(first_free_tex), swiz, MASK_XYZW, 0);
+				++first_free_tex;
 			}
 		}
 	}
 
 	/* RS can't put fragment position on the pixel stack, so stuff it in texcoord if needed */
-	if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_POS) && (ctx->FragmentProgram._Current->Base.InputsRead & FRAG_BIT_WPOS)) {
-		int first_free_tex = -1;
-		if (fog_id >= 0) {
-			first_free_tex = fog_id+1;
-		} else {
-			if (RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
-				int i;
-				for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-					if (!RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(i) )) {
-						first_free_tex = i;
-						break;
-					}
-				}
-			} else {
-				first_free_tex = 0;
-			}
-		}
-
-		if (first_free_tex == -1) {
+	if (fp_reads & FRAG_BIT_WPOS) {
+		if (first_free_tex >= ctx->Const.MaxTextureUnits) {
 			fprintf(stderr, "\tout of free texcoords to write w pos\n");
 			_mesa_exit(-1);
 		}
 
-		sz = VB->AttribPtr[VERT_ATTRIB_POS]->size;
 		InputsRead |= 1 << (VERT_ATTRIB_TEX0 + first_free_tex);
 		OutputsWritten |= 1 << (VERT_RESULT_TEX0 + first_free_tex);
-		EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_1F + sz - 1 );
-		vap_fmt_1 |= sz << (3 * first_free_tex);
-	}
-
-	for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
-		if (InputsRead & (1 << i)) {
-			inputs[i] = nr++;
-		} else {
-			inputs[i] = -1;
-		}
+		EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F );
+		ADD_ATTR(VERT_ATTRIB_POS, R300_DATA_TYPE_FLOAT_4, SWTCL_OVM_TEX(first_free_tex), SWIZZLE_XYZW, MASK_XYZW, 0);
+		++first_free_tex;
 	}
 
-	/* Fixed, apply to vir0 only */
-	if (InputsRead & (1 << VERT_ATTRIB_POS))
-		inputs[VERT_ATTRIB_POS] = 0;
-	if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
-		inputs[VERT_ATTRIB_COLOR0] = 2;
-	if (InputsRead & (1 << VERT_ATTRIB_COLOR1))
-		inputs[VERT_ATTRIB_COLOR1] = 3;
-	if (InputsRead & (1 << VERT_ATTRIB_FOG))
-		inputs[VERT_ATTRIB_FOG] = 6 + fog_id;
-	for (i = VERT_ATTRIB_TEX0; i <= VERT_ATTRIB_TEX7; i++)
-		if (InputsRead & (1 << i))
-			inputs[i] = 6 + (i - VERT_ATTRIB_TEX0);
-
-	for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
-		if (InputsRead & (1 << i)) {
-			tab[nr++] = i;
+	if (fp_reads & FRAG_BIT_FOGC) {
+		if (first_free_tex >= ctx->Const.MaxTextureUnits) {
+			fprintf(stderr, "\tout of free texcoords to write fog coordinate\n");
+			_mesa_exit(-1);
 		}
-	}
-
-	for (i = 0; i < nr; i++) {
-		int ci;
 
-		swizzle[i][0] = SWIZZLE_ZERO;
-		swizzle[i][1] = SWIZZLE_ZERO;
-		swizzle[i][2] = SWIZZLE_ZERO;
-		swizzle[i][3] = SWIZZLE_ONE;
-
-		for (ci = 0; ci < VB->AttribPtr[tab[i]]->size; ci++) {
-			swizzle[i][ci] = ci;
-		}
+		InputsRead |= 1 << VERT_ATTRIB_FOG;
+		OutputsWritten |= 1 << VERT_RESULT_FOGC;
+		GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO);
+		EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1F );
+		ADD_ATTR(VERT_ATTRIB_FOG, R300_DATA_TYPE_FLOAT_1, SWTCL_OVM_TEX(first_free_tex), swiz, MASK_XYZW, 0);
 	}
 
 	R300_NEWPRIM(rmesa);
-	R300_STATECHANGE(rmesa, vir[0]);
-	((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count =
-		r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
-				   VB->AttribPtr, inputs, tab, nr);
-	R300_STATECHANGE(rmesa, vir[1]);
-	((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count =
-		r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
-				   nr);
-
-	R300_STATECHANGE(rmesa, vic);
-	rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
-	rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
-
-	R300_STATECHANGE(rmesa, vof);
-	rmesa->hw.vof.cmd[R300_VOF_CNTL_0] = r300VAPOutputCntl0(ctx, OutputsWritten);
-	rmesa->hw.vof.cmd[R300_VOF_CNTL_1] = vap_fmt_1;
-
-	rmesa->swtcl.vertex_size =
-		_tnl_install_attrs( ctx,
-				    rmesa->swtcl.vertex_attrs,
-				    rmesa->swtcl.vertex_attr_count,
-				    NULL, 0 );
-
-	rmesa->swtcl.vertex_size /= 4;
+	rmesa->vbuf.num_attribs = num_attrs;
+	*_InputsRead = InputsRead;
+	*_OutputsWritten = OutputsWritten;
 
-	RENDERINPUTS_COPY( rmesa->tnl_index_bitset, index_bitset );
-
-
-	R300_STATECHANGE(rmesa, vte);
-	rmesa->hw.vte.cmd[1] = vte;
-	rmesa->hw.vte.cmd[2] = rmesa->swtcl.vertex_size;
+	RENDERINPUTS_COPY(rmesa->render_inputs_bitset, tnl->render_inputs_bitset);
 }
 
-
-/* Flush vertices in the current dma region.
- */
-static void flush_last_swtcl_prim( r300ContextPtr rmesa  )
+static void r300PrepareVertices(GLcontext *ctx)
 {
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	rmesa->dma.flush = NULL;
-
-	if (rmesa->dma.current.buf) {
-		struct r300_dma_region *current = &rmesa->dma.current;
-		GLuint current_offset = GET_START(current);
-
-		assert (current->start +
-			rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-			current->ptr);
-
-		if (rmesa->dma.current.start != rmesa->dma.current.ptr) {
-
-			r300EnsureCmdBufSpace( rmesa, rmesa->hw.max_state_size + (12*sizeof(int)), __FUNCTION__);
-
-			r300EmitState(rmesa);
-
-			r300EmitVertexAOS( rmesa,
-					   rmesa->swtcl.vertex_size,
-					   current_offset);
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	GLuint InputsRead, OutputsWritten;
 
-			r300EmitVbufPrim( rmesa,
-					  rmesa->swtcl.hw_primitive,
-					  rmesa->swtcl.numverts);
+	r300ChooseSwtclVertexFormat(ctx, &InputsRead, &OutputsWritten);
+	r300SetupVAP(ctx, InputsRead, OutputsWritten);
 
-			r300EmitCacheFlush(rmesa);
-		}
+	rmesa->radeon.swtcl.vertex_size =
+		_tnl_install_attrs( ctx,
+				    rmesa->radeon.swtcl.vertex_attrs,
+				    rmesa->radeon.swtcl.vertex_attr_count,
+				    NULL, 0 );
 
-		rmesa->swtcl.numverts = 0;
-		current->start = current->ptr;
-	}
+	rmesa->radeon.swtcl.vertex_size /= 4;
 }
 
-/* Alloc space in the current dma region.
- */
-static void *
-r300AllocDmaLowVerts( r300ContextPtr rmesa, int nverts, int vsize )
-{
-	GLuint bytes = vsize * nverts;
-
-	if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end )
-		r300RefillCurrentDmaRegion( rmesa, bytes);
-
-	if (!rmesa->dma.flush) {
-		rmesa->radeon.glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-		rmesa->dma.flush = flush_last_swtcl_prim;
-	}
-
-	ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
-	ASSERT( rmesa->dma.flush == flush_last_swtcl_prim );
-	ASSERT( rmesa->dma.current.start +
-		rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-		rmesa->dma.current.ptr );
-
-	{
-		GLubyte *head = (GLubyte *) (rmesa->dma.current.address + rmesa->dma.current.ptr);
-		rmesa->dma.current.ptr += bytes;
-		rmesa->swtcl.numverts += nverts;
-		return head;
-	}
-}
 
 static GLuint reduced_prim[] = {
-  GL_POINTS,
-  GL_LINES,
-  GL_LINES,
-  GL_LINES,
-  GL_TRIANGLES,
-  GL_TRIANGLES,
-  GL_TRIANGLES,
-  GL_TRIANGLES,
-  GL_TRIANGLES,
-  GL_TRIANGLES,
+	GL_POINTS,
+	GL_LINES,
+	GL_LINES,
+	GL_LINES,
+	GL_TRIANGLES,
+	GL_TRIANGLES,
+	GL_TRIANGLES,
+	GL_TRIANGLES,
+	GL_TRIANGLES,
+	GL_TRIANGLES,
 };
 
 static void r300RasterPrimitive( GLcontext *ctx, GLuint prim );
-static void r300RenderPrimitive( GLcontext *ctx, GLenum prim );
-//static void r300ResetLineStipple( GLcontext *ctx );
 
 /***********************************************************************
  *                    Emit primitives as inline vertices               *
@@ -405,15 +280,13 @@ static void r300RenderPrimitive( GLcontext *ctx, GLenum prim );
 #undef LOCAL_VARS
 #undef ALLOC_VERTS
 #define CTX_ARG r300ContextPtr rmesa
-#define GET_VERTEX_DWORDS() rmesa->swtcl.vertex_size
-#define ALLOC_VERTS( n, size ) r300AllocDmaLowVerts( rmesa, n, size * 4 )
+#define GET_VERTEX_DWORDS() rmesa->radeon.swtcl.vertex_size
+#define ALLOC_VERTS( n, size ) rcommonAllocDmaLowVerts( &rmesa->radeon, n, size * 4 )
 #define LOCAL_VARS						\
    r300ContextPtr rmesa = R300_CONTEXT(ctx);		\
-   const char *r300verts = (char *)rmesa->swtcl.verts;
+   const char *r300verts = (char *)rmesa->radeon.swtcl.verts;
 #define VERT(x) (r300Vertex *)(r300verts + ((x) * vertsize * sizeof(int)))
 #define VERTEX r300Vertex
-#define DO_DEBUG_VERTS (1 && (RADEON_DEBUG & DEBUG_VERTS))
-#define PRINT_VERTEX(x)
 #undef TAG
 #define TAG(x) r300_##x
 #include "tnl_dd/t_dd_triemit.h"
@@ -433,9 +306,8 @@ static void r300RenderPrimitive( GLcontext *ctx, GLenum prim );
  *              Build render functions from dd templates               *
  ***********************************************************************/
 
-#define R300_TWOSIDE_BIT	0x01
-#define R300_UNFILLED_BIT	0x02
-#define R300_MAX_TRIFUNC	0x04
+#define R300_UNFILLED_BIT	0x01
+#define R300_MAX_TRIFUNC	0x02
 
 static struct {
    tnl_points_func	        points;
@@ -446,9 +318,9 @@ static struct {
 
 #define DO_FALLBACK  0
 #define DO_UNFILLED (IND & R300_UNFILLED_BIT)
-#define DO_TWOSIDE  (IND & R300_TWOSIDE_BIT)
+#define DO_TWOSIDE   0
 #define DO_FLAT      0
-#define DO_OFFSET     0
+#define DO_OFFSET    0
 #define DO_TRI       1
 #define DO_QUAD      1
 #define DO_LINE      1
@@ -468,33 +340,39 @@ static struct {
 #define VERT_Y(_v) _v->v.y
 #define VERT_Z(_v) _v->v.z
 #define AREA_IS_CCW( a ) (a < 0)
-#define GET_VERTEX(e) (rmesa->swtcl.verts + (e*rmesa->swtcl.vertex_size*sizeof(int)))
-
-/* Only used to pull back colors into vertices (ie, we know color is
- * floating point).
- */
-#define R300_COLOR( dst, src )				\
-do {							\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[0], (src)[2]);	\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[1], (src)[1]);	\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[2], (src)[0]);	\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[3], (src)[3]);	\
+#define GET_VERTEX(e) (rmesa->radeon.swtcl.verts + (e*rmesa->radeon.swtcl.vertex_size*sizeof(int)))
+
+#define VERT_SET_RGBA( v, c ) \
+do { \
+   r300_color_t *color = (r300_color_t *)&((v)->ui[coloroffset]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->red, (c)[0]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->green, (c)[1]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->blue, (c)[2]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->alpha, (c)[3]); \
 } while (0)
 
-#define VERT_SET_RGBA( v, c )    if (coloroffset) R300_COLOR( v->ub4[coloroffset], c )
-#define VERT_COPY_RGBA( v0, v1 ) if (coloroffset) v0->ui[coloroffset] = v1->ui[coloroffset]
-#define VERT_SAVE_RGBA( idx )    if (coloroffset) color[idx] = v[idx]->ui[coloroffset]
-#define VERT_RESTORE_RGBA( idx ) if (coloroffset) v[idx]->ui[coloroffset] = color[idx]
+#define VERT_COPY_RGBA( v0, v1 ) v0->ui[coloroffset] = v1->ui[coloroffset]
+
+#define VERT_SET_SPEC( v0, c ) \
+do { \
+   if (specoffset) { \
+   UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.red, (c)[0]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.green, (c)[1]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.blue, (c)[2]); \
+   } \
+} while (0)
 
-#define R300_SPEC( dst, src )				\
-do {							\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[0], (src)[2]);	\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[1], (src)[1]);	\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[2], (src)[0]);	\
+#define VERT_COPY_SPEC( v0, v1 ) \
+do { \
+   if (specoffset) { \
+       v0->v.specular.red = v1->v.specular.red; \
+       v0->v.specular.green = v1->v.specular.green; \
+       v0->v.specular.blue = v1->v.specular.blue; \
+   } \
 } while (0)
 
-#define VERT_SET_SPEC( v, c )    if (specoffset) R300_SPEC( v->ub4[specoffset], c )
-#define VERT_COPY_SPEC( v0, v1 ) if (specoffset) COPY_3V(v0->ub4[specoffset], v1->ub4[specoffset])
+#define VERT_SAVE_RGBA( idx )    color[idx] = v[idx]->ui[coloroffset]
+#define VERT_RESTORE_RGBA( idx ) v[idx]->ui[coloroffset] = color[idx]
 #define VERT_SAVE_SPEC( idx )    if (specoffset) spec[idx] = v[idx]->ui[specoffset]
 #define VERT_RESTORE_SPEC( idx ) if (specoffset) v[idx]->ui[specoffset] = spec[idx]
 
@@ -514,7 +392,7 @@ do {							\
  ***********************************************************************/
 
 #define RASTERIZE(x) r300RasterPrimitive( ctx, reduced_prim[x] )
-#define RENDER_PRIMITIVE rmesa->swtcl.render_primitive
+#define RENDER_PRIMITIVE rmesa->radeon.swtcl.render_primitive
 #undef TAG
 #define TAG(x) x
 #include "tnl_dd/t_dd_unfilled.h"
@@ -530,26 +408,15 @@ do {							\
 #define TAG(x) x
 #include "tnl_dd/t_dd_tritmp.h"
 
-#define IND (R300_TWOSIDE_BIT)
-#define TAG(x) x##_twoside
-#include "tnl_dd/t_dd_tritmp.h"
-
 #define IND (R300_UNFILLED_BIT)
 #define TAG(x) x##_unfilled
 #include "tnl_dd/t_dd_tritmp.h"
 
-#define IND (R300_TWOSIDE_BIT|R300_UNFILLED_BIT)
-#define TAG(x) x##_twoside_unfilled
-#include "tnl_dd/t_dd_tritmp.h"
-
-
 
 static void init_rast_tab( void )
 {
    init();
-   init_twoside();
    init_unfilled();
-   init_twoside_unfilled();
 }
 
 /**********************************************************************/
@@ -571,8 +438,8 @@ static void init_rast_tab( void )
 #undef LOCAL_VARS
 #define LOCAL_VARS						\
    r300ContextPtr rmesa = R300_CONTEXT(ctx);		\
-   const GLuint vertsize = rmesa->swtcl.vertex_size;		\
-   const char *r300verts = (char *)rmesa->swtcl.verts;		\
+   const GLuint vertsize = rmesa->radeon.swtcl.vertex_size;		\
+   const char *r300verts = (char *)rmesa->radeon.swtcl.verts;		\
    const GLuint * const elt = TNL_CONTEXT(ctx)->vb.Elts;	\
    const GLboolean stipple = ctx->Line.StippleFlag;		\
    (void) elt; (void) stipple;
@@ -601,10 +468,9 @@ static void r300ChooseRenderState( GLcontext *ctx )
 	GLuint index = 0;
 	GLuint flags = ctx->_TriangleCaps;
 
-	if (flags & DD_TRI_LIGHT_TWOSIDE) index |= R300_TWOSIDE_BIT;
 	if (flags & DD_TRI_UNFILLED)      index |= R300_UNFILLED_BIT;
 
-	if (index != rmesa->swtcl.RenderIndex) {
+	if (index != rmesa->radeon.swtcl.RenderIndex) {
 		tnl->Driver.Render.Points = rast_tab[index].points;
 		tnl->Driver.Render.Line = rast_tab[index].line;
 		tnl->Driver.Render.ClippedLine = rast_tab[index].line;
@@ -621,30 +487,32 @@ static void r300ChooseRenderState( GLcontext *ctx )
 			tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
 		}
 
-		rmesa->swtcl.RenderIndex = index;
+		rmesa->radeon.swtcl.RenderIndex = index;
 	}
 }
 
 
-static void r300RenderStart(GLcontext *ctx)
+void r300RenderStart(GLcontext *ctx)
 {
-        r300ContextPtr rmesa = R300_CONTEXT( ctx );
+	r300ContextPtr rmesa = R300_CONTEXT( ctx );
 
 	r300ChooseRenderState(ctx);
-	r300SetVertexFormat(ctx);
+	r300PrepareVertices(ctx);
+
+	r300ValidateBuffers(ctx);
 
 	r300UpdateShaders(rmesa);
 	r300UpdateShaderStates(rmesa);
 
 	r300EmitCacheFlush(rmesa);
 
-	if (rmesa->dma.flush != 0 &&
-	    rmesa->dma.flush != flush_last_swtcl_prim)
-		rmesa->dma.flush( rmesa );
-
+	/* TODO: investigate whether the flush optimisation can be restored, if needed */
+	if (rmesa->radeon.dma.flush != NULL) {
+		rmesa->radeon.dma.flush(ctx);
+	}
 }
 
-static void r300RenderFinish(GLcontext *ctx)
+void r300RenderFinish(GLcontext *ctx)
 {
 }
 
@@ -652,28 +520,26 @@ static void r300RasterPrimitive( GLcontext *ctx, GLuint hwprim )
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 
-	if (rmesa->swtcl.hw_primitive != hwprim) {
-	        R300_NEWPRIM( rmesa );
-		rmesa->swtcl.hw_primitive = hwprim;
+	if (rmesa->radeon.swtcl.hw_primitive != hwprim) {
+		R300_NEWPRIM( rmesa );
+		rmesa->radeon.swtcl.hw_primitive = hwprim;
 	}
 }
 
-static void r300RenderPrimitive(GLcontext *ctx, GLenum prim)
+void r300RenderPrimitive(GLcontext *ctx, GLenum prim)
 {
 
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	rmesa->swtcl.render_primitive = prim;
+	rmesa->radeon.swtcl.render_primitive = prim;
 
 	if ((prim == GL_TRIANGLES) && (ctx->_TriangleCaps & DD_TRI_UNFILLED))
-	  return;
+		return;
 
 	r300RasterPrimitive( ctx, reduced_prim[prim] );
 }
 
-static void r300ResetLineStipple(GLcontext *ctx)
+void r300ResetLineStipple(GLcontext *ctx)
 {
-
-
 }
 
 void r300InitSwtcl(GLcontext *ctx)
@@ -699,50 +565,68 @@ void r300InitSwtcl(GLcontext *ctx)
 	_tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12,
 			    48 * sizeof(GLfloat) );
 
-	rmesa->swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
-	rmesa->swtcl.RenderIndex = ~0;
-	rmesa->swtcl.render_primitive = GL_TRIANGLES;
-	rmesa->swtcl.hw_primitive = 0;
+	rmesa->radeon.swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
+	rmesa->radeon.swtcl.RenderIndex = ~0;
+	rmesa->radeon.swtcl.render_primitive = GL_TRIANGLES;
+	rmesa->radeon.swtcl.hw_primitive = 0;
 
 	_tnl_invalidate_vertex_state( ctx, ~0 );
 	_tnl_invalidate_vertices( ctx, ~0 );
-	RENDERINPUTS_ZERO( rmesa->tnl_index_bitset );
 
 	_tnl_need_projected_coords( ctx, GL_FALSE );
-	r300ChooseRenderState(ctx);
 }
 
 void r300DestroySwtcl(GLcontext *ctx)
 {
 }
 
-void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, GLuint offset)
+static void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, struct radeon_bo *bo, GLuint offset)
 {
-	int cmd_reserved = 0;
-	int cmd_written = 0;
+	BATCH_LOCALS(&rmesa->radeon);
 
-	drm_radeon_cmd_header_t *cmd = NULL;
 	if (RADEON_DEBUG & DEBUG_VERTS)
-	  fprintf(stderr, "%s:  vertex_size %d, offset 0x%x \n",
-		  __FUNCTION__, vertex_size, offset);
-
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, 2), 2);
-	e32(1);
-	e32(vertex_size | (vertex_size << 8));
-	e32(offset);
+		fprintf(stderr, "%s:  vertex_size %d, offset 0x%x \n",
+			__FUNCTION__, vertex_size, offset);
+
+	BEGIN_BATCH(7);	/* NOTE(review): only four OUT_* macros follow; 7 presumably accounts for their expansion - confirm */
+	OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, 2);
+	OUT_BATCH(1);	/* NOTE(review): presumably the number of vertex arrays - confirm */
+	OUT_BATCH(vertex_size | (vertex_size << 8));	/* vertex_size ORed with itself shifted left by 8 */
+	OUT_BATCH_RELOC(offset, bo, offset, RADEON_GEM_DOMAIN_GTT, 0, 0);	/* offset emitted as a relocation against bo */
+	END_BATCH();
 }
 
-void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vertex_nr)
+static void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vertex_nr)
 {
-
-	int cmd_reserved = 0;
-	int cmd_written = 0;
+	BATCH_LOCALS(&rmesa->radeon);
 	int type, num_verts;
-	drm_radeon_cmd_header_t *cmd = NULL;
 
 	type = r300PrimitiveType(rmesa, primitive);
 	num_verts = r300NumVerts(rmesa, vertex_nr, primitive);
 
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0), 0);
-	e32(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (num_verts << 16) | type);
+	BEGIN_BATCH(3);
+	OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
+	OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (num_verts << 16) | type);	/* vertex count shifted into bits 16 and up, ORed with the primitive type */
+	END_BATCH();
+}
+
+void r300_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+	/* Reserve command-buffer space, emit state and scissor, then draw swtcl.numverts vertices from dma.current at current_offset. */
+	rcommonEnsureCmdBufSpace(&rmesa->radeon,
+			   rmesa->radeon.hw.max_state_size + (12*sizeof(int)),
+			   __FUNCTION__);
+	radeonEmitState(&rmesa->radeon);
+	r300_emit_scissor(ctx);
+	r300EmitVertexAOS(rmesa,
+			rmesa->radeon.swtcl.vertex_size,
+			rmesa->radeon.dma.current, current_offset);
+
+	r300EmitVbufPrim(rmesa,
+		   rmesa->radeon.swtcl.hw_primitive,
+		   rmesa->radeon.swtcl.numverts);
+	r300EmitCacheFlush(rmesa);
+	COMMIT_BATCH();
 }
diff --git a/src/mesa/drivers/dri/r300/r300_swtcl.h b/src/mesa/drivers/dri/r300/r300_swtcl.h
index 55df53c1ad..c271d26546 100644
--- a/src/mesa/drivers/dri/r300/r300_swtcl.h
+++ b/src/mesa/drivers/dri/r300/r300_swtcl.h
@@ -39,7 +39,27 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "swrast/swrast.h"
 #include "r300_context.h"
 
+/*
+ * Definitions of the OVM locations of vertex attributes for non-TCL hardware.
+ */
+#define SWTCL_OVM_POS 0
+#define SWTCL_OVM_COLOR0 2
+#define SWTCL_OVM_COLOR1 3
+#define SWTCL_OVM_COLOR2 4
+#define SWTCL_OVM_COLOR3 5
+#define SWTCL_OVM_TEX(n) ((n) + 6)
+#define SWTCL_OVM_POINT_SIZE 15
+
+extern void r300ChooseSwtclVertexFormat(GLcontext *ctx, GLuint *InputsRead,  GLuint *OutputsWritten);
+
 extern void r300InitSwtcl( GLcontext *ctx );
 extern void r300DestroySwtcl( GLcontext *ctx );
 
+extern void r300RenderStart(GLcontext *ctx);
+extern void r300RenderFinish(GLcontext *ctx);
+extern void r300RenderPrimitive(GLcontext *ctx, GLenum prim);
+extern void r300ResetLineStipple(GLcontext *ctx);
+
+extern void r300_swtcl_flush(GLcontext *ctx, uint32_t current_offset);
+
 #endif
diff --git a/src/mesa/drivers/dri/r300/r300_tex.c b/src/mesa/drivers/dri/r300/r300_tex.c
index 7c699ec572..0af5bb4f46 100644
--- a/src/mesa/drivers/dri/r300/r300_tex.c
+++ b/src/mesa/drivers/dri/r300/r300_tex.c
@@ -38,6 +38,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/context.h"
 #include "main/enums.h"
 #include "main/image.h"
+#include "main/mipmap.h"
 #include "main/simple_list.h"
 #include "main/texformat.h"
 #include "main/texstore.h"
@@ -49,6 +50,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_context.h"
 #include "r300_state.h"
 #include "r300_ioctl.h"
+#include "radeon_mipmap_tree.h"
 #include "r300_tex.h"
 
 #include "xmlpool.h"
@@ -77,20 +79,20 @@ static unsigned int translate_wrap_mode(GLenum wrapmode)
  *
  * \param t Texture object whose wrap modes are to be set
  */
-static void r300UpdateTexWrap(r300TexObjPtr t)
+static void r300UpdateTexWrap(radeonTexObjPtr t)
 {
-	struct gl_texture_object *tObj = t->base.tObj;
+	struct gl_texture_object *tObj = &t->base;
 
-	t->filter &=
+	t->pp_txfilter &=
 	    ~(R300_TX_WRAP_S_MASK | R300_TX_WRAP_T_MASK | R300_TX_WRAP_R_MASK);
 
-	t->filter |= translate_wrap_mode(tObj->WrapS) << R300_TX_WRAP_S_SHIFT;
+	t->pp_txfilter |= translate_wrap_mode(tObj->WrapS) << R300_TX_WRAP_S_SHIFT;
 
 	if (tObj->Target != GL_TEXTURE_1D) {
-		t->filter |= translate_wrap_mode(tObj->WrapT) << R300_TX_WRAP_T_SHIFT;
+		t->pp_txfilter |= translate_wrap_mode(tObj->WrapT) << R300_TX_WRAP_T_SHIFT;
 
 		if (tObj->Target == GL_TEXTURE_3D)
-			t->filter |= translate_wrap_mode(tObj->WrapR) << R300_TX_WRAP_R_SHIFT;
+			t->pp_txfilter |= translate_wrap_mode(tObj->WrapR) << R300_TX_WRAP_R_SHIFT;
 	}
 }
 
@@ -117,10 +119,13 @@ static GLuint aniso_filter(GLfloat anisotropy)
  * \param magf Texture magnification mode
  * \param anisotropy Maximum anisotropy level
  */
-static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat anisotropy)
+static void r300SetTexFilter(radeonTexObjPtr t, GLenum minf, GLenum magf, GLfloat anisotropy)
 {
-	t->filter &= ~(R300_TX_MIN_FILTER_MASK | R300_TX_MIN_FILTER_MIP_MASK | R300_TX_MAG_FILTER_MASK | R300_TX_MAX_ANISO_MASK);
-	t->filter_1 &= ~R300_EDGE_ANISO_EDGE_ONLY;
+	/* Force revalidation to account for switches from/to mipmapping. */
+	t->validated = GL_FALSE;
+
+	t->pp_txfilter &= ~(R300_TX_MIN_FILTER_MASK | R300_TX_MIN_FILTER_MIP_MASK | R300_TX_MAG_FILTER_MASK | R300_TX_MAX_ANISO_MASK);
+	t->pp_txfilter_1 &= ~R300_EDGE_ANISO_EDGE_ONLY;
 
 	/* Note that EXT_texture_filter_anisotropic is extremely vague about
 	 * how anisotropic filtering interacts with the "normal" filter modes.
@@ -128,7 +133,7 @@ static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat
 	 * filter settings completely. This includes driconf's settings.
 	 */
 	if (anisotropy >= 2.0 && (minf != GL_NEAREST) && (magf != GL_NEAREST)) {
-		t->filter |= R300_TX_MAG_FILTER_ANISO
+		t->pp_txfilter |= R300_TX_MAG_FILTER_ANISO
 			| R300_TX_MIN_FILTER_ANISO
 			| R300_TX_MIN_FILTER_MIP_LINEAR
 			| aniso_filter(anisotropy);
@@ -139,22 +144,22 @@ static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat
 
 	switch (minf) {
 	case GL_NEAREST:
-		t->filter |= R300_TX_MIN_FILTER_NEAREST;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_NEAREST;
 		break;
 	case GL_LINEAR:
-		t->filter |= R300_TX_MIN_FILTER_LINEAR;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_LINEAR;
 		break;
 	case GL_NEAREST_MIPMAP_NEAREST:
-		t->filter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_NEAREST;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_NEAREST;
 		break;
 	case GL_NEAREST_MIPMAP_LINEAR:
-		t->filter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_LINEAR;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_LINEAR;
 		break;
 	case GL_LINEAR_MIPMAP_NEAREST:
-		t->filter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_NEAREST;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_NEAREST;
 		break;
 	case GL_LINEAR_MIPMAP_LINEAR:
-		t->filter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_LINEAR;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_LINEAR;
 		break;
 	}
 
@@ -163,15 +168,15 @@ static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat
 	 */
 	switch (magf) {
 	case GL_NEAREST:
-		t->filter |= R300_TX_MAG_FILTER_NEAREST;
+		t->pp_txfilter |= R300_TX_MAG_FILTER_NEAREST;
 		break;
 	case GL_LINEAR:
-		t->filter |= R300_TX_MAG_FILTER_LINEAR;
+		t->pp_txfilter |= R300_TX_MAG_FILTER_LINEAR;
 		break;
 	}
 }
 
-static void r300SetTexBorderColor(r300TexObjPtr t, const GLfloat color[4])
+static void r300SetTexBorderColor(radeonTexObjPtr t, const GLfloat color[4])
 {
 	GLubyte c[4];
 	CLAMPED_FLOAT_TO_UBYTE(c[0], color[0]);
@@ -182,729 +187,6 @@ static void r300SetTexBorderColor(r300TexObjPtr t, const GLfloat color[4])
 }
 
 /**
- * Allocate space for and load the mesa images into the texture memory block.
- * This will happen before drawing with a new texture, or drawing with a
- * texture after it was swapped out or teximaged again.
- */
-
-static r300TexObjPtr r300AllocTexObj(struct gl_texture_object *texObj)
-{
-	r300TexObjPtr t;
-
-	t = CALLOC_STRUCT(r300_tex_obj);
-	texObj->DriverData = t;
-	if (t != NULL) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE) {
-			fprintf(stderr, "%s( %p, %p )\n", __FUNCTION__,
-				(void *)texObj, (void *)t);
-		}
-
-		/* Initialize non-image-dependent parts of the state:
-		 */
-		t->base.tObj = texObj;
-		t->border_fallback = GL_FALSE;
-
-		make_empty_list(&t->base);
-
-		r300UpdateTexWrap(t);
-		r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter, texObj->MaxAnisotropy);
-		r300SetTexBorderColor(t, texObj->BorderColor);
-	}
-
-	return t;
-}
-
-/* try to find a format which will only need a memcopy */
-static const struct gl_texture_format *r300Choose8888TexFormat(GLenum srcFormat,
-							       GLenum srcType)
-{
-	const GLuint ui = 1;
-	const GLubyte littleEndian = *((const GLubyte *)&ui);
-
-	if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
-	    (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && !littleEndian) ||
-	    (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
-	    (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && littleEndian)) {
-		return &_mesa_texformat_rgba8888;
-	} else if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
-		   (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && littleEndian) ||
-		   (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
-		   (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && !littleEndian)) {
-		return &_mesa_texformat_rgba8888_rev;
-	} else if (srcFormat == GL_BGRA && ((srcType == GL_UNSIGNED_BYTE && !littleEndian) ||
-					    srcType == GL_UNSIGNED_INT_8_8_8_8)) {
-		return &_mesa_texformat_argb8888_rev;
-	} else if (srcFormat == GL_BGRA && ((srcType == GL_UNSIGNED_BYTE && littleEndian) ||
-					    srcType == GL_UNSIGNED_INT_8_8_8_8_REV)) {
-		return &_mesa_texformat_argb8888;
-	} else
-		return _dri_texformat_argb8888;
-}
-
-static const struct gl_texture_format *r300ChooseTextureFormat(GLcontext * ctx,
-							       GLint
-							       internalFormat,
-							       GLenum format,
-							       GLenum type)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	const GLboolean do32bpt =
-	    (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_32);
-	const GLboolean force16bpt =
-	    (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FORCE_16);
-	(void)format;
-
-#if 0
-	fprintf(stderr, "InternalFormat=%s(%d) type=%s format=%s\n",
-		_mesa_lookup_enum_by_nr(internalFormat), internalFormat,
-		_mesa_lookup_enum_by_nr(type), _mesa_lookup_enum_by_nr(format));
-	fprintf(stderr, "do32bpt=%d force16bpt=%d\n", do32bpt, force16bpt);
-#endif
-
-	switch (internalFormat) {
-	case 4:
-	case GL_RGBA:
-	case GL_COMPRESSED_RGBA:
-		switch (type) {
-		case GL_UNSIGNED_INT_10_10_10_2:
-		case GL_UNSIGNED_INT_2_10_10_10_REV:
-			return do32bpt ? _dri_texformat_argb8888 :
-			    _dri_texformat_argb1555;
-		case GL_UNSIGNED_SHORT_4_4_4_4:
-		case GL_UNSIGNED_SHORT_4_4_4_4_REV:
-			return _dri_texformat_argb4444;
-		case GL_UNSIGNED_SHORT_5_5_5_1:
-		case GL_UNSIGNED_SHORT_1_5_5_5_REV:
-			return _dri_texformat_argb1555;
-		default:
-			return do32bpt ? r300Choose8888TexFormat(format, type) :
-			    _dri_texformat_argb4444;
-		}
-
-	case 3:
-	case GL_RGB:
-	case GL_COMPRESSED_RGB:
-		switch (type) {
-		case GL_UNSIGNED_SHORT_4_4_4_4:
-		case GL_UNSIGNED_SHORT_4_4_4_4_REV:
-			return _dri_texformat_argb4444;
-		case GL_UNSIGNED_SHORT_5_5_5_1:
-		case GL_UNSIGNED_SHORT_1_5_5_5_REV:
-			return _dri_texformat_argb1555;
-		case GL_UNSIGNED_SHORT_5_6_5:
-		case GL_UNSIGNED_SHORT_5_6_5_REV:
-			return _dri_texformat_rgb565;
-		default:
-			return do32bpt ? _dri_texformat_argb8888 :
-			    _dri_texformat_rgb565;
-		}
-
-	case GL_RGBA8:
-	case GL_RGB10_A2:
-	case GL_RGBA12:
-	case GL_RGBA16:
-		return !force16bpt ?
-		    r300Choose8888TexFormat(format,
-					    type) : _dri_texformat_argb4444;
-
-	case GL_RGBA4:
-	case GL_RGBA2:
-		return _dri_texformat_argb4444;
-
-	case GL_RGB5_A1:
-		return _dri_texformat_argb1555;
-
-	case GL_RGB8:
-	case GL_RGB10:
-	case GL_RGB12:
-	case GL_RGB16:
-		return !force16bpt ? _dri_texformat_argb8888 :
-		    _dri_texformat_rgb565;
-
-	case GL_RGB5:
-	case GL_RGB4:
-	case GL_R3_G3_B2:
-		return _dri_texformat_rgb565;
-
-	case GL_ALPHA:
-	case GL_ALPHA4:
-	case GL_ALPHA8:
-	case GL_ALPHA12:
-	case GL_ALPHA16:
-	case GL_COMPRESSED_ALPHA:
-		return _dri_texformat_a8;
-
-	case 1:
-	case GL_LUMINANCE:
-	case GL_LUMINANCE4:
-	case GL_LUMINANCE8:
-	case GL_LUMINANCE12:
-	case GL_LUMINANCE16:
-	case GL_COMPRESSED_LUMINANCE:
-		return _dri_texformat_l8;
-
-	case 2:
-	case GL_LUMINANCE_ALPHA:
-	case GL_LUMINANCE4_ALPHA4:
-	case GL_LUMINANCE6_ALPHA2:
-	case GL_LUMINANCE8_ALPHA8:
-	case GL_LUMINANCE12_ALPHA4:
-	case GL_LUMINANCE12_ALPHA12:
-	case GL_LUMINANCE16_ALPHA16:
-	case GL_COMPRESSED_LUMINANCE_ALPHA:
-		return _dri_texformat_al88;
-
-	case GL_INTENSITY:
-	case GL_INTENSITY4:
-	case GL_INTENSITY8:
-	case GL_INTENSITY12:
-	case GL_INTENSITY16:
-	case GL_COMPRESSED_INTENSITY:
-		return _dri_texformat_i8;
-
-	case GL_YCBCR_MESA:
-		if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
-		    type == GL_UNSIGNED_BYTE)
-			return &_mesa_texformat_ycbcr;
-		else
-			return &_mesa_texformat_ycbcr_rev;
-
-	case GL_RGB_S3TC:
-	case GL_RGB4_S3TC:
-	case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
-		return &_mesa_texformat_rgb_dxt1;
-
-	case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
-		return &_mesa_texformat_rgba_dxt1;
-
-	case GL_RGBA_S3TC:
-	case GL_RGBA4_S3TC:
-	case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
-		return &_mesa_texformat_rgba_dxt3;
-
-	case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
-		return &_mesa_texformat_rgba_dxt5;
-
-	case GL_ALPHA16F_ARB:
-		return &_mesa_texformat_alpha_float16;
-	case GL_ALPHA32F_ARB:
-		return &_mesa_texformat_alpha_float32;
-	case GL_LUMINANCE16F_ARB:
-		return &_mesa_texformat_luminance_float16;
-	case GL_LUMINANCE32F_ARB:
-		return &_mesa_texformat_luminance_float32;
-	case GL_LUMINANCE_ALPHA16F_ARB:
-		return &_mesa_texformat_luminance_alpha_float16;
-	case GL_LUMINANCE_ALPHA32F_ARB:
-		return &_mesa_texformat_luminance_alpha_float32;
-	case GL_INTENSITY16F_ARB:
-		return &_mesa_texformat_intensity_float16;
-	case GL_INTENSITY32F_ARB:
-		return &_mesa_texformat_intensity_float32;
-	case GL_RGB16F_ARB:
-		return &_mesa_texformat_rgba_float16;
-	case GL_RGB32F_ARB:
-		return &_mesa_texformat_rgba_float32;
-	case GL_RGBA16F_ARB:
-		return &_mesa_texformat_rgba_float16;
-	case GL_RGBA32F_ARB:
-		return &_mesa_texformat_rgba_float32;
-
-	case GL_DEPTH_COMPONENT:
-	case GL_DEPTH_COMPONENT16:
-	case GL_DEPTH_COMPONENT24:
-	case GL_DEPTH_COMPONENT32:
-#if 0
-		switch (type) {
-		case GL_UNSIGNED_BYTE:
-		case GL_UNSIGNED_SHORT:
-			return &_mesa_texformat_z16;
-		case GL_UNSIGNED_INT:
-			return &_mesa_texformat_z32;
-		case GL_UNSIGNED_INT_24_8_EXT:
-		default:
-			return &_mesa_texformat_z24_s8;
-		}
-#else
-		return &_mesa_texformat_z16;
-#endif
-
-	default:
-		_mesa_problem(ctx,
-			      "unexpected internalFormat 0x%x in r300ChooseTextureFormat",
-			      (int)internalFormat);
-		return NULL;
-	}
-
-	return NULL;		/* never get here */
-}
-
-static GLboolean
-r300ValidateClientStorage(GLcontext * ctx, GLenum target,
-			  GLint internalFormat,
-			  GLint srcWidth, GLint srcHeight,
-			  GLenum format, GLenum type, const void *pixels,
-			  const struct gl_pixelstore_attrib *packing,
-			  struct gl_texture_object *texObj,
-			  struct gl_texture_image *texImage)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr, "intformat %s format %s type %s\n",
-			_mesa_lookup_enum_by_nr(internalFormat),
-			_mesa_lookup_enum_by_nr(format),
-			_mesa_lookup_enum_by_nr(type));
-
-	if (!ctx->Unpack.ClientStorage)
-		return 0;
-
-	if (ctx->_ImageTransferState ||
-	    texImage->IsCompressed || texObj->GenerateMipmap)
-		return 0;
-
-	/* This list is incomplete, may be different on ppc???
-	 */
-	switch (internalFormat) {
-	case GL_RGBA:
-		if (format == GL_BGRA && type == GL_UNSIGNED_INT_8_8_8_8_REV) {
-			texImage->TexFormat = _dri_texformat_argb8888;
-		} else
-			return 0;
-		break;
-
-	case GL_RGB:
-		if (format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5) {
-			texImage->TexFormat = _dri_texformat_rgb565;
-		} else
-			return 0;
-		break;
-
-	case GL_YCBCR_MESA:
-		if (format == GL_YCBCR_MESA &&
-		    type == GL_UNSIGNED_SHORT_8_8_REV_APPLE) {
-			texImage->TexFormat = &_mesa_texformat_ycbcr_rev;
-		} else if (format == GL_YCBCR_MESA &&
-			   (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
-			    type == GL_UNSIGNED_BYTE)) {
-			texImage->TexFormat = &_mesa_texformat_ycbcr;
-		} else
-			return 0;
-		break;
-
-	default:
-		return 0;
-	}
-
-	/* Could deal with these packing issues, but currently don't:
-	 */
-	if (packing->SkipPixels ||
-	    packing->SkipRows || packing->SwapBytes || packing->LsbFirst) {
-		return 0;
-	}
-
-	GLint srcRowStride = _mesa_image_row_stride(packing, srcWidth,
-						    format, type);
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr, "%s: srcRowStride %d/%x\n",
-			__FUNCTION__, srcRowStride, srcRowStride);
-
-	/* Could check this later in upload, pitch restrictions could be
-	 * relaxed, but would need to store the image pitch somewhere,
-	 * as packing details might change before image is uploaded:
-	 */
-	if (!r300IsGartMemory(rmesa, pixels, srcHeight * srcRowStride)
-	    || (srcRowStride & 63))
-		return 0;
-
-	/* Have validated that _mesa_transfer_teximage would be a straight
-	 * memcpy at this point.  NOTE: future calls to TexSubImage will
-	 * overwrite the client data.  This is explicitly mentioned in the
-	 * extension spec.
-	 */
-	texImage->Data = (void *)pixels;
-	texImage->IsClientData = GL_TRUE;
-	texImage->RowStride = srcRowStride / texImage->TexFormat->TexelBytes;
-
-	return 1;
-}
-
-static void r300TexImage1D(GLcontext * ctx, GLenum target, GLint level,
-			   GLint internalFormat,
-			   GLint width, GLint border,
-			   GLenum format, GLenum type, const GLvoid * pixels,
-			   const struct gl_pixelstore_attrib *packing,
-			   struct gl_texture_object *texObj,
-			   struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage1D");
-			return;
-		}
-	}
-
-	/* Note, this will call ChooseTextureFormat */
-	_mesa_store_teximage1d(ctx, target, level, internalFormat,
-			       width, border, format, type, pixels,
-			       &ctx->Unpack, texObj, texImage);
-
-	t->dirty_images[0] |= (1 << level);
-}
-
-static void r300TexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
-			      GLint xoffset,
-			      GLsizei width,
-			      GLenum format, GLenum type,
-			      const GLvoid * pixels,
-			      const struct gl_pixelstore_attrib *packing,
-			      struct gl_texture_object *texObj,
-			      struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-
-	assert(t);		/* this _should_ be true */
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage1D");
-			return;
-		}
-	}
-
-	_mesa_store_texsubimage1d(ctx, target, level, xoffset, width,
-				  format, type, pixels, packing, texObj,
-				  texImage);
-
-	t->dirty_images[0] |= (1 << level);
-}
-
-static void r300TexImage2D(GLcontext * ctx, GLenum target, GLint level,
-			   GLint internalFormat,
-			   GLint width, GLint height, GLint border,
-			   GLenum format, GLenum type, const GLvoid * pixels,
-			   const struct gl_pixelstore_attrib *packing,
-			   struct gl_texture_object *texObj,
-			   struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-	GLuint face;
-
-	/* which cube face or ordinary 2D image */
-	switch (target) {
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		face =
-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-		ASSERT(face < 6);
-		break;
-	default:
-		face = 0;
-	}
-
-	if (t != NULL) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage2D");
-			return;
-		}
-	}
-
-	texImage->IsClientData = GL_FALSE;
-
-	if (r300ValidateClientStorage(ctx, target,
-				      internalFormat,
-				      width, height,
-				      format, type, pixels,
-				      packing, texObj, texImage)) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using client storage\n",
-				__FUNCTION__);
-	} else {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using normal storage\n",
-				__FUNCTION__);
-
-		/* Normal path: copy (to cached memory) and eventually upload
-		 * via another copy to GART memory and then a blit...  Could
-		 * eliminate one copy by going straight to (permanent) GART.
-		 *
-		 * Note, this will call r300ChooseTextureFormat.
-		 */
-		_mesa_store_teximage2d(ctx, target, level, internalFormat,
-				       width, height, border, format, type,
-				       pixels, &ctx->Unpack, texObj, texImage);
-
-		t->dirty_images[face] |= (1 << level);
-	}
-}
-
-static void r300TexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
-			      GLint xoffset, GLint yoffset,
-			      GLsizei width, GLsizei height,
-			      GLenum format, GLenum type,
-			      const GLvoid * pixels,
-			      const struct gl_pixelstore_attrib *packing,
-			      struct gl_texture_object *texObj,
-			      struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-	GLuint face;
-
-	/* which cube face or ordinary 2D image */
-	switch (target) {
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		face =
-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-		ASSERT(face < 6);
-		break;
-	default:
-		face = 0;
-	}
-
-	assert(t);		/* this _should_ be true */
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage2D");
-			return;
-		}
-	}
-
-	_mesa_store_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
-				  height, format, type, pixels, packing, texObj,
-				  texImage);
-
-	t->dirty_images[face] |= (1 << level);
-}
-
-static void r300CompressedTexImage2D(GLcontext * ctx, GLenum target,
-				     GLint level, GLint internalFormat,
-				     GLint width, GLint height, GLint border,
-				     GLsizei imageSize, const GLvoid * data,
-				     struct gl_texture_object *texObj,
-				     struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-	GLuint face;
-
-	/* which cube face or ordinary 2D image */
-	switch (target) {
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		face =
-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-		ASSERT(face < 6);
-		break;
-	default:
-		face = 0;
-	}
-
-	if (t != NULL) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY,
-				    "glCompressedTexImage2D");
-			return;
-		}
-	}
-
-	texImage->IsClientData = GL_FALSE;
-
-	/* can't call this, different parameters. Would never evaluate to true anyway currently */
-#if 0
-	if (r300ValidateClientStorage(ctx, target,
-				      internalFormat,
-				      width, height,
-				      format, type, pixels,
-				      packing, texObj, texImage)) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using client storage\n",
-				__FUNCTION__);
-	} else
-#endif
-	{
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using normal storage\n",
-				__FUNCTION__);
-
-		/* Normal path: copy (to cached memory) and eventually upload
-		 * via another copy to GART memory and then a blit...  Could
-		 * eliminate one copy by going straight to (permanent) GART.
-		 *
-		 * Note, this will call r300ChooseTextureFormat.
-		 */
-		_mesa_store_compressed_teximage2d(ctx, target, level,
-						  internalFormat, width, height,
-						  border, imageSize, data,
-						  texObj, texImage);
-
-		t->dirty_images[face] |= (1 << level);
-	}
-}
-
-static void r300CompressedTexSubImage2D(GLcontext * ctx, GLenum target,
-					GLint level, GLint xoffset,
-					GLint yoffset, GLsizei width,
-					GLsizei height, GLenum format,
-					GLsizei imageSize, const GLvoid * data,
-					struct gl_texture_object *texObj,
-					struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-	GLuint face;
-
-	/* which cube face or ordinary 2D image */
-	switch (target) {
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		face =
-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-		ASSERT(face < 6);
-		break;
-	default:
-		face = 0;
-	}
-
-	assert(t);		/* this _should_ be true */
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY,
-				    "glCompressedTexSubImage3D");
-			return;
-		}
-	}
-
-	_mesa_store_compressed_texsubimage2d(ctx, target, level, xoffset,
-					     yoffset, width, height, format,
-					     imageSize, data, texObj, texImage);
-
-	t->dirty_images[face] |= (1 << level);
-}
-
-static void r300TexImage3D(GLcontext * ctx, GLenum target, GLint level,
-			   GLint internalFormat,
-			   GLint width, GLint height, GLint depth,
-			   GLint border,
-			   GLenum format, GLenum type, const GLvoid * pixels,
-			   const struct gl_pixelstore_attrib *packing,
-			   struct gl_texture_object *texObj,
-			   struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage3D");
-			return;
-		}
-	}
-
-	texImage->IsClientData = GL_FALSE;
-
-#if 0
-	if (r300ValidateClientStorage(ctx, target,
-				      internalFormat,
-				      width, height,
-				      format, type, pixels,
-				      packing, texObj, texImage)) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using client storage\n",
-				__FUNCTION__);
-	} else
-#endif
-	{
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using normal storage\n",
-				__FUNCTION__);
-
-		/* Normal path: copy (to cached memory) and eventually upload
-		 * via another copy to GART memory and then a blit...  Could
-		 * eliminate one copy by going straight to (permanent) GART.
-		 *
-		 * Note, this will call r300ChooseTextureFormat.
-		 */
-		_mesa_store_teximage3d(ctx, target, level, internalFormat,
-				       width, height, depth, border,
-				       format, type, pixels,
-				       &ctx->Unpack, texObj, texImage);
-
-		t->dirty_images[0] |= (1 << level);
-	}
-}
-
-static void
-r300TexSubImage3D(GLcontext * ctx, GLenum target, GLint level,
-		  GLint xoffset, GLint yoffset, GLint zoffset,
-		  GLsizei width, GLsizei height, GLsizei depth,
-		  GLenum format, GLenum type,
-		  const GLvoid * pixels,
-		  const struct gl_pixelstore_attrib *packing,
-		  struct gl_texture_object *texObj,
-		  struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-
-/*     fprintf(stderr, "%s\n", __FUNCTION__); */
-
-	assert(t);		/* this _should_ be true */
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage3D");
-			return;
-		}
-		texObj->DriverData = t;
-	}
-
-	_mesa_store_texsubimage3d(ctx, target, level, xoffset, yoffset, zoffset,
-				  width, height, depth,
-				  format, type, pixels, packing, texObj,
-				  texImage);
-
-	t->dirty_images[0] |= (1 << level);
-}
-
-/**
  * Changes variables and flags for a state update, which will happen at the
  * next UpdateTextureState
  */
@@ -913,7 +195,7 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 			     struct gl_texture_object *texObj,
 			     GLenum pname, const GLfloat * params)
 {
-	r300TexObjPtr t = (r300TexObjPtr) texObj->DriverData;
+	radeonTexObj* t = radeon_tex_obj(texObj);
 
 	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
 		fprintf(stderr, "%s( %s )\n", __FUNCTION__,
@@ -946,7 +228,11 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 		 * we just have to rely on loading the right subset of mipmap levels
 		 * to simulate a clamped LOD.
 		 */
-		driSwapOutTextureObject((driTextureObject *) t);
+		if (t->mt) {
+			radeon_miptree_unreference(t->mt);
+			t->mt = 0;
+			t->validated = GL_FALSE;
+		}
 		break;
 
 	case GL_DEPTH_TEXTURE_MODE:
@@ -969,27 +255,10 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 	}
 }
 
-static void r300BindTexture(GLcontext * ctx, GLenum target,
-			    struct gl_texture_object *texObj)
-{
-	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
-		fprintf(stderr, "%s( %p ) unit=%d\n", __FUNCTION__,
-			(void *)texObj, ctx->Texture.CurrentUnit);
-	}
-
-	if ((target == GL_TEXTURE_1D)
-	    || (target == GL_TEXTURE_2D)
-	    || (target == GL_TEXTURE_3D)
-	    || (target == GL_TEXTURE_CUBE_MAP)
-	    || (target == GL_TEXTURE_RECTANGLE_NV)) {
-		assert(texObj->DriverData != NULL);
-	}
-}
-
 static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+	radeonTexObj* t = radeon_tex_obj(texObj);
 
 	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
 		fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
@@ -997,14 +266,24 @@ static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 			_mesa_lookup_enum_by_nr(texObj->Target));
 	}
 
-	if (t != NULL) {
-		if (rmesa) {
-			R300_FIREVERTICES(rmesa);
-		}
+	if (rmesa) {
+		int i;
+		radeon_firevertices(&rmesa->radeon);
+
+		for(i = 0; i < R300_MAX_TEXTURE_UNITS; ++i)
+			if (rmesa->hw.textures[i] == t)
+				rmesa->hw.textures[i] = 0;
+	}
 
-		driDestroyTextureObject(t);
+	if (t->bo) {
+		radeon_bo_unref(t->bo);
+		t->bo = NULL;
+	}
+
+	if (t->mt) {
+		radeon_miptree_unreference(t->mt);
+		t->mt = 0;
 	}
-	/* Free mipmap images and the texture object itself */
 	_mesa_delete_texture_object(ctx, texObj);
 }
 
@@ -1013,8 +292,6 @@ static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
  * Called via ctx->Driver.NewTextureObject.
  * Note: this function will be called during context creation to
  * allocate the default texture objects.
- * Note: we could use containment here to 'derive' the driver-specific
- * texture object from the core mesa gl_texture_object.  Not done at this time.
  * Fixup MaxAnisotropy according to user preference.
  */
 static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
@@ -1022,14 +299,23 @@ static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
 						      GLenum target)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_object *obj;
-	obj = _mesa_new_texture_object(ctx, name, target);
-	if (!obj)
-		return NULL;
-	obj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
+	radeonTexObj* t = CALLOC_STRUCT(radeon_tex_obj);
+	if (!t) return NULL;	/* allocation failed: do not touch t->base below */
 
-	r300AllocTexObj(obj);
-	return obj;
+	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
+		fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
+			t, _mesa_lookup_enum_by_nr(target));
+	}
+
+	_mesa_initialize_texture_object(&t->base, name, target);
+	t->base.MaxAnisotropy = rmesa->radeon.initialMaxAnisotropy;
+
+	/* Initialize hardware state */
+	r300UpdateTexWrap(t);
+	r300SetTexFilter(t, t->base.MinFilter, t->base.MagFilter, t->base.MaxAnisotropy);
+	r300SetTexBorderColor(t, t->base.BorderColor);
+
+	return &t->base;
 }
 
 void r300InitTextureFuncs(struct dd_function_table *functions)
@@ -1037,22 +323,30 @@ void r300InitTextureFuncs(struct dd_function_table *functions)
 	/* Note: we only plug in the functions we implement in the driver
 	 * since _mesa_init_driver_functions() was already called.
 	 */
-	functions->ChooseTextureFormat = r300ChooseTextureFormat;
-	functions->TexImage1D = r300TexImage1D;
-	functions->TexImage2D = r300TexImage2D;
-	functions->TexImage3D = r300TexImage3D;
-	functions->TexSubImage1D = r300TexSubImage1D;
-	functions->TexSubImage2D = r300TexSubImage2D;
-	functions->TexSubImage3D = r300TexSubImage3D;
+	functions->NewTextureImage = radeonNewTextureImage;
+	functions->FreeTexImageData = radeonFreeTexImageData;
+	functions->MapTexture = radeonMapTexture;
+	functions->UnmapTexture = radeonUnmapTexture;
+
+	functions->ChooseTextureFormat = radeonChooseTextureFormat_mesa;
+	functions->TexImage1D = radeonTexImage1D;
+	functions->TexImage2D = radeonTexImage2D;
+	functions->TexImage3D = radeonTexImage3D;
+	functions->TexSubImage1D = radeonTexSubImage1D;
+	functions->TexSubImage2D = radeonTexSubImage2D;
+	functions->TexSubImage3D = radeonTexSubImage3D;
+	functions->GetTexImage = radeonGetTexImage;
+	functions->GetCompressedTexImage = radeonGetCompressedTexImage;
 	functions->NewTextureObject = r300NewTextureObject;
-	functions->BindTexture = r300BindTexture;
 	functions->DeleteTexture = r300DeleteTexture;
 	functions->IsTextureResident = driIsTextureResident;
 
 	functions->TexParameter = r300TexParameter;
 
-	functions->CompressedTexImage2D = r300CompressedTexImage2D;
-	functions->CompressedTexSubImage2D = r300CompressedTexSubImage2D;
+	functions->CompressedTexImage2D = radeonCompressedTexImage2D;
+	functions->CompressedTexSubImage2D = radeonCompressedTexSubImage2D;
+
+	functions->GenerateMipmap = radeonGenerateMipmap;
 
 	driInitTextureFormats();
 }
diff --git a/src/mesa/drivers/dri/r300/r300_tex.h b/src/mesa/drivers/dri/r300/r300_tex.h
index b86d45bfe0..8a653ea2d1 100644
--- a/src/mesa/drivers/dri/r300/r300_tex.h
+++ b/src/mesa/drivers/dri/r300/r300_tex.h
@@ -37,16 +37,17 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 extern void r300SetDepthTexMode(struct gl_texture_object *tObj);
 
+extern void r300SetTexBuffer(__DRIcontext *pDRICtx, GLint target,
+			     __DRIdrawable *dPriv);
+
+extern void r300SetTexBuffer2(__DRIcontext *pDRICtx, GLint target,
+			      GLint format, __DRIdrawable *dPriv);
+
 extern void r300SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 			     unsigned long long offset, GLint depth,
 			     GLuint pitch);
 
-extern void r300UpdateTextureState(GLcontext * ctx);
-
-extern int r300UploadTexImages(r300ContextPtr rmesa, r300TexObjPtr t,
-			       GLuint face);
-
-extern void r300DestroyTexObj(r300ContextPtr rmesa, r300TexObjPtr t);
+extern GLboolean r300ValidateBuffers(GLcontext * ctx);
 
 extern void r300InitTextureFuncs(struct dd_function_table *functions);
 
diff --git a/src/mesa/drivers/dri/r300/r300_texmem.c b/src/mesa/drivers/dri/r300/r300_texmem.c
deleted file mode 100644
index a89ab83d94..0000000000
--- a/src/mesa/drivers/dri/r300/r300_texmem.c
+++ /dev/null
@@ -1,568 +0,0 @@
-/**************************************************************************
-
-Copyright (C) Tungsten Graphics 2002.  All Rights Reserved.
-The Weather Channel, Inc. funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86
-license. This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation on the rights to use, copy, modify, merge, publish,
-distribute, sub license, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NON-INFRINGEMENT. IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR THEIR
-SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
-IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-**************************************************************************/
-
-/**
- * \file
- *
- * \author Gareth Hughes <gareth@valinux.com>
- *
- * \author Kevin E. Martin <martin@valinux.com>
- */
-
-#include <errno.h>
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/context.h"
-#include "main/colormac.h"
-#include "main/macros.h"
-#include "main/simple_list.h"
-#include "main/texobj.h"
-#include "radeon_reg.h"		/* gets definition for usleep */
-#include "r300_context.h"
-#include "r300_state.h"
-#include "r300_cmdbuf.h"
-#include "radeon_ioctl.h"
-#include "r300_tex.h"
-#include "r300_ioctl.h"
-#include <unistd.h>		/* for usleep() */
-
-#ifdef USER_BUFFERS
-#include "r300_mem.h"
-#endif
-
-/**
- * Destroy any device-dependent state associated with the texture.  This may
- * include NULLing out hardware state that points to the texture.
- */
-void r300DestroyTexObj(r300ContextPtr rmesa, r300TexObjPtr t)
-{
-	int i;
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE) {
-		fprintf(stderr, "%s( %p, %p )\n", __FUNCTION__,
-			(void *)t, (void *)t->base.tObj);
-	}
-
-	for (i = 0; i < rmesa->radeon.glCtx->Const.MaxTextureUnits; i++) {
-		if (rmesa->state.texture.unit[i].texobj == t->base.tObj) {
-			_mesa_reference_texobj(&rmesa->state.texture.unit[i].texobj, NULL);
-		}
-	}
-}
-
-/* ------------------------------------------------------------
- * Texture image conversions
- */
-
-static void r300UploadGARTClientSubImage(r300ContextPtr rmesa,
-					 r300TexObjPtr t,
-					 struct gl_texture_image *texImage,
-					 GLint hwlevel,
-					 GLint x, GLint y,
-					 GLint width, GLint height)
-{
-	const struct gl_texture_format *texFormat = texImage->TexFormat;
-	GLuint srcPitch, dstPitch;
-	int blit_format;
-	int srcOffset;
-
-	/*
-	 * XXX it appears that we always upload the full image, not a subimage.
-	 * I.e. x==0, y==0, width=texWidth, height=texWidth.  If this is ever
-	 * changed, the src pitch will have to change.
-	 */
-	switch (texFormat->TexelBytes) {
-	case 1:
-		blit_format = R300_CP_COLOR_FORMAT_CI8;
-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-		break;
-	case 2:
-		blit_format = R300_CP_COLOR_FORMAT_RGB565;
-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-		break;
-	case 4:
-		blit_format = R300_CP_COLOR_FORMAT_ARGB8888;
-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-		break;
-	case 8:
-	case 16:
-		blit_format = R300_CP_COLOR_FORMAT_CI8;
-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-		break;
-	default:
-		return;
-	}
-
-	t->image[0][hwlevel].data = texImage->Data;
-	srcOffset = r300GartOffsetFromVirtual(rmesa, texImage->Data);
-
-	assert(srcOffset != ~0);
-
-	/* Don't currently need to cope with small pitches?
-	 */
-	width = texImage->Width;
-	height = texImage->Height;
-
-	if (texFormat->TexelBytes > 4) {
-		width *= texFormat->TexelBytes;
-	}
-
-	r300EmitWait(rmesa, R300_WAIT_3D);
-
-	r300EmitBlit(rmesa, blit_format,
-		     srcPitch,
-		     srcOffset,
-		     dstPitch,
-		     t->bufAddr,
-		     x,
-		     y,
-		     t->image[0][hwlevel].x + x,
-		     t->image[0][hwlevel].y + y, width, height);
-
-	r300EmitWait(rmesa, R300_WAIT_2D);
-}
-
-static void r300UploadRectSubImage(r300ContextPtr rmesa,
-				   r300TexObjPtr t,
-				   struct gl_texture_image *texImage,
-				   GLint x, GLint y, GLint width, GLint height)
-{
-	const struct gl_texture_format *texFormat = texImage->TexFormat;
-	int blit_format, dstPitch, done;
-
-	switch (texFormat->TexelBytes) {
-	case 1:
-		blit_format = R300_CP_COLOR_FORMAT_CI8;
-		break;
-	case 2:
-		blit_format = R300_CP_COLOR_FORMAT_RGB565;
-		break;
-	case 4:
-		blit_format = R300_CP_COLOR_FORMAT_ARGB8888;
-		break;
-	case 8:
-	case 16:
-		blit_format = R300_CP_COLOR_FORMAT_CI8;
-		break;
-	default:
-		return;
-	}
-
-	t->image[0][0].data = texImage->Data;
-
-	/* Currently don't need to cope with small pitches.
-	 */
-	width = texImage->Width;
-	height = texImage->Height;
-	dstPitch = t->pitch;
-
-	if (texFormat->TexelBytes > 4) {
-		width *= texFormat->TexelBytes;
-	}
-
-	if (rmesa->prefer_gart_client_texturing && texImage->IsClientData) {
-		/* In this case, could also use GART texturing.  This is
-		 * currently disabled, but has been tested & works.
-		 */
-		t->offset = r300GartOffsetFromVirtual(rmesa, texImage->Data);
-		t->pitch = texImage->RowStride * texFormat->TexelBytes - 32;
-
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr,
-				"Using GART texturing for rectangular client texture\n");
-
-		/* Release FB memory allocated for this image:
-		 */
-		/* FIXME This may not be correct as driSwapOutTextureObject sets
-		 * FIXME dirty_images.  It may be fine, though.
-		 */
-		if (t->base.memBlock) {
-			driSwapOutTextureObject((driTextureObject *) t);
-		}
-	} else if (texImage->IsClientData) {
-		/* Data already in GART memory, with usable pitch.
-		 */
-		GLuint srcPitch;
-		srcPitch = texImage->RowStride * texFormat->TexelBytes;
-		r300EmitBlit(rmesa,
-			     blit_format,
-			     srcPitch,
-			     r300GartOffsetFromVirtual(rmesa, texImage->Data),
-			     dstPitch, t->bufAddr, 0, 0, 0, 0, width, height);
-	} else {
-		/* Data not in GART memory, or bad pitch.
-		 */
-		for (done = 0; done < height;) {
-			struct r300_dma_region region;
-			int lines =
-			    MIN2(height - done, RADEON_BUFFER_SIZE / dstPitch);
-			int src_pitch;
-			char *tex;
-
-			src_pitch = texImage->RowStride * texFormat->TexelBytes;
-
-			tex = (char *)texImage->Data + done * src_pitch;
-
-			memset(&region, 0, sizeof(region));
-			r300AllocDmaRegion(rmesa, &region, lines * dstPitch,
-					   1024);
-
-			/* Copy texdata to dma:
-			 */
-			if (RADEON_DEBUG & DEBUG_TEXTURE)
-				fprintf(stderr,
-					"%s: src_pitch %d dst_pitch %d\n",
-					__FUNCTION__, src_pitch, dstPitch);
-
-			if (src_pitch == dstPitch) {
-				memcpy(region.address + region.start, tex,
-				       lines * src_pitch);
-			} else {
-				char *buf = region.address + region.start;
-				int i;
-				for (i = 0; i < lines; i++) {
-					memcpy(buf, tex, src_pitch);
-					buf += dstPitch;
-					tex += src_pitch;
-				}
-			}
-
-			r300EmitWait(rmesa, R300_WAIT_3D);
-
-			/* Blit to framebuffer
-			 */
-			r300EmitBlit(rmesa,
-				     blit_format,
-				     dstPitch, GET_START(&region),
-				     dstPitch | (t->tile_bits >> 16),
-				     t->bufAddr, 0, 0, 0, done, width, lines);
-
-			r300EmitWait(rmesa, R300_WAIT_2D);
-#ifdef USER_BUFFERS
-			r300_mem_use(rmesa, region.buf->id);
-#endif
-
-			r300ReleaseDmaRegion(rmesa, &region, __FUNCTION__);
-			done += lines;
-		}
-	}
-}
-
-/**
- * Upload the texture image associated with texture \a t at the specified
- * level at the address relative to \a start.
- */
-static void r300UploadSubImage(r300ContextPtr rmesa, r300TexObjPtr t,
-			       GLint hwlevel,
-			       GLint x, GLint y, GLint width, GLint height,
-			       GLuint face)
-{
-	struct gl_texture_image *texImage = NULL;
-	GLuint offset;
-	GLint imageWidth, imageHeight;
-	GLint ret;
-	drm_radeon_texture_t tex;
-	drm_radeon_tex_image_t tmp;
-	const int level = hwlevel + t->base.firstLevel;
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE) {
-		fprintf(stderr,
-			"%s( %p, %p ) level/width/height/face = %d/%d/%d/%u\n",
-			__FUNCTION__, (void *)t, (void *)t->base.tObj, level,
-			width, height, face);
-	}
-
-	ASSERT(face < 6);
-
-	/* Ensure we have a valid texture to upload */
-	if ((hwlevel < 0) || (hwlevel >= R300_MAX_TEXTURE_LEVELS)) {
-		_mesa_problem(NULL, "bad texture level in %s", __FUNCTION__);
-		return;
-	}
-
-	texImage = t->base.tObj->Image[face][level];
-
-	if (!texImage) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: texImage %d is NULL!\n",
-				__FUNCTION__, level);
-		return;
-	}
-	if (!texImage->Data) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: image data is NULL!\n",
-				__FUNCTION__);
-		return;
-	}
-
-	if (t->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-		assert(level == 0);
-		assert(hwlevel == 0);
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: image data is rectangular\n",
-				__FUNCTION__);
-		r300UploadRectSubImage(rmesa, t, texImage, x, y, width, height);
-		return;
-	} else if (texImage->IsClientData) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr,
-				"%s: image data is in GART client storage\n",
-				__FUNCTION__);
-		r300UploadGARTClientSubImage(rmesa, t, texImage, hwlevel, x, y,
-					     width, height);
-		return;
-	} else if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr, "%s: image data is in normal memory\n",
-			__FUNCTION__);
-
-	imageWidth = texImage->Width;
-	imageHeight = texImage->Height;
-
-	offset = t->bufAddr;
-
-	if (RADEON_DEBUG & (DEBUG_TEXTURE | DEBUG_IOCTL)) {
-		GLint imageX = 0;
-		GLint imageY = 0;
-		GLint blitX = t->image[face][hwlevel].x;
-		GLint blitY = t->image[face][hwlevel].y;
-		GLint blitWidth = t->image[face][hwlevel].width;
-		GLint blitHeight = t->image[face][hwlevel].height;
-		fprintf(stderr, "   upload image: %d,%d at %d,%d\n",
-			imageWidth, imageHeight, imageX, imageY);
-		fprintf(stderr, "   upload  blit: %d,%d at %d,%d\n",
-			blitWidth, blitHeight, blitX, blitY);
-		fprintf(stderr, "       blit ofs: 0x%07x level: %d/%d\n",
-			(GLuint) offset, hwlevel, level);
-	}
-
-	t->image[face][hwlevel].data = texImage->Data;
-
-	/* Init the DRM_RADEON_TEXTURE command / drm_radeon_texture_t struct.
-	 * NOTE: we're always use a 1KB-wide blit and I8 texture format.
-	 * We used to use 1, 2 and 4-byte texels and used to use the texture
-	 * width to dictate the blit width - but that won't work for compressed
-	 * textures. (Brian)
-	 * NOTE: can't do that with texture tiling. (sroland)
-	 */
-	tex.offset = offset;
-	tex.image = &tmp;
-	/* copy (x,y,width,height,data) */
-	memcpy(&tmp, &t->image[face][hwlevel], sizeof(tmp));
-
-	if (texImage->TexFormat->TexelBytes > 4) {
-		const int log2TexelBytes =
-		    (3 + (texImage->TexFormat->TexelBytes >> 4));
-		tex.format = RADEON_TXFORMAT_I8;	/* any 1-byte texel format */
-		tex.pitch =
-		    MAX2((texImage->Width * texImage->TexFormat->TexelBytes) /
-			 64, 1);
-		tex.height = imageHeight;
-		tex.width = imageWidth << log2TexelBytes;
-		tex.offset += (tmp.x << log2TexelBytes) & ~1023;
-		tmp.x = tmp.x % (1024 >> log2TexelBytes);
-		tmp.width = tmp.width << log2TexelBytes;
-	} else if (texImage->TexFormat->TexelBytes) {
-		/* use multi-byte upload scheme */
-		tex.height = imageHeight;
-		tex.width = imageWidth;
-		switch (texImage->TexFormat->TexelBytes) {
-		case 1:
-			tex.format = RADEON_TXFORMAT_I8;
-			break;
-		case 2:
-			tex.format = RADEON_TXFORMAT_AI88;
-			break;
-		case 4:
-			tex.format = RADEON_TXFORMAT_ARGB8888;
-			break;
-		}
-		tex.pitch =
-		    MAX2((texImage->Width * texImage->TexFormat->TexelBytes) /
-			 64, 1);
-		tex.offset += tmp.x & ~1023;
-		tmp.x = tmp.x % 1024;
-
-		if (t->tile_bits & R300_TXO_MICRO_TILE) {
-			/* need something like "tiled coordinates" ? */
-			tmp.y = tmp.x / (tex.pitch * 128) * 2;
-			tmp.x =
-			    tmp.x % (tex.pitch * 128) / 2 /
-			    texImage->TexFormat->TexelBytes;
-			tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
-		} else {
-			tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
-		}
-#if 1
-		if ((t->tile_bits & R300_TXO_MACRO_TILE) &&
-		    (texImage->Width * texImage->TexFormat->TexelBytes >= 256)
-		    && ((!(t->tile_bits & R300_TXO_MICRO_TILE)
-			 && (texImage->Height >= 8))
-			|| (texImage->Height >= 16))) {
-			/* weird: R200 disables macro tiling if mip width is smaller than 256 bytes,
-			   OR if height is smaller than 8 automatically, but if micro tiling is active
-			   the limit is height 16 instead ? */
-			tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
-		}
-#endif
-	} else {
-		/* In case of for instance 8x8 texture (2x2 dxt blocks),
-		   padding after the first two blocks is needed (only
-		   with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). */
-		/* set tex.height to 1/4 since 1 "macropixel" (dxt-block)
-		   has 4 real pixels. Needed so the kernel module reads
-		   the right amount of data. */
-		tex.format = RADEON_TXFORMAT_I8;	/* any 1-byte texel format */
-		tex.pitch = (R300_BLIT_WIDTH_BYTES / 64);
-		tex.height = (imageHeight + 3) / 4;
-		tex.width = (imageWidth + 3) / 4;
-		if ((t->format & R300_TX_FORMAT_DXT1) == R300_TX_FORMAT_DXT1) {
-			tex.width *= 8;
-		} else {
-			tex.width *= 16;
-		}
-	}
-
-	LOCK_HARDWARE(&rmesa->radeon);
-	do {
-		ret =
-		    drmCommandWriteRead(rmesa->radeon.dri.fd,
-					DRM_RADEON_TEXTURE, &tex,
-					sizeof(drm_radeon_texture_t));
-		if (ret) {
-			if (RADEON_DEBUG & DEBUG_IOCTL)
-				fprintf(stderr,
-					"DRM_RADEON_TEXTURE:  again!\n");
-			usleep(1);
-		}
-	} while (ret == -EAGAIN);
-
-	UNLOCK_HARDWARE(&rmesa->radeon);
-
-	if (ret) {
-		fprintf(stderr, "DRM_RADEON_TEXTURE: return = %d\n", ret);
-		fprintf(stderr, "   offset=0x%08x\n", offset);
-		fprintf(stderr, "   image width=%d height=%d\n",
-			imageWidth, imageHeight);
-		fprintf(stderr, "    blit width=%d height=%d data=%p\n",
-			t->image[face][hwlevel].width,
-			t->image[face][hwlevel].height,
-			t->image[face][hwlevel].data);
-		_mesa_exit(-1);
-	}
-}
-
-/**
- * Upload the texture images associated with texture \a t.  This might
- * require the allocation of texture memory.
- *
- * \param rmesa Context pointer
- * \param t Texture to be uploaded
- * \param face Cube map face to be uploaded.  Zero for non-cube maps.
- */
-
-int r300UploadTexImages(r300ContextPtr rmesa, r300TexObjPtr t, GLuint face)
-{
-	const int numLevels = t->base.lastLevel - t->base.firstLevel + 1;
-
-	if (t->image_override)
-		return 0;
-
-	if (RADEON_DEBUG & (DEBUG_TEXTURE | DEBUG_IOCTL)) {
-		fprintf(stderr, "%s( %p, %p ) sz=%d lvls=%d-%d\n", __FUNCTION__,
-			(void *)rmesa->radeon.glCtx, (void *)t->base.tObj,
-			t->base.totalSize, t->base.firstLevel,
-			t->base.lastLevel);
-	}
-
-	if (t->base.totalSize == 0)
-		return 0;
-
-	if (RADEON_DEBUG & DEBUG_SYNC) {
-		fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
-		radeonFinish(rmesa->radeon.glCtx);
-	}
-
-	LOCK_HARDWARE(&rmesa->radeon);
-
-	if (t->base.memBlock == NULL) {
-		int heap;
-
-		heap = driAllocateTexture(rmesa->texture_heaps, rmesa->nr_heaps,
-					  (driTextureObject *) t);
-		if (heap == -1) {
-			UNLOCK_HARDWARE(&rmesa->radeon);
-			return -1;
-		}
-
-		/* Set the base offset of the texture image */
-		t->bufAddr = rmesa->radeon.radeonScreen->texOffset[heap]
-		    + t->base.memBlock->ofs;
-		t->offset = t->bufAddr;
-
-		if (!(t->base.tObj->Image[0][0]->IsClientData)) {
-			/* hope it's safe to add that here... */
-			t->offset |= t->tile_bits;
-		}
-	}
-
-	/* Let the world know we've used this memory recently.
-	 */
-	driUpdateTextureLRU((driTextureObject *) t);
-	UNLOCK_HARDWARE(&rmesa->radeon);
-
-	/* Upload any images that are new */
-	if (t->base.dirty_images[face]) {
-		int i;
-		for (i = 0; i < numLevels; i++) {
-			if ((t->base.
-			     dirty_images[face] & (1 <<
-						   (i + t->base.firstLevel))) !=
-			    0) {
-				r300UploadSubImage(rmesa, t, i, 0, 0,
-						   t->image[face][i].width,
-						   t->image[face][i].height,
-						   face);
-			}
-		}
-		t->base.dirty_images[face] = 0;
-	}
-
-	if (RADEON_DEBUG & DEBUG_SYNC) {
-		fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
-		radeonFinish(rmesa->radeon.glCtx);
-	}
-
-	return 0;
-}
diff --git a/src/mesa/drivers/dri/r300/r300_texstate.c b/src/mesa/drivers/dri/r300/r300_texstate.c
index f6ae4b675b..6e47321246 100644
--- a/src/mesa/drivers/dri/r300/r300_texstate.c
+++ b/src/mesa/drivers/dri/r300/r300_texstate.c
@@ -47,7 +47,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_context.h"
 #include "r300_state.h"
 #include "r300_ioctl.h"
-#include "radeon_ioctl.h"
+#include "radeon_mipmap_tree.h"
 #include "r300_tex.h"
 #include "r300_reg.h"
 
@@ -117,7 +117,12 @@ static const struct tx_table {
 	_ASSIGN(INTENSITY_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, X, FL_I16)),
 	_ASSIGN(Z16, R300_EASY_TX_FORMAT(X, X, X, X, X16)),
 	_ASSIGN(Z24_S8, R300_EASY_TX_FORMAT(X, X, X, X, X24_Y8)),
+	_ASSIGN(S8_Z24, R300_EASY_TX_FORMAT(Y, Y, Y, Y, X24_Y8)),
 	_ASSIGN(Z32, R300_EASY_TX_FORMAT(X, X, X, X, X32)),
+	/* EXT_texture_sRGB */
+	_ASSIGN(SRGBA8, R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8) | R300_TX_FORMAT_GAMMA),
+	_ASSIGN(SLA8, R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8) | R300_TX_FORMAT_GAMMA),
+	_ASSIGN(SL8, R300_EASY_TX_FORMAT(X, X, X, ONE, X8) | R300_TX_FORMAT_GAMMA),
 	/* *INDENT-ON* */
 };
 
@@ -143,13 +148,12 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 		},
 	};
 	const GLuint *format;
-	r300TexObjPtr t;
+	radeonTexObjPtr t;
 
 	if (!tObj)
 		return;
 
-	t = (r300TexObjPtr) tObj->DriverData;
-
+	t = radeon_tex_obj(tObj);
 
 	switch (tObj->Image[0][tObj->BaseLevel]->TexFormat->MesaFormat) {
 	case MESA_FORMAT_Z16:
@@ -171,13 +175,13 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 
 	switch (tObj->DepthMode) {
 	case GL_LUMINANCE:
-		t->format = format[0];
+		t->pp_txformat = format[0];
 		break;
 	case GL_INTENSITY:
-		t->format = format[1];
+		t->pp_txformat = format[1];
 		break;
 	case GL_ALPHA:
-		t->format = format[2];
+		t->pp_txformat = format[2];
 		break;
 	default:
 		/* Error...which should have already been caught by higher
@@ -190,406 +194,134 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 
 
 /**
- * Compute sizes and fill in offset and blit information for the given
- * image (determined by \p face and \p level).
- *
- * \param curOffset points to the offset at which the image is to be stored
- * and is updated by this function according to the size of the image.
- */
-static void compute_tex_image_offset(
-	struct gl_texture_object *tObj,
-	GLuint face,
-	GLint level,
-	GLint* curOffset)
-{
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-	const struct gl_texture_image* texImage;
-	GLuint blitWidth = R300_BLIT_WIDTH_BYTES;
-	GLuint texelBytes;
-	GLuint size;
-
-	texImage = tObj->Image[0][level + t->base.firstLevel];
-	if (!texImage)
-		return;
-
-	texelBytes = texImage->TexFormat->TexelBytes;
-
-	/* find image size in bytes */
-	if (texImage->IsCompressed) {
-		if ((t->format & R300_TX_FORMAT_DXT1) ==
-			R300_TX_FORMAT_DXT1) {
-			// fprintf(stderr,"DXT 1 %d %08X\n", texImage->Width, t->format);
-			if ((texImage->Width + 3) < 8)	/* width one block */
-				size = texImage->CompressedSize * 4;
-			else if ((texImage->Width + 3) < 16)
-				size = texImage->CompressedSize * 2;
-			else
-				size = texImage->CompressedSize;
-		} else {
-			/* DXT3/5, 16 bytes per block */
-			WARN_ONCE
-				("DXT 3/5 suffers from multitexturing problems!\n");
-			// fprintf(stderr,"DXT 3/5 %d\n", texImage->Width);
-			if ((texImage->Width + 3) < 8)
-				size = texImage->CompressedSize * 2;
-			else
-				size = texImage->CompressedSize;
-		}
-	} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-		size =
-			((texImage->Width * texelBytes +
-			63) & ~63) * texImage->Height;
-		blitWidth = 64 / texelBytes;
-	} else if (t->tile_bits & R300_TXO_MICRO_TILE) {
-		/* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
-			though the actual offset may be different (if texture is less than
-			32 bytes width) to the untiled case */
-		int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
-		size =
-			(w * ((texImage->Height + 1) / 2)) *
-			texImage->Depth;
-		blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-	} else {
-		int w = (texImage->Width * texelBytes + 31) & ~31;
-		size = w * texImage->Height * texImage->Depth;
-		blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-	}
-	assert(size > 0);
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr, "w=%d h=%d d=%d tb=%d intFormat=%d\n",
-			texImage->Width, texImage->Height,
-			texImage->Depth,
-			texImage->TexFormat->TexelBytes,
-			texImage->InternalFormat);
-
-	/* All images are aligned to a 32-byte offset */
-	*curOffset = (*curOffset + 0x1f) & ~0x1f;
-
-	if (texelBytes) {
-		/* fix x and y coords up later together with offset */
-		t->image[face][level].x = *curOffset;
-		t->image[face][level].y = 0;
-		t->image[face][level].width =
-			MIN2(size / texelBytes, blitWidth);
-		t->image[face][level].height =
-			(size / texelBytes) / t->image[face][level].width;
-	} else {
-		t->image[face][level].x = *curOffset % R300_BLIT_WIDTH_BYTES;
-		t->image[face][level].y = *curOffset / R300_BLIT_WIDTH_BYTES;
-		t->image[face][level].width =
-			MIN2(size, R300_BLIT_WIDTH_BYTES);
-		t->image[face][level].height = size / t->image[face][level].width;
-	}
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr,
-			"level %d, face %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
-			level, face, texImage->Width, texImage->Height,
-			t->image[face][level].x, t->image[face][level].y,
-			t->image[face][level].width, t->image[face][level].height,
-			size, *curOffset);
-
-	*curOffset += size;
-}
-
-
-
-/**
- * This function computes the number of bytes of storage needed for
- * the given texture object (all mipmap levels, all cube faces).
- * The \c image[face][level].x/y/width/height parameters for upload/blitting
- * are computed here.  \c filter, \c format, etc. will be set here
- * too.
+ * Compute the cached hardware register values for the given texture object.
  *
  * \param rmesa Context pointer
- * \param tObj GL texture object whose images are to be posted to
- *                 hardware state.
+ * \param t the r300 texture object
  */
-static void r300SetTexImages(r300ContextPtr rmesa,
-			     struct gl_texture_object *tObj)
+static void setup_hardware_state(r300ContextPtr rmesa, radeonTexObj *t)
 {
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-	const struct gl_texture_image *baseImage =
-	    tObj->Image[0][tObj->BaseLevel];
-	GLint curOffset;
-	GLint i, texelBytes;
-	GLint numLevels;
-	GLint log2Width, log2Height, log2Depth;
-
-	/* Set the hardware texture format
-	 */
+	const struct gl_texture_image *firstImage;
+	int firstlevel = t->mt ? t->mt->firstLevel : 0;
+	    
+	firstImage = t->base.Image[0][firstlevel];
+
 	if (!t->image_override
-	    && VALID_FORMAT(baseImage->TexFormat->MesaFormat)) {
-		if (baseImage->TexFormat->BaseFormat == GL_DEPTH_COMPONENT) {
-			r300SetDepthTexMode(tObj);
+	    && VALID_FORMAT(firstImage->TexFormat->MesaFormat)) {
+		if (firstImage->TexFormat->BaseFormat == GL_DEPTH_COMPONENT) {
+			r300SetDepthTexMode(&t->base);
 		} else {
-			t->format = tx_table[baseImage->TexFormat->MesaFormat].format;
+			t->pp_txformat = tx_table[firstImage->TexFormat->MesaFormat].format;
 		}
 
-		t->filter |= tx_table[baseImage->TexFormat->MesaFormat].filter;
+		t->pp_txfilter |= tx_table[firstImage->TexFormat->MesaFormat].filter;
 	} else if (!t->image_override) {
 		_mesa_problem(NULL, "unexpected texture format in %s",
 			      __FUNCTION__);
 		return;
 	}
 
-	texelBytes = baseImage->TexFormat->TexelBytes;
-
-	/* Compute which mipmap levels we really want to send to the hardware.
-	 */
-	driCalculateTextureFirstLastLevel((driTextureObject *) t);
-	log2Width = tObj->Image[0][t->base.firstLevel]->WidthLog2;
-	log2Height = tObj->Image[0][t->base.firstLevel]->HeightLog2;
-	log2Depth = tObj->Image[0][t->base.firstLevel]->DepthLog2;
-
-	numLevels = t->base.lastLevel - t->base.firstLevel + 1;
+	if (t->image_override && t->bo)
+		return;
 
-	assert(numLevels <= R300_MAX_TEXTURE_LEVELS);
+	t->pp_txsize = (((firstImage->Width - 1) << R300_TX_WIDTHMASK_SHIFT)
+			| ((firstImage->Height - 1) << R300_TX_HEIGHTMASK_SHIFT)
+			| ((firstImage->DepthLog2) << R300_TX_DEPTHMASK_SHIFT)
+			| ((t->mt->lastLevel - t->mt->firstLevel) << R300_TX_MAX_MIP_LEVEL_SHIFT));
 
-	/* Calculate mipmap offsets and dimensions for blitting (uploading)
-	 * The idea is that we lay out the mipmap levels within a block of
-	 * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
-	 */
 	t->tile_bits = 0;
 
-	/* figure out if this texture is suitable for tiling. */
-#if 0				/* Disabled for now */
-	if (texelBytes) {
-		if ((tObj->Target != GL_TEXTURE_RECTANGLE_NV) &&
-		    /* texrect might be able to use micro tiling too in theory? */
-		    (baseImage->Height > 1)) {
-
-			/* allow 32 (bytes) x 1 mip (which will use two times the space
-			   the non-tiled version would use) max if base texture is large enough */
-			if ((numLevels == 1) ||
-			    (((baseImage->Width * texelBytes /
-			       baseImage->Height) <= 32)
-			     && (baseImage->Width * texelBytes > 64))
-			    ||
-			    ((baseImage->Width * texelBytes /
-			      baseImage->Height) <= 16)) {
-				t->tile_bits |= R300_TXO_MICRO_TILE;
-			}
-		}
-
-		if (tObj->Target != GL_TEXTURE_RECTANGLE_NV) {
-			/* we can set macro tiling even for small textures, they will be untiled anyway */
-			t->tile_bits |= R300_TXO_MACRO_TILE;
-		}
-	}
-#endif
-
-	curOffset = 0;
+	if (t->base.Target == GL_TEXTURE_CUBE_MAP)
+		t->pp_txformat |= R300_TX_FORMAT_CUBIC_MAP;
+	if (t->base.Target == GL_TEXTURE_3D)
+		t->pp_txformat |= R300_TX_FORMAT_3D;
 
-	if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
-		ASSERT(log2Width == log2Height);
-		t->format |= R300_TX_FORMAT_CUBIC_MAP;
 
-		for(i = 0; i < numLevels; i++) {
-			GLuint face;
-			for(face = 0; face < 6; face++)
-				compute_tex_image_offset(tObj, face, i, &curOffset);
-		}
-	} else {
-		if (tObj->Target == GL_TEXTURE_3D)
-                	t->format |= R300_TX_FORMAT_3D;
-
-		for (i = 0; i < numLevels; i++)
-			compute_tex_image_offset(tObj, 0, i, &curOffset);
-	}
-
-	/* Align the total size of texture memory block.
-	 */
-	t->base.totalSize =
-	    (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
-
-	t->size =
-	    (((tObj->Image[0][t->base.firstLevel]->Width -
-	       1) << R300_TX_WIDTHMASK_SHIFT)
-	     | ((tObj->Image[0][t->base.firstLevel]->Height - 1) <<
-		R300_TX_HEIGHTMASK_SHIFT)
-	     | ((tObj->Image[0][t->base.firstLevel]->DepthLog2) <<
-		R300_TX_DEPTHMASK_SHIFT))
-	    | ((numLevels - 1) << R300_TX_MAX_MIP_LEVEL_SHIFT);
-
-	t->pitch = 0;
-
-	/* Only need to round to nearest 32 for textures, but the blitter
-	 * requires 64-byte aligned pitches, and we may/may not need the
-	 * blitter.   NPOT only!
-	 */
-	if (baseImage->IsCompressed) {
-		t->pitch |=
-		    (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
-	} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-		unsigned int align = (64 / texelBytes) - 1;
-		t->pitch |= ((tObj->Image[0][t->base.firstLevel]->Width *
-			     texelBytes) + 63) & ~(63);
-		t->size |= R300_TX_SIZE_TXPITCH_EN;
+	if (t->base.Target == GL_TEXTURE_RECTANGLE_NV) {
+		unsigned int align = (64 / t->mt->bpp) - 1;
+		t->pp_txsize |= R300_TX_SIZE_TXPITCH_EN;
 		if (!t->image_override)
-			t->pitch_reg =
-			    (((tObj->Image[0][t->base.firstLevel]->Width) +
-			      align) & ~align) - 1;
-	} else {
-		t->pitch |=
-		    ((tObj->Image[0][t->base.firstLevel]->Width *
-		      texelBytes) + 63) & ~(63);
+			t->pp_txpitch = ((firstImage->Width + align) & ~align) - 1;
 	}
 
 	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-	    if (tObj->Image[0][t->base.firstLevel]->Width > 2048)
-		t->pitch_reg |= R500_TXWIDTH_BIT11;
-	    if (tObj->Image[0][t->base.firstLevel]->Height > 2048)
-		t->pitch_reg |= R500_TXHEIGHT_BIT11;
+	    if (firstImage->Width > 2048)
+		t->pp_txpitch |= R500_TXWIDTH_BIT11;
+	    if (firstImage->Height > 2048)
+		t->pp_txpitch |= R500_TXHEIGHT_BIT11;
 	}
 }
 
-/* ================================================================
- * Texture unit state management
+/**
+ * Ensure the given texture is ready for rendering: validate its mipmap tree,
+ * then refresh the cached hardware register state and mark it validated.
+ * Returns GL_FALSE (nothing updated) if the mipmap tree cannot be validated.
  */
-
-static GLboolean r300EnableTexture2D(GLcontext * ctx, int unit)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-
-	ASSERT(tObj->Target == GL_TEXTURE_2D || tObj->Target == GL_TEXTURE_1D);
-
-	if (t->base.dirty_images[0]) {
-		R300_FIREVERTICES(rmesa);
-
-		r300SetTexImages(rmesa, tObj);
-		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
-		if (!t->base.memBlock && !t->image_override)
-			return GL_FALSE;
-	}
-
-	return GL_TRUE;
-}
-
-static GLboolean r300EnableTexture3D(GLcontext * ctx, int unit)
+static GLboolean r300_validate_texture(GLcontext * ctx, struct gl_texture_object *texObj)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+	radeonTexObj *t = radeon_tex_obj(texObj);
 
-	ASSERT(tObj->Target == GL_TEXTURE_3D);
-
-	/* r300 does not support mipmaps for 3D textures. */
-	if ((tObj->MinFilter != GL_NEAREST) && (tObj->MinFilter != GL_LINEAR)) {
+	if (!radeon_validate_texture_miptree(ctx, texObj))
 		return GL_FALSE;
-	}
 
-	if (t->base.dirty_images[0]) {
-		R300_FIREVERTICES(rmesa);
-		r300SetTexImages(rmesa, tObj);
-		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
-		if (!t->base.memBlock)
-			return GL_FALSE;
-	}
+	/* Configure the hardware registers (more precisely, the cached version
+	 * of the hardware registers). */
+	setup_hardware_state(rmesa, t);
 
+	t->validated = GL_TRUE;	/* only reached when miptree validation succeeded */
 	return GL_TRUE;
 }
 
-static GLboolean r300EnableTextureCube(GLcontext * ctx, int unit)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-	GLuint face;
-
-	ASSERT(tObj->Target == GL_TEXTURE_CUBE_MAP);
-
-	if (t->base.dirty_images[0] || t->base.dirty_images[1] ||
-	    t->base.dirty_images[2] || t->base.dirty_images[3] ||
-	    t->base.dirty_images[4] || t->base.dirty_images[5]) {
-		/* flush */
-		R300_FIREVERTICES(rmesa);
-		/* layout memory space, once for all faces */
-		r300SetTexImages(rmesa, tObj);
-	}
-
-	/* upload (per face) */
-	for (face = 0; face < 6; face++) {
-		if (t->base.dirty_images[face]) {
-			r300UploadTexImages(rmesa,
-					    (r300TexObjPtr) tObj->DriverData,
-					    face);
-		}
-	}
-
-	if (!t->base.memBlock) {
-		/* texmem alloc failed, use s/w fallback */
-		return GL_FALSE;
-	}
-
-	return GL_TRUE;
-}
-
-static GLboolean r300EnableTextureRect(GLcontext * ctx, int unit)
+/**
+ * Validate the BOs of the color buffer, the depth buffer, every enabled texture unit and the current DMA buffer; returns radeon_revalidate_bos().
+ */
+GLboolean r300ValidateBuffers(GLcontext * ctx)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-
-	ASSERT(tObj->Target == GL_TEXTURE_RECTANGLE_NV);
+	struct radeon_renderbuffer *rrb;
+	int i;
 
-	if (t->base.dirty_images[0]) {
-		R300_FIREVERTICES(rmesa);
+	radeon_validate_reset_bos(&rmesa->radeon);
 
-		r300SetTexImages(rmesa, tObj);
-		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
-		if (!t->base.memBlock && !t->image_override &&
-		    !rmesa->prefer_gart_client_texturing)
-			return GL_FALSE;
+	rrb = radeon_get_colorbuffer(&rmesa->radeon);
+	/* color buffer */
+	if (rrb && rrb->bo) {
+		radeon_validate_bo(&rmesa->radeon, rrb->bo,
+				   0, RADEON_GEM_DOMAIN_VRAM);
 	}
 
-	return GL_TRUE;
-}
-
-static GLboolean r300UpdateTexture(GLcontext * ctx, int unit)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_ReallyEnabled ?
-		texUnit->_Current : NULL;
-	r300TexObjPtr t = tObj ? (r300TexObjPtr) tObj->DriverData : NULL;
-
-	/* Fallback if there's a texture border */
-	if (tObj && tObj->Image[0][tObj->BaseLevel]->Border > 0) {
-		tObj = NULL;
-		t = NULL;
+	/* depth buffer */
+	rrb = radeon_get_depthbuffer(&rmesa->radeon);
+	if (rrb && rrb->bo) {
+		radeon_validate_bo(&rmesa->radeon, rrb->bo,
+				   0, RADEON_GEM_DOMAIN_VRAM);
 	}
+	
+	for (i = 0; i < ctx->Const.MaxTextureImageUnits; ++i) {
+		radeonTexObj *t;
 
-	/* Update state if this is a different texture object to last
-	 * time.
-	 */
-	if (rmesa->state.texture.unit[unit].texobj != tObj) {
-		if (rmesa->state.texture.unit[unit].texobj != NULL) {
-			r300TexObjPtr t_old = (r300TexObjPtr) rmesa->state.texture.unit[unit].texobj->DriverData;
-
-			/* The old texture is no longer bound to this texture unit.
-			 * Mark it as such.
-			 */
-
-			t_old->base.bound &= ~(1 << unit);
-		}
-
-		_mesa_reference_texobj(&rmesa->state.texture.unit[unit].texobj, tObj);
+		if (!ctx->Texture.Unit[i]._ReallyEnabled)
+			continue;
 
-		if (t) {
-			t->base.bound |= (1 << unit);
-			driUpdateTextureLRU(&t->base);	/* XXX: should be locked! */
+		if (!r300_validate_texture(ctx, ctx->Texture.Unit[i]._Current)) {
+			_mesa_warning(ctx,
+				      "failed to validate texture for unit %d.\n",
+				      i);
 		}
+		t = radeon_tex_obj(ctx->Texture.Unit[i]._Current);
+		if (t->image_override && t->bo)
+			radeon_validate_bo(&rmesa->radeon, t->bo,
+					   RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
+		/* t->mt is not guaranteed to exist here (validation above may have failed), so check it before dereferencing. */
+		else if (t->mt && t->mt->bo)
+			radeon_validate_bo(&rmesa->radeon, t->mt->bo,
+					   RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
 	}
+	if (rmesa->radeon.dma.current)
+		radeon_validate_bo(&rmesa->radeon, rmesa->radeon.dma.current, RADEON_GEM_DOMAIN_GTT, 0);
 
-	return !t || !t->border_fallback;
+	return radeon_revalidate_bos(ctx);
 }
 
 void r300SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
@@ -598,78 +330,166 @@ void r300SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 	r300ContextPtr rmesa = pDRICtx->driverPrivate;
 	struct gl_texture_object *tObj =
 	    _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
-	r300TexObjPtr t;
+	radeonTexObjPtr t = radeon_tex_obj(tObj);
 	uint32_t pitch_val;
 
 	if (!tObj)
 		return;
 
-	t = (r300TexObjPtr) tObj->DriverData;
-
 	t->image_override = GL_TRUE;
 
 	if (!offset)
 		return;
 
-	t->offset = offset;
-	t->pitch_reg &= (1 << 13) -1;
+	t->bo = NULL;
+	t->override_offset = offset;
+	t->pp_txpitch &= (1 << 13) -1;
 	pitch_val = pitch;
 
 	switch (depth) {
 	case 32:
-		t->format = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
-		t->filter |= tx_table[2].filter;
+		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
+		t->pp_txfilter |= tx_table[2].filter;
 		pitch_val /= 4;
 		break;
 	case 24:
 	default:
-		t->format = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
-		t->filter |= tx_table[4].filter;
+		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
+		t->pp_txfilter |= tx_table[4].filter;
 		pitch_val /= 4;
 		break;
 	case 16:
-		t->format = R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
-		t->filter |= tx_table[5].filter;
+		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
+		t->pp_txfilter |= tx_table[5].filter;
 		pitch_val /= 2;
 		break;
 	}
 	pitch_val--;
 
-	t->pitch_reg |= pitch_val;
+	t->pp_txpitch |= pitch_val;
 }
 
-static GLboolean r300UpdateTextureUnit(GLcontext * ctx, int unit)
+void r300SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_format, __DRIdrawable *dPriv)
 {
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-
-	if (texUnit->_ReallyEnabled & (TEXTURE_RECT_BIT)) {
-		return (r300EnableTextureRect(ctx, unit) &&
-			r300UpdateTexture(ctx, unit));
-	} else if (texUnit->_ReallyEnabled & (TEXTURE_1D_BIT | TEXTURE_2D_BIT)) {
-		return (r300EnableTexture2D(ctx, unit) &&
-			r300UpdateTexture(ctx, unit));
-	} else if (texUnit->_ReallyEnabled & (TEXTURE_3D_BIT)) {
-		return (r300EnableTexture3D(ctx, unit) &&
-			r300UpdateTexture(ctx, unit));
-	} else if (texUnit->_ReallyEnabled & (TEXTURE_CUBE_BIT)) {
-		return (r300EnableTextureCube(ctx, unit) &&
-			r300UpdateTexture(ctx, unit));
-	} else if (texUnit->_ReallyEnabled) {
-		return GL_FALSE;
-	} else {
-		return r300UpdateTexture(ctx, unit);
+	struct gl_texture_unit *texUnit;
+	struct gl_texture_object *texObj;
+	struct gl_texture_image *texImage;
+	struct radeon_renderbuffer *rb;
+	radeon_texture_image *rImage;
+	radeonContextPtr radeon;
+	r300ContextPtr rmesa;
+	struct radeon_framebuffer *rfb;
+	radeonTexObjPtr t;
+	uint32_t pitch_val;
+	uint32_t internalFormat, type, format;
+	/* Point the texture selected for 'target' on the current unit at the buffer object of the drawable's color_rb[0]. */
+	type = GL_BGRA;	/* NOTE(review): 'type' holds a GL format and 'format' a GL type -- names look swapped, confirm */
+	format = GL_UNSIGNED_BYTE;
+	internalFormat = (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT ? 3 : 4);
+
+	radeon = pDRICtx->driverPrivate;
+	rmesa = pDRICtx->driverPrivate;
+
+	rfb = dPriv->driverPrivate;
+        texUnit = &radeon->glCtx->Texture.Unit[radeon->glCtx->Texture.CurrentUnit];
+	texObj = _mesa_select_tex_object(radeon->glCtx, texUnit, target);
+        texImage = _mesa_get_tex_image(radeon->glCtx, texObj, target, 0);
+
+	rImage = get_radeon_texture_image(texImage);
+	t = radeon_tex_obj(texObj);
+        if (t == NULL) {
+    	    return;
+    	}
+
+	radeon_update_renderbuffers(pDRICtx, dPriv);
+	/* The back and depth buffers are not needed here; release their BOs right away. */
+	rb = (void*)rfb->base.Attachment[BUFFER_DEPTH].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+        rb->bo = NULL;
+	}
+	rb = (void*)rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+		rb->bo = NULL;
 	}
+	rb = rfb->color_rb[0];
+	if (rb->bo == NULL) {
+		/* No BO is attached to color_rb[0]; nothing to bind. */
+		return;
+	}
+	
+	_mesa_lock_texture(radeon->glCtx, texObj);
+	if (t->bo) {
+		radeon_bo_unref(t->bo);
+		t->bo = NULL;
+	}
+	if (rImage->bo) {
+		radeon_bo_unref(rImage->bo);
+		rImage->bo = NULL;
+	}
+	if (t->mt) {
+		radeon_miptree_unreference(t->mt);
+		t->mt = NULL;
+	}
+	if (rImage->mt) {
+		radeon_miptree_unreference(rImage->mt);
+		rImage->mt = NULL;
+	}
+	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
+				   rb->width, rb->height, 1, 0, rb->cpp);
+	texImage->RowStride = rb->pitch / rb->cpp;
+	texImage->TexFormat = radeonChooseTextureFormat(radeon->glCtx,
+							internalFormat,
+							type, format, 0);
+	rImage->bo = rb->bo;
+	radeon_bo_ref(rImage->bo);
+	t->bo = rb->bo;
+	radeon_bo_ref(t->bo);
+	t->tile_bits = 0;
+	t->image_override = GL_TRUE;
+	t->override_offset = 0;
+	t->pp_txpitch &= (1 << 13) -1;
+	pitch_val = rb->pitch;
+	switch (rb->cpp) {
+	case 4:
+		if (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT)
+			t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
+		else
+			t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
+		t->pp_txfilter |= tx_table[2].filter;
+		pitch_val /= 4;
+		break;
+	case 3:
+	default:
+		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
+		t->pp_txfilter |= tx_table[4].filter;
+		pitch_val /= 4;
+		break;
+	case 2:
+		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
+		t->pp_txfilter |= tx_table[5].filter;
+		pitch_val /= 2;
+		break;
+	}
+	pitch_val--;
+	t->pp_txsize = ((rb->width - 1) << R300_TX_WIDTHMASK_SHIFT) |
+              ((rb->height - 1) << R300_TX_HEIGHTMASK_SHIFT);
+	t->pp_txsize |= R300_TX_SIZE_TXPITCH_EN;
+	t->pp_txpitch |= pitch_val;
+
+	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+	    if (rb->width > 2048)
+		t->pp_txpitch |= R500_TXWIDTH_BIT11;
+	    if (rb->height > 2048)
+		t->pp_txpitch |= R500_TXHEIGHT_BIT11;
+	}
+	t->validated = GL_TRUE;
+	_mesa_unlock_texture(radeon->glCtx, texObj);
+	return;
 }
 
-void r300UpdateTextureState(GLcontext * ctx)
+void r300SetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
 {
-	int i;
-
-	for (i = 0; i < 8; i++) {
-		if (!r300UpdateTextureUnit(ctx, i)) {
-			_mesa_warning(ctx,
-				      "failed to update texture state for unit %d.\n",
-				      i);
-		}
-	}
+        r300SetTexBuffer2(pDRICtx, target, GLX_TEXTURE_FORMAT_RGBA_EXT, dPriv); /* same as r300SetTexBuffer2() with the format fixed to RGBA */
 }
diff --git a/src/mesa/drivers/dri/r300/r300_vertprog.c b/src/mesa/drivers/dri/r300/r300_vertprog.c
index 146daa367c..c41a8fdd62 100644
--- a/src/mesa/drivers/dri/r300/r300_vertprog.c
+++ b/src/mesa/drivers/dri/r300/r300_vertprog.c
@@ -34,10 +34,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "shader/program.h"
 #include "shader/prog_instruction.h"
 #include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
 #include "shader/prog_statevars.h"
 #include "tnl/tnl.h"
 
 #include "r300_context.h"
+#include "r300_state.h"
 
 /* TODO: Get rid of t_src_class call */
 #define CMP_SRCS(a, b) ((a.RelAddr != b.RelAddr) || (a.Index != b.Index && \
@@ -64,7 +66,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 		int u_temp_used = (VSF_MAX_FRAGMENT_TEMPS - 1) - u_temp_i; \
 		if((vp->num_temporaries + u_temp_used) > VSF_MAX_FRAGMENT_TEMPS) { \
 			WARN_ONCE("Ran out of temps, num temps %d, us %d\n", vp->num_temporaries, u_temp_used); \
-			vp->native = GL_FALSE; \
+			vp->error = GL_TRUE; \
 		} \
 		u_temp_i=VSF_MAX_FRAGMENT_TEMPS-1; \
 	} while (0)
@@ -214,21 +216,8 @@ static void vp_dump_inputs(struct r300_vertex_program *vp, char *caller)
 static unsigned long t_src_index(struct r300_vertex_program *vp,
 				 struct prog_src_register *src)
 {
-	int i;
-	int max_reg = -1;
-
 	if (src->File == PROGRAM_INPUT) {
-		if (vp->inputs[src->Index] != -1)
-			return vp->inputs[src->Index];
-
-		for (i = 0; i < VERT_ATTRIB_MAX; i++)
-			if (vp->inputs[i] > max_reg)
-				max_reg = vp->inputs[i];
-
-		vp->inputs[src->Index] = max_reg + 1;
-
-		//vp_dump_inputs(vp, __FUNCTION__);
-
+		assert(vp->inputs[src->Index] != -1);
 		return vp->inputs[src->Index];
 	} else {
 		if (src->Index < 0) {
@@ -943,11 +932,17 @@ static GLuint *r300TranslateOpcodeXPD(struct r300_vertex_program *vp,
 static void t_inputs_outputs(struct r300_vertex_program *vp)
 {
 	int i;
-	int cur_reg = 0;
+	int cur_reg;
 
-	for (i = 0; i < VERT_ATTRIB_MAX; i++)
-		vp->inputs[i] = -1;
+	cur_reg = -1;
+	for (i = 0; i < VERT_ATTRIB_MAX; i++) {
+		if (vp->key.InputsRead & (1 << i))
+			vp->inputs[i] = ++cur_reg;
+		else
+			vp->inputs[i] = -1;
+	}
 
+	cur_reg = 0;
 	for (i = 0; i < VERT_RESULT_MAX; i++)
 		vp->outputs[i] = -1;
 
@@ -961,26 +956,36 @@ static void t_inputs_outputs(struct r300_vertex_program *vp)
 		vp->outputs[VERT_RESULT_PSIZ] = cur_reg++;
 	}
 
+	/* If we're writing back facing colors we need to send
+	 * four colors to make front/back face colors selection work.
+	 * If the vertex program doesn't write all 4 colors, lets
+	 * pretend it does by skipping output index reg so the colors
+	 * get written into appropriate output vectors.
+	 */
 	if (vp->key.OutputsWritten & (1 << VERT_RESULT_COL0)) {
 		vp->outputs[VERT_RESULT_COL0] = cur_reg++;
+	} else if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC0) ||
+		vp->key.OutputsWritten & (1 << VERT_RESULT_BFC1)) {
+		cur_reg++;
 	}
 
 	if (vp->key.OutputsWritten & (1 << VERT_RESULT_COL1)) {
-		vp->outputs[VERT_RESULT_COL1] =
-		    vp->outputs[VERT_RESULT_COL0] + 1;
-		cur_reg = vp->outputs[VERT_RESULT_COL1] + 1;
+		vp->outputs[VERT_RESULT_COL1] = cur_reg++;
+	} else if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC0) ||
+		vp->key.OutputsWritten & (1 << VERT_RESULT_BFC1)) {
+		cur_reg++;
 	}
 
 	if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC0)) {
-		vp->outputs[VERT_RESULT_BFC0] =
-		    vp->outputs[VERT_RESULT_COL0] + 2;
-		cur_reg = vp->outputs[VERT_RESULT_BFC0] + 2;
+		vp->outputs[VERT_RESULT_BFC0] = cur_reg++;
+	} else if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC1)) {
+		cur_reg++;
 	}
 
 	if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC1)) {
-		vp->outputs[VERT_RESULT_BFC1] =
-		    vp->outputs[VERT_RESULT_COL0] + 3;
-		cur_reg = vp->outputs[VERT_RESULT_BFC1] + 1;
+		vp->outputs[VERT_RESULT_BFC1] = cur_reg++;
+	} else if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC0)) {
+		cur_reg++;
 	}
 
 	for (i = VERT_RESULT_TEX0; i <= VERT_RESULT_TEX7; i++) {
@@ -1007,14 +1012,13 @@ static void r300TranslateVertexShader(struct r300_vertex_program *vp,
 	struct prog_src_register src[3];
 
 	vp->pos_end = 0;	/* Not supported yet */
-	vp->program.length = 0;
-	/*vp->num_temporaries=mesa_vp->Base.NumTemporaries; */
+	vp->hw_code.length = 0;
 	vp->translated = GL_TRUE;
-	vp->native = GL_TRUE;
+	vp->error = GL_FALSE;
 
 	t_inputs_outputs(vp);
 
-	for (inst = vp->program.body.i; vpi->Opcode != OPCODE_END;
+	for (inst = vp->hw_code.body.d; vpi->Opcode != OPCODE_END;
 	     vpi++, inst += 4) {
 
 		FREE_TEMPS();
@@ -1176,38 +1180,15 @@ static void r300TranslateVertexShader(struct r300_vertex_program *vp,
 						      &u_temp_i);
 			break;
 		default:
-			assert(0);
+			vp->error = GL_TRUE;
 			break;
 		}
 	}
 
-	/* Some outputs may be artificially added, to match the inputs
-	   of the fragment program. Blank the outputs here. */
-	for (i = 0; i < VERT_RESULT_MAX; i++) {
-		if (vp->key.OutputsAdded & (1 << i)) {
-			inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
-						     GL_FALSE,
-						     GL_FALSE,
-						     vp->outputs[i],
-						     VSF_FLAG_ALL,
-						     PVS_DST_REG_OUT);
-			inst[1] = __CONST(0, SWIZZLE_ZERO);
-			inst[2] = __CONST(0, SWIZZLE_ZERO);
-			inst[3] = __CONST(0, SWIZZLE_ZERO);
-			inst += 4;
-		}
+	vp->hw_code.length = (inst - vp->hw_code.body.d);
+	if (vp->hw_code.length >= VSF_MAX_FRAGMENT_LENGTH) {
+		vp->error = GL_TRUE;
 	}
-
-	vp->program.length = (inst - vp->program.body.i);
-	if (vp->program.length >= VSF_MAX_FRAGMENT_LENGTH) {
-		vp->program.length = 0;
-		vp->native = GL_FALSE;
-	}
-#if 0
-	fprintf(stderr, "hw program:\n");
-	for (i = 0; i < vp->program.length; i++)
-		fprintf(stderr, "%08x\n", vp->program.body.d[i]);
-#endif
 }
 
 /* DP4 version seems to trigger some hw peculiarity */
@@ -1386,6 +1367,49 @@ static struct r300_vertex_program *build_program(struct r300_vertex_program_key
 		pos_as_texcoord(vp, &mesa_vp->Base);
 	}
 
+	if (RADEON_DEBUG & DEBUG_VERTS) {
+		fprintf(stderr, "Vertex program after native rewrite:\n");
+		_mesa_print_program(&mesa_vp->Base);
+		fflush(stdout);
+	}
+
+	/* Some outputs may be artificially added, to match the inputs of the fragment program.
+	 * Issue 16 of vertex program spec says that all vertex attributes that are unwritten by
+	 * vertex program are undefined, so just use MOV [vertex_result], CONST[0]
+	 */
+	{
+		int i, count = 0;
+		for (i = 0; i < VERT_RESULT_MAX; ++i) {
+			if (vp->key.OutputsAdded & (1 << i)) {
+				++count;
+			}
+		}
+
+		if (count > 0) {
+			struct prog_instruction *inst;
+
+			_mesa_insert_instructions(&mesa_vp->Base, mesa_vp->Base.NumInstructions - 1, count);
+			inst = &mesa_vp->Base.Instructions[mesa_vp->Base.NumInstructions - 1 - count];
+
+			for (i = 0; i < VERT_RESULT_MAX; ++i) {
+				if (vp->key.OutputsAdded & (1 << i)) {
+					inst->Opcode = OPCODE_MOV;
+
+					inst->DstReg.File = PROGRAM_OUTPUT;
+					inst->DstReg.Index = i;
+					inst->DstReg.WriteMask = WRITEMASK_XYZW;
+					inst->DstReg.CondMask = COND_TR;
+
+					inst->SrcReg[0].File = PROGRAM_CONSTANT;
+					inst->SrcReg[0].Index = 0;
+					inst->SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+					++inst;
+				}
+			}
+		}
+	}
+
 	assert(mesa_vp->Base.NumInstructions);
 	vp->num_temporaries = mesa_vp->Base.NumTemporaries;
 	r300TranslateVertexShader(vp, mesa_vp->Base.Instructions);
@@ -1432,7 +1456,12 @@ void r300SelectVertexShader(r300ContextPtr r300)
 		wpos_idx = i;
 	}
 
-	add_outputs(&wanted_key, VERT_RESULT_HPOS);
+	if (vpc->mesa_program.IsPositionInvariant) {
+		wanted_key.InputsRead |= (1 << VERT_ATTRIB_POS);
+		wanted_key.OutputsWritten |= (1 << VERT_RESULT_HPOS);
+	} else {
+		add_outputs(&wanted_key, VERT_RESULT_HPOS);
+	}
 
 	if (InputsRead & FRAG_BIT_COL0) {
 		add_outputs(&wanted_key, VERT_RESULT_COL0);
@@ -1442,27 +1471,103 @@ void r300SelectVertexShader(r300ContextPtr r300)
 		add_outputs(&wanted_key, VERT_RESULT_COL1);
 	}
 
+	if (InputsRead & FRAG_BIT_FOGC) {
+		add_outputs(&wanted_key, VERT_RESULT_FOGC);
+	}
+
 	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
 		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
 			add_outputs(&wanted_key, VERT_RESULT_TEX0 + i);
 		}
 	}
 
-	if (vpc->mesa_program.IsPositionInvariant) {
-		/* we wan't position don't we ? */
-		wanted_key.InputsRead |= (1 << VERT_ATTRIB_POS);
-	}
-
 	for (vp = vpc->progs; vp; vp = vp->next)
 		if (_mesa_memcmp(&vp->key, &wanted_key, sizeof(wanted_key))
 		    == 0) {
 			r300->selected_vp = vp;
 			return;
 		}
-	//_mesa_print_program(&vpc->mesa_program.Base);
+
+	if (RADEON_DEBUG & DEBUG_VERTS) {
+		fprintf(stderr, "Initial vertex program:\n");
+		_mesa_print_program(&vpc->mesa_program.Base);
+		fflush(stdout);
+	}
 
 	vp = build_program(&wanted_key, &vpc->mesa_program, wpos_idx);
 	vp->next = vpc->progs;
 	vpc->progs = vp;
 	r300->selected_vp = vp;
 }
+
+#define bump_vpu_count(ptr, new_count)   do { /* grow-only: vpu.count = max(vpu.count, new_count / 4) */ \
+		drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr)); /* ptr is treated as a command header */ \
+		int _nc=(new_count)/4; /* new_count is divided by 4 before being stored */ \
+		assert(_nc < 256); /* upper bound enforced on the stored count */ \
+		if(_nc>_p->vpu.count)_p->vpu.count=_nc; /* never lowers an existing count */ \
+	} while(0)
+/* Copy code->body.d[] into the vpi, vpp or vps command array selected by bits 8-11 of dest, starting 4 * (dest & 0xff) entries in. */
+static void r300EmitVertexProgram(r300ContextPtr r300, int dest, struct r300_vertex_shader_hw_code *code)
+{
+	int i;
+
+	assert((code->length > 0) && (code->length % 4 == 0));	/* must be non-empty and a multiple of 4 entries */
+
+	switch ((dest >> 8) & 0xf) {
+		case 0:	/* hw.vpi, written from R300_VPI_INSTR_0 */
+			R300_STATECHANGE(r300, vpi);
+			for (i = 0; i < code->length; i++)
+				r300->hw.vpi.cmd[R300_VPI_INSTR_0 + i + 4 * (dest & 0xff)] = (code->body.d[i]);
+			bump_vpu_count(r300->hw.vpi.cmd, code->length + 4 * (dest & 0xff));
+			break;
+		case 2:	/* hw.vpp, written from R300_VPP_PARAM_0 */
+			R300_STATECHANGE(r300, vpp);
+			for (i = 0; i < code->length; i++)
+				r300->hw.vpp.cmd[R300_VPP_PARAM_0 + i + 4 * (dest & 0xff)] = (code->body.d[i]);
+			bump_vpu_count(r300->hw.vpp.cmd, code->length + 4 * (dest & 0xff));
+			break;
+		case 4:	/* hw.vps, written from index 1 */
+			R300_STATECHANGE(r300, vps);
+			for (i = 0; i < code->length; i++)
+				r300->hw.vps.cmd[1 + i + 4 * (dest & 0xff)] = (code->body.d[i]);
+			bump_vpu_count(r300->hw.vps.cmd, code->length + 4 * (dest & 0xff));
+			break;
+		default:	/* unknown destination: report it and exit the process */
+			fprintf(stderr, "%s:%s don't know how to handle dest %04x\n", __FILE__, __FUNCTION__, dest);
+			_mesa_exit(-1);
+	}
+}
+/* Write the selected vertex program (rmesa->selected_vp) and its parameters into the cached vpp/vpi/pvs command state. */
+void r300SetupVertexProgram(r300ContextPtr rmesa)
+{
+	GLcontext *ctx = rmesa->radeon.glCtx;
+	struct r300_vertex_program *prog = rmesa->selected_vp;
+	int inst_count = 0;
+	int param_count = 0;
+	
+	/* Reset state, in case we don't use something */
+	((drm_r300_cmd_header_t *) rmesa->hw.vpp.cmd)->vpu.count = 0;
+	((drm_r300_cmd_header_t *) rmesa->hw.vpi.cmd)->vpu.count = 0;
+	((drm_r300_cmd_header_t *) rmesa->hw.vps.cmd)->vpu.count = 0;
+	
+	R300_STATECHANGE(rmesa, vpp);
+	param_count = r300VertexProgUpdateParams(ctx,
+								(struct r300_vertex_program_cont *)
+								ctx->VertexProgram._Current,
+								(float *)&rmesa->hw.vpp.cmd[R300_VPP_PARAM_0]);
+	bump_vpu_count(rmesa->hw.vpp.cmd, param_count);
+	param_count /= 4;	/* same division by 4 that bump_vpu_count applied above */
+
+	r300EmitVertexProgram(rmesa, R300_PVS_CODE_START, &(prog->hw_code));
+	inst_count = (prog->hw_code.length / 4) - 1;	/* zero-based index of the last 4-entry instruction */
+
+	r300VapCntl(rmesa, _mesa_bitcount(prog->key.InputsRead),
+				 _mesa_bitcount(prog->key.OutputsWritten), prog->num_temporaries);
+
+	R300_STATECHANGE(rmesa, pvs);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] = (0 << R300_PVS_FIRST_INST_SHIFT) | (inst_count << R300_PVS_XYZW_VALID_INST_SHIFT) |
+				(inst_count << R300_PVS_LAST_INST_SHIFT);
+
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] = (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) | (param_count << R300_PVS_MAX_CONST_ADDR_SHIFT);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] = (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
+}
diff --git a/src/mesa/drivers/dri/r300/r300_vertprog.h b/src/mesa/drivers/dri/r300/r300_vertprog.h
index 2f35f02bc8..b552e3fb1b 100644
--- a/src/mesa/drivers/dri/r300/r300_vertprog.h
+++ b/src/mesa/drivers/dri/r300/r300_vertprog.h
@@ -32,4 +32,6 @@
 
 #endif
 
+void r300SetupVertexProgram(r300ContextPtr rmesa);
+
 #endif
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c
index 292573de89..4d58cf2162 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r500_fragprog.c
@@ -27,10 +27,6 @@
 
 #include "r500_fragprog.h"
 
-#include "radeon_nqssadce.h"
-#include "radeon_program_alu.h"
-
-
 static void reset_srcreg(struct prog_src_register* reg)
 {
 	_mesa_bzero(reg, sizeof(*reg));
@@ -58,12 +54,12 @@ static struct prog_src_register shadow_ambient(struct gl_program *program, int t
  *  - introduce a temporary register when write masks are needed
  *
  */
-static GLboolean transform_TEX(
+GLboolean r500_transform_TEX(
 	struct radeon_transform_context *t,
 	struct prog_instruction* orig_inst, void* data)
 {
-	struct r500_fragment_program_compiler *compiler =
-		(struct r500_fragment_program_compiler*)data;
+	struct r300_fragment_program_compiler *compiler =
+		(struct r300_fragment_program_compiler*)data;
 	struct prog_instruction inst = *orig_inst;
 	struct prog_instruction* tgt;
 	GLboolean destredirect = GL_FALSE;
@@ -188,121 +184,7 @@ static GLboolean transform_TEX(
 	return GL_TRUE;
 }
 
-
-static void update_params(r300ContextPtr r300, struct r500_fragment_program *fp)
-{
-	struct gl_fragment_program *mp = &fp->mesa_program;
-
-	/* Ask Mesa nicely to fill in ParameterValues for us */
-	if (mp->Base.Parameters)
-		_mesa_load_state_parameters(r300->radeon.glCtx, mp->Base.Parameters);
-}
-
-
-/**
- * Transform the program to support fragment.position.
- *
- * Introduce a small fragment at the start of the program that will be
- * the only code that directly reads the FRAG_ATTRIB_WPOS input.
- * All other code pieces that reference that input will be rewritten
- * to read from a newly allocated temporary.
- *
- * \todo if/when r5xx supports the radeon_program architecture, this is a
- * likely candidate for code sharing.
- */
-static void insert_WPOS_trailer(struct r500_fragment_program_compiler *compiler)
-{
-	GLuint InputsRead = compiler->fp->mesa_program.Base.InputsRead;
-
-	if (!(InputsRead & FRAG_BIT_WPOS))
-		return;
-
-	static gl_state_index tokens[STATE_LENGTH] = {
-		STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
-	};
-	struct prog_instruction *fpi;
-	GLuint window_index;
-	int i = 0;
-	GLuint tempregi = _mesa_find_free_register(compiler->program, PROGRAM_TEMPORARY);
-
-	_mesa_insert_instructions(compiler->program, 0, 3);
-	fpi = compiler->program->Instructions;
-
-	/* perspective divide */
-	fpi[i].Opcode = OPCODE_RCP;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_W;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	fpi[i].Opcode = OPCODE_MUL;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-
-	fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[1].Index = tempregi;
-	fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	/* viewport transformation */
-	window_index = _mesa_add_state_reference(compiler->program->Parameters, tokens);
-
-	fpi[i].Opcode = OPCODE_MAD;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[0].Index = tempregi;
-	fpi[i].SrcReg[0].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[1].Index = window_index;
-	fpi[i].SrcReg[1].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[2].Index = window_index;
-	fpi[i].SrcReg[2].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-	i++;
-
-	for (; i < compiler->program->NumInstructions; ++i) {
-		int reg;
-		for (reg = 0; reg < 3; reg++) {
-			if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT &&
-			    fpi[i].SrcReg[reg].Index == FRAG_ATTRIB_WPOS) {
-				fpi[i].SrcReg[reg].File = PROGRAM_TEMPORARY;
-				fpi[i].SrcReg[reg].Index = tempregi;
-			}
-		}
-	}
-}
-
-
-static void nqssadce_init(struct nqssadce_state* s)
-{
-	s->Outputs[FRAG_RESULT_COLOR].Sourced = WRITEMASK_XYZW;
-	s->Outputs[FRAG_RESULT_DEPTH].Sourced = WRITEMASK_W;
-}
-
-static GLboolean is_native_swizzle(GLuint opcode, struct prog_src_register reg)
+GLboolean r500FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg)
 {
 	GLuint relevant;
 	int i;
@@ -314,22 +196,20 @@ static GLboolean is_native_swizzle(GLuint opcode, struct prog_src_register reg)
 		if (reg.Abs)
 			return GL_FALSE;
 
+		if (opcode == OPCODE_KIL && (reg.Swizzle != SWIZZLE_NOOP || reg.Negate != NEGATE_NONE))
+			return GL_FALSE;
+
 		if (reg.Negate)
 			reg.Negate ^= NEGATE_XYZW;
 
-		if (opcode == OPCODE_KIL) {
-			if (reg.Swizzle != SWIZZLE_NOOP)
-				return GL_FALSE;
-		} else {
-			for(i = 0; i < 4; ++i) {
-				GLuint swz = GET_SWZ(reg.Swizzle, i);
-				if (swz == SWIZZLE_NIL) {
-					reg.Negate &= ~(1 << i);
-					continue;
-				}
-				if (swz >= 4)
-					return GL_FALSE;
+		for(i = 0; i < 4; ++i) {
+			GLuint swz = GET_SWZ(reg.Swizzle, i);
+			if (swz == SWIZZLE_NIL) {
+				reg.Negate &= ~(1 << i);
+				continue;
 			}
+			if (swz >= 4)
+				return GL_FALSE;
 		}
 
 		if (reg.Negate)
@@ -367,8 +247,7 @@ static GLboolean is_native_swizzle(GLuint opcode, struct prog_src_register reg)
  * The only thing we *cannot* do in an ALU instruction is per-component
  * negation. Therefore, we split the MOV into two instructions when necessary.
  */
-static void nqssadce_build_swizzle(struct nqssadce_state *s,
-	struct prog_dst_register dst, struct prog_src_register src)
+void r500FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst, struct prog_src_register src)
 {
 	struct prog_instruction *inst;
 	GLuint negatebase[2] = { 0, 0 };
@@ -392,129 +271,12 @@ static void nqssadce_build_swizzle(struct nqssadce_state *s,
 		inst->DstReg = dst;
 		inst->DstReg.WriteMask = negatebase[i];
 		inst->SrcReg[0] = src;
+		inst->SrcReg[0].Negate = (i == 0) ? NEGATE_NONE : NEGATE_XYZW;
 		inst++;
 		s->IP++;
 	}
 }
 
-static GLuint build_dtm(GLuint depthmode)
-{
-	switch(depthmode) {
-	default:
-	case GL_LUMINANCE: return 0;
-	case GL_INTENSITY: return 1;
-	case GL_ALPHA: return 2;
-	}
-}
-
-static GLuint build_func(GLuint comparefunc)
-{
-	return comparefunc - GL_NEVER;
-}
-
-
-/**
- * Collect all external state that is relevant for compiling the given
- * fragment program.
- */
-static void build_state(
-	r300ContextPtr r300,
-	struct r500_fragment_program *fp,
-	struct r500_fragment_program_external_state *state)
-{
-	int unit;
-
-	_mesa_bzero(state, sizeof(*state));
-
-	for(unit = 0; unit < 16; ++unit) {
-		if (fp->mesa_program.Base.ShadowSamplers & (1 << unit)) {
-			struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
-
-			state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
-			state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
-		}
-	}
-}
-
-static void dump_program(struct r500_fragment_program_code *code);
-
-void r500TranslateFragmentShader(r300ContextPtr r300,
-				 struct r500_fragment_program *fp)
-{
-	struct r500_fragment_program_external_state state;
-
-	build_state(r300, fp, &state);
-	if (_mesa_memcmp(&fp->state, &state, sizeof(state))) {
-		/* TODO: cache compiled programs */
-		fp->translated = GL_FALSE;
-		_mesa_memcpy(&fp->state, &state, sizeof(state));
-	}
-
-	if (!fp->translated) {
-		struct r500_fragment_program_compiler compiler;
-
-		compiler.r300 = r300;
-		compiler.fp = fp;
-		compiler.code = &fp->code;
-		compiler.program = _mesa_clone_program(r300->radeon.glCtx, &fp->mesa_program.Base);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Compiler: Initial program:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		insert_WPOS_trailer(&compiler);
-
-		struct radeon_program_transformation transformations[] = {
-			{ &transform_TEX, &compiler },
-			{ &radeonTransformALU, 0 },
-			{ &radeonTransformDeriv, 0 },
-			{ &radeonTransformTrigScale, 0 }
-		};
-		radeonLocalTransform(r300->radeon.glCtx, compiler.program,
-			4, transformations);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Compiler: after native rewrite:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		struct radeon_nqssadce_descr nqssadce = {
-			.Init = &nqssadce_init,
-			.IsNativeSwizzle = &is_native_swizzle,
-			.BuildSwizzle = &nqssadce_build_swizzle,
-			.RewriteDepthOut = GL_TRUE
-		};
-		radeonNqssaDce(r300->radeon.glCtx, compiler.program, &nqssadce);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Compiler: after NqSSA-DCE:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		fp->translated = r500FragmentProgramEmit(&compiler);
-
-		/* Subtle: Rescue any parameters that have been added during transformations */
-		_mesa_free_parameter_list(fp->mesa_program.Base.Parameters);
-		fp->mesa_program.Base.Parameters = compiler.program->Parameters;
-		compiler.program->Parameters = 0;
-
-		_mesa_reference_program(r300->radeon.glCtx, &compiler.program, 0);
-
-		r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			if (fp->translated) {
-				_mesa_printf("Machine-readable code:\n");
-				dump_program(&fp->code);
-			}
-		}
-
-	}
-
-	update_params(r300, fp);
-
-}
 
 static char *toswiz(int swiz_val) {
   switch(swiz_val) {
@@ -613,9 +375,9 @@ static char *to_texop(int val)
   return NULL;
 }
 
-static void dump_program(struct r500_fragment_program_code *code)
+void r500FragmentProgramDump(union rX00_fragment_program_code *c)
 {
-
+  struct r500_fragment_program_code *code = &c->r500;
   fprintf(stderr, "R500 Fragment Program:\n--------\n");
 
   int n;
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.h b/src/mesa/drivers/dri/r300/r500_fragprog.h
index 1e45538f80..1179bf6607 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog.h
+++ b/src/mesa/drivers/dri/r300/r500_fragprog.h
@@ -33,30 +33,20 @@
 #ifndef __R500_FRAGPROG_H_
 #define __R500_FRAGPROG_H_
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
 #include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/program.h"
 #include "shader/prog_instruction.h"
 
 #include "r300_context.h"
-#include "r300_state.h"
-#include "radeon_program.h"
+#include "radeon_nqssadce.h"
 
-struct r500_fragment_program;
+extern GLboolean r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler);
 
-extern void r500TranslateFragmentShader(r300ContextPtr r300,
-					struct r500_fragment_program *fp);
+extern void r500FragmentProgramDump(union rX00_fragment_program_code *c);
 
-struct r500_fragment_program_compiler {
-	r300ContextPtr r300;
-	struct r500_fragment_program *fp;
-	struct r500_fragment_program_code *code;
-	struct gl_program *program;
-};
+extern GLboolean r500FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg);
 
-extern GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler);
+extern void r500FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst, struct prog_src_register src);
+
+extern GLboolean r500_transform_TEX(struct radeon_transform_context *t, struct prog_instruction* orig_inst, void* data);
 
 #endif
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c
index 4631235f0d..30f4514897 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c
@@ -49,8 +49,8 @@
 
 
 #define PROG_CODE \
-	struct r500_fragment_program_compiler *c = (struct r500_fragment_program_compiler*)data; \
-	struct r500_fragment_program_code *code = c->code
+	struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)data; \
+	struct r500_fragment_program_code *code = &c->code->r500
 
 #define error(fmt, args...) do {			\
 		fprintf(stderr, "%s::%s(): " fmt "\n",	\
@@ -72,7 +72,7 @@ static GLboolean emit_const(void *data, GLuint file, GLuint idx, GLuint *hwindex
 	}
 
 	if (*hwindex >= code->const_nr) {
-		if (*hwindex >= PFS_NUM_CONST_REGS) {
+		if (*hwindex >= R500_PFS_NUM_CONST_REGS) {
 			error("Out of hw constants!\n");
 			return GL_FALSE;
 		}
@@ -299,9 +299,9 @@ static const struct radeon_pair_handler pair_handler = {
 	.MaxHwTemps = 128
 };
 
-GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler)
+GLboolean r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler)
 {
-	struct r500_fragment_program_code *code = compiler->code;
+	struct r500_fragment_program_code *code = &compiler->code->r500;
 
 	_mesa_bzero(code, sizeof(*code));
 	code->max_temp_idx = 1;
diff --git a/src/mesa/drivers/dri/r300/radeon_context.c b/src/mesa/drivers/dri/r300/radeon_context.c
deleted file mode 100644
index 5267fe9a77..0000000000
--- a/src/mesa/drivers/dri/r300/radeon_context.c
+++ /dev/null
@@ -1,330 +0,0 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/**
- * \file radeon_context.c
- * Common context initialization.
- *
- * \author Keith Whitwell <keith@tungstengraphics.com>
- */
-
-#include <dlfcn.h>
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/context.h"
-#include "main/state.h"
-#include "main/matrix.h"
-#include "main/framebuffer.h"
-
-#include "drivers/common/driverfuncs.h"
-#include "swrast/swrast.h"
-
-#include "radeon_screen.h"
-#include "radeon_ioctl.h"
-#include "radeon_macros.h"
-#include "radeon_reg.h"
-
-#include "radeon_state.h"
-#include "r300_state.h"
-
-#include "utils.h"
-#include "vblank.h"
-#include "xmlpool.h"		/* for symbolic values of enum-type options */
-
-#define DRIVER_DATE "20060815"
-
-
-/* Return various strings for glGetString().
- */
-static const GLubyte *radeonGetString(GLcontext * ctx, GLenum name)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-	static char buffer[128];
-
-	switch (name) {
-	case GL_VENDOR:
-		if (IS_R300_CLASS(radeon->radeonScreen))
-			return (GLubyte *) "DRI R300 Project";
-		else
-			return (GLubyte *) "Tungsten Graphics, Inc.";
-
-	case GL_RENDERER:
-	{
-		unsigned offset;
-		GLuint agp_mode = (radeon->radeonScreen->card_type==RADEON_CARD_PCI) ? 0 :
-			radeon->radeonScreen->AGPMode;
-		const char* chipname;
-
-		if (IS_R300_CLASS(radeon->radeonScreen))
-			chipname = "R300";
-		else
-			chipname = "R200";
-
-		offset = driGetRendererString(buffer, chipname, DRIVER_DATE,
-					      agp_mode);
-
-		if (IS_R300_CLASS(radeon->radeonScreen)) {
-		sprintf(&buffer[offset], " %sTCL",
-			(radeon->radeonScreen->chip_flags & RADEON_CHIPSET_TCL)
-			? "" : "NO-");
-		} else {
-			sprintf(&buffer[offset], " %sTCL",
-			!(radeon->TclFallback & RADEON_TCL_FALLBACK_TCL_DISABLE)
-			? "" : "NO-");
-		}
-
-		return (GLubyte *) buffer;
-	}
-
-	default:
-		return NULL;
-	}
-}
-
-/* Initialize the driver's misc functions.
- */
-static void radeonInitDriverFuncs(struct dd_function_table *functions)
-{
-	functions->GetString = radeonGetString;
-}
-
-
-/**
- * Create and initialize all common fields of the context,
- * including the Mesa context itself.
- */
-GLboolean radeonInitContext(radeonContextPtr radeon,
-			    struct dd_function_table* functions,
-			    const __GLcontextModes * glVisual,
-			    __DRIcontextPrivate * driContextPriv,
-			    void *sharedContextPrivate)
-{
-	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
-	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
-	GLcontext* ctx;
-	GLcontext* shareCtx;
-	int fthrottle_mode;
-
-	/* Fill in additional standard functions. */
-	radeonInitDriverFuncs(functions);
-
-	radeon->radeonScreen = screen;
-	/* Allocate and initialize the Mesa context */
-	if (sharedContextPrivate)
-		shareCtx = ((radeonContextPtr)sharedContextPrivate)->glCtx;
-	else
-		shareCtx = NULL;
-	radeon->glCtx = _mesa_create_context(glVisual, shareCtx,
-					    functions, (void *)radeon);
-	if (!radeon->glCtx)
-		return GL_FALSE;
-
-	ctx = radeon->glCtx;
-	driContextPriv->driverPrivate = radeon;
-
-	/* DRI fields */
-	radeon->dri.context = driContextPriv;
-	radeon->dri.screen = sPriv;
-	radeon->dri.drawable = NULL;
-	radeon->dri.readable = NULL;
-	radeon->dri.hwContext = driContextPriv->hHWContext;
-	radeon->dri.hwLock = &sPriv->pSAREA->lock;
-	radeon->dri.fd = sPriv->fd;
-	radeon->dri.drmMinor = sPriv->drm_version.minor;
-
-	radeon->sarea = (drm_radeon_sarea_t *) ((GLubyte *) sPriv->pSAREA +
-					       screen->sarea_priv_offset);
-
-	/* Setup IRQs */
-	fthrottle_mode = driQueryOptioni(&radeon->optionCache, "fthrottle_mode");
-	radeon->iw.irq_seq = -1;
-	radeon->irqsEmitted = 0;
-	radeon->do_irqs = (fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS &&
-			  radeon->radeonScreen->irq);
-
-	radeon->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
-
-	if (!radeon->do_irqs)
-		fprintf(stderr,
-			"IRQ's not enabled, falling back to %s: %d %d\n",
-			radeon->do_usleeps ? "usleeps" : "busy waits",
-			fthrottle_mode, radeon->radeonScreen->irq);
-
-	(*sPriv->systemTime->getUST) (&radeon->swap_ust);
-
-	return GL_TRUE;
-}
-
-
-/**
- * Cleanup common context fields.
- * Called by r200DestroyContext/r300DestroyContext
- */
-void radeonCleanupContext(radeonContextPtr radeon)
-{
-	/* _mesa_destroy_context() might result in calls to functions that
-	 * depend on the DriverCtx, so don't set it to NULL before.
-	 *
-	 * radeon->glCtx->DriverCtx = NULL;
-	 */
-
-	/* free the Mesa context */
-	_mesa_destroy_context(radeon->glCtx);
-
-	if (radeon->state.scissor.pClipRects) {
-		FREE(radeon->state.scissor.pClipRects);
-		radeon->state.scissor.pClipRects = 0;
-	}
-}
-
-
-/**
- * Swap front and back buffer.
- */
-void radeonSwapBuffers(__DRIdrawablePrivate * dPriv)
-{
-	if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-		radeonContextPtr radeon;
-		GLcontext *ctx;
-
-		radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-		ctx = radeon->glCtx;
-
-		if (ctx->Visual.doubleBufferMode) {
-			_mesa_notifySwapBuffers(ctx);	/* flush pending rendering comands */
-			if (radeon->doPageFlip) {
-				radeonPageFlip(dPriv);
-			} else {
-			    radeonCopyBuffer(dPriv, NULL);
-			}
-		}
-	} else {
-		/* XXX this shouldn't be an error but we can't handle it for now */
-		_mesa_problem(NULL, "%s: drawable has no context!",
-			      __FUNCTION__);
-	}
-}
-
-void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
-			 int x, int y, int w, int h )
-{
-    if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-	radeonContextPtr radeon;
-	GLcontext *ctx;
-
-	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-	ctx = radeon->glCtx;
-
-	if (ctx->Visual.doubleBufferMode) {
-	    drm_clip_rect_t rect;
-	    rect.x1 = x + dPriv->x;
-	    rect.y1 = (dPriv->h - y - h) + dPriv->y;
-	    rect.x2 = rect.x1 + w;
-	    rect.y2 = rect.y1 + h;
-	    _mesa_notifySwapBuffers(ctx);	/* flush pending rendering comands */
-	    radeonCopyBuffer(dPriv, &rect);
-	}
-    } else {
-	/* XXX this shouldn't be an error but we can't handle it for now */
-	_mesa_problem(NULL, "%s: drawable has no context!",
-		      __FUNCTION__);
-    }
-}
-
-/* Force the context `c' to be the current context and associate with it
- * buffer `b'.
- */
-GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
-			    __DRIdrawablePrivate * driDrawPriv,
-			    __DRIdrawablePrivate * driReadPriv)
-{
-	if (driContextPriv) {
-		radeonContextPtr radeon =
-			(radeonContextPtr) driContextPriv->driverPrivate;
-
-		if (RADEON_DEBUG & DEBUG_DRI)
-			fprintf(stderr, "%s ctx %p\n", __FUNCTION__,
-				radeon->glCtx);
-
-		if (radeon->dri.drawable != driDrawPriv) {
-			if (driDrawPriv->swap_interval == (unsigned)-1) {
-				driDrawPriv->vblFlags =
-					(radeon->radeonScreen->irq != 0)
-					? driGetDefaultVBlankFlags(&radeon->
-								   optionCache)
-					: VBLANK_FLAG_NO_IRQ;
-
-				driDrawableInitVBlank(driDrawPriv);
-			}
-		}
-
-		radeon->dri.readable = driReadPriv;
-
-		if (radeon->dri.drawable != driDrawPriv ||
-		    radeon->lastStamp != driDrawPriv->lastStamp) {
-			radeon->dri.drawable = driDrawPriv;
-
-			radeonSetCliprects(radeon);
-			r300UpdateViewportOffset(radeon->glCtx);
-		}
-
-		_mesa_make_current(radeon->glCtx,
-				    (GLframebuffer *) driDrawPriv->
-				    driverPrivate,
-				    (GLframebuffer *) driReadPriv->
-				    driverPrivate);
-
-		_mesa_update_state(radeon->glCtx);		
-
-		radeonUpdatePageFlipping(radeon);
-	} else {
-		if (RADEON_DEBUG & DEBUG_DRI)
-			fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
-		_mesa_make_current(0, 0, 0);
-	}
-
-	if (RADEON_DEBUG & DEBUG_DRI)
-		fprintf(stderr, "End %s\n", __FUNCTION__);
-	return GL_TRUE;
-}
-
-/* Force the context `c' to be unbound from its buffer.
- */
-GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv)
-{
-	radeonContextPtr radeon = (radeonContextPtr) driContextPriv->driverPrivate;
-
-	if (RADEON_DEBUG & DEBUG_DRI)
-		fprintf(stderr, "%s ctx %p\n", __FUNCTION__,
-			radeon->glCtx);
-
-	return GL_TRUE;
-}
-
diff --git a/src/mesa/drivers/dri/r300/radeon_context.h b/src/mesa/drivers/dri/r300/radeon_context.h
index 47cbc22a72..250570f6b8 100644
--- a/src/mesa/drivers/dri/r300/radeon_context.h
+++ b/src/mesa/drivers/dri/r300/radeon_context.h
@@ -49,20 +49,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "drm.h"
 #include "dri_util.h"
 
-struct radeon_context;
-typedef struct radeon_context radeonContextRec;
-typedef struct radeon_context *radeonContextPtr;
-
-/* Rasterizing fallbacks */
-/* See correponding strings in r200_swtcl.c */
-#define RADEON_FALLBACK_TEXTURE		0x0001
-#define RADEON_FALLBACK_DRAW_BUFFER	0x0002
-#define RADEON_FALLBACK_STENCIL		0x0004
-#define RADEON_FALLBACK_RENDER_MODE	0x0008
-#define RADEON_FALLBACK_BLEND_EQ	0x0010
-#define RADEON_FALLBACK_BLEND_FUNC	0x0020
-#define RADEON_FALLBACK_DISABLE		0x0040
-#define RADEON_FALLBACK_BORDER_MODE	0x0080
+#include "radeon_screen.h"
 
 #if R200_MERGED
 extern void radeonFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
@@ -79,155 +66,11 @@ extern void radeonFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
 /* TCL fallbacks */
 extern void radeonTclFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
 
-#define RADEON_TCL_FALLBACK_RASTER		0x0001	/* rasterization */
-#define RADEON_TCL_FALLBACK_UNFILLED		0x0002	/* unfilled tris */
-#define RADEON_TCL_FALLBACK_LIGHT_TWOSIDE	0x0004	/* twoside tris */
-#define RADEON_TCL_FALLBACK_MATERIAL		0x0008	/* material in vb */
-#define RADEON_TCL_FALLBACK_TEXGEN_0		0x0010	/* texgen, unit 0 */
-#define RADEON_TCL_FALLBACK_TEXGEN_1		0x0020	/* texgen, unit 1 */
-#define RADEON_TCL_FALLBACK_TEXGEN_2		0x0040	/* texgen, unit 2 */
-#define RADEON_TCL_FALLBACK_TEXGEN_3		0x0080	/* texgen, unit 3 */
-#define RADEON_TCL_FALLBACK_TEXGEN_4		0x0100	/* texgen, unit 4 */
-#define RADEON_TCL_FALLBACK_TEXGEN_5		0x0200	/* texgen, unit 5 */
-#define RADEON_TCL_FALLBACK_TCL_DISABLE		0x0400	/* user disable */
-#define RADEON_TCL_FALLBACK_BITMAP		0x0800	/* draw bitmap with points */
-#define RADEON_TCL_FALLBACK_VERTEX_PROGRAM	0x1000	/* vertex program active */
-
 #if R200_MERGED
 #define TCL_FALLBACK( ctx, bit, mode )	radeonTclFallback( ctx, bit, mode )
 #else
 #define TCL_FALLBACK( ctx, bit, mode )	;
 #endif
 
-struct radeon_dri_mirror {
-	__DRIcontextPrivate *context;	/* DRI context */
-	__DRIscreenPrivate *screen;	/* DRI screen */
-	/**
-	 * DRI drawable bound to this context for drawing.
-	 */
-	__DRIdrawablePrivate *drawable;
-
-	/**
-	 * DRI drawable bound to this context for reading.
-	 */
-	__DRIdrawablePrivate *readable;
-
-	drm_context_t hwContext;
-	drm_hw_lock_t *hwLock;
-	int fd;
-	int drmMinor;
-};
-
-/**
- * Derived state for internal purposes.
- */
-struct radeon_scissor_state {
-	drm_clip_rect_t rect;
-	GLboolean enabled;
-
-	GLuint numClipRects;	/* Cliprects active */
-	GLuint numAllocedClipRects;	/* Cliprects available */
-	drm_clip_rect_t *pClipRects;
-};
-
-struct radeon_colorbuffer_state {
-	GLuint clear;
-	GLint drawOffset, drawPitch;
-};
-
-struct radeon_state {
-	struct radeon_colorbuffer_state color;
-	struct radeon_scissor_state scissor;
-};
-
-/**
- * Common per-context variables shared by R200 and R300.
- * R200- and R300-specific code "derive" their own context from this
- * structure.
- */
-struct radeon_context {
-	GLcontext *glCtx;	/* Mesa context */
-	radeonScreenPtr radeonScreen;	/* Screen private DRI data */
-
-	/* Fallback state */
-	GLuint Fallback;
-	GLuint TclFallback;
-
-	/* Page flipping */
-	GLuint doPageFlip;
-
-	/* Drawable, cliprect and scissor information */
-	GLuint numClipRects;	/* Cliprects for the draw buffer */
-	drm_clip_rect_t *pClipRects;
-	unsigned int lastStamp;
-	GLboolean lost_context;
-	drm_radeon_sarea_t *sarea;	/* Private SAREA data */
-
-	/* Mirrors of some DRI state */
-	struct radeon_dri_mirror dri;
-
-	/* Busy waiting */
-	GLuint do_usleeps;
-	GLuint do_irqs;
-	GLuint irqsEmitted;
-	drm_radeon_irq_wait_t iw;
-
-	/* buffer swap */
-	int64_t swap_ust;
-	int64_t swap_missed_ust;
-
-	GLuint swap_count;
-	GLuint swap_missed_count;
-
-	/* Derived state */
-	struct radeon_state state;
-
-	/* Configuration cache
-	 */
-	driOptionCache optionCache;
-};
-
-#define RADEON_CONTEXT(glctx) ((radeonContextPtr)(ctx->DriverCtx))
-
-extern void radeonSwapBuffers(__DRIdrawablePrivate * dPriv);
-extern void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
-				int x, int y, int w, int h);
-extern GLboolean radeonInitContext(radeonContextPtr radeon,
-				   struct dd_function_table *functions,
-				   const __GLcontextModes * glVisual,
-				   __DRIcontextPrivate * driContextPriv,
-				   void *sharedContextPrivate);
-extern void radeonCleanupContext(radeonContextPtr radeon);
-extern GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
-				   __DRIdrawablePrivate * driDrawPriv,
-				   __DRIdrawablePrivate * driReadPriv);
-extern GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv);
-
-/* ================================================================
- * Debugging:
- */
-#define DO_DEBUG		1
-
-#if DO_DEBUG
-extern int RADEON_DEBUG;
-#else
-#define RADEON_DEBUG		0
-#endif
-
-#define DEBUG_TEXTURE	0x0001
-#define DEBUG_STATE	0x0002
-#define DEBUG_IOCTL	0x0004
-#define DEBUG_PRIMS	0x0008
-#define DEBUG_VERTS	0x0010
-#define DEBUG_FALLBACKS	0x0020
-#define DEBUG_VFMT	0x0040
-#define DEBUG_CODEGEN	0x0080
-#define DEBUG_VERBOSE	0x0100
-#define DEBUG_DRI       0x0200
-#define DEBUG_DMA       0x0400
-#define DEBUG_SANITY    0x0800
-#define DEBUG_SYNC      0x1000
-#define DEBUG_PIXEL     0x2000
-#define DEBUG_MEMORY    0x4000
 
 #endif				/* __RADEON_CONTEXT_H__ */
diff --git a/src/mesa/drivers/dri/r300/radeon_ioctl.c b/src/mesa/drivers/dri/r300/radeon_ioctl.c
deleted file mode 100644
index f042a7b943..0000000000
--- a/src/mesa/drivers/dri/r300/radeon_ioctl.c
+++ /dev/null
@@ -1,396 +0,0 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell <keith@tungstengraphics.com>
- */
-
-#include <sched.h>
-#include <errno.h>
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/macros.h"
-#include "main/context.h"
-#include "swrast/swrast.h"
-#include "r300_context.h"
-#include "radeon_ioctl.h"
-#include "r300_ioctl.h"
-#include "r300_state.h"
-#include "radeon_reg.h"
-
-#include "drirenderbuffer.h"
-#include "vblank.h"
-
-static void radeonWaitForIdle(radeonContextPtr radeon);
-
-/* ================================================================
- * SwapBuffers with client-side throttling
- */
-
-static uint32_t radeonGetLastFrame(radeonContextPtr radeon)
-{
-	drm_radeon_getparam_t gp;
-	int ret;
-	uint32_t frame = 0;
-
-	gp.param = RADEON_PARAM_LAST_FRAME;
-	gp.value = (int *)&frame;
-	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
-				  &gp, sizeof(gp));
-	if (ret) {
-		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__,
-			ret);
-		exit(1);
-	}
-
-	return frame;
-}
-
-uint32_t radeonGetAge(radeonContextPtr radeon)
-{
-	drm_radeon_getparam_t gp;
-	int ret;
-	uint32_t age = 0;
-
-	gp.param = RADEON_PARAM_LAST_CLEAR;
-	gp.value = (int *)&age;
-	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
-				  &gp, sizeof(gp));
-	if (ret) {
-		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__,
-			ret);
-		exit(1);
-	}
-
-	return age;
-}
-
-static void radeonEmitIrqLocked(radeonContextPtr radeon)
-{
-	drm_radeon_irq_emit_t ie;
-	int ret;
-
-	ie.irq_seq = &radeon->iw.irq_seq;
-	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_IRQ_EMIT,
-				  &ie, sizeof(ie));
-	if (ret) {
-		fprintf(stderr, "%s: drmRadeonIrqEmit: %d\n", __FUNCTION__,
-			ret);
-		exit(1);
-	}
-}
-
-static void radeonWaitIrq(radeonContextPtr radeon)
-{
-	int ret;
-
-	do {
-		ret = drmCommandWrite(radeon->dri.fd, DRM_RADEON_IRQ_WAIT,
-				      &radeon->iw, sizeof(radeon->iw));
-	} while (ret && (errno == EINTR || errno == EBUSY));
-
-	if (ret) {
-		fprintf(stderr, "%s: drmRadeonIrqWait: %d\n", __FUNCTION__,
-			ret);
-		exit(1);
-	}
-}
-
-static void radeonWaitForFrameCompletion(radeonContextPtr radeon)
-{
-	drm_radeon_sarea_t *sarea = radeon->sarea;
-
-	if (radeon->do_irqs) {
-		if (radeonGetLastFrame(radeon) < sarea->last_frame) {
-			if (!radeon->irqsEmitted) {
-				while (radeonGetLastFrame(radeon) <
-				       sarea->last_frame) ;
-			} else {
-				UNLOCK_HARDWARE(radeon);
-				radeonWaitIrq(radeon);
-				LOCK_HARDWARE(radeon);
-			}
-			radeon->irqsEmitted = 10;
-		}
-
-		if (radeon->irqsEmitted) {
-			radeonEmitIrqLocked(radeon);
-			radeon->irqsEmitted--;
-		}
-	} else {
-		while (radeonGetLastFrame(radeon) < sarea->last_frame) {
-			UNLOCK_HARDWARE(radeon);
-			if (radeon->do_usleeps)
-				DO_USLEEP(1);
-			LOCK_HARDWARE(radeon);
-		}
-	}
-}
-
-/* Copy the back color buffer to the front color buffer.
- */
-void radeonCopyBuffer(__DRIdrawablePrivate * dPriv,
-		      const drm_clip_rect_t	 * rect)
-{
-	radeonContextPtr radeon;
-	GLint nbox, i, ret;
-	GLboolean missed_target;
-	int64_t ust;
-	__DRIscreenPrivate *psp = dPriv->driScreenPriv;
-
-	assert(dPriv);
-	assert(dPriv->driContextPriv);
-	assert(dPriv->driContextPriv->driverPrivate);
-
-	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-
-	if (RADEON_DEBUG & DEBUG_IOCTL) {
-		fprintf(stderr, "\n%s( %p )\n\n", __FUNCTION__,
-			(void *)radeon->glCtx);
-	}
-
-	r300Flush(radeon->glCtx);
-
-	LOCK_HARDWARE(radeon);
-
-	/* Throttle the frame rate -- only allow one pending swap buffers
-	 * request at a time.
-	 */
-	radeonWaitForFrameCompletion(radeon);
-	if (!rect)
-	{
-	    UNLOCK_HARDWARE(radeon);
-	    driWaitForVBlank(dPriv, &missed_target);
-	    LOCK_HARDWARE(radeon);
-	}
-
-	nbox = dPriv->numClipRects;	/* must be in locked region */
-
-	for (i = 0; i < nbox;) {
-		GLint nr = MIN2(i + RADEON_NR_SAREA_CLIPRECTS, nbox);
-		drm_clip_rect_t *box = dPriv->pClipRects;
-		drm_clip_rect_t *b = radeon->sarea->boxes;
-		GLint n = 0;
-
-		for ( ; i < nr ; i++ ) {
-
-		    *b = box[i];
-
-		    if (rect)
-		    {
-			if (rect->x1 > b->x1)
-			    b->x1 = rect->x1;
-			if (rect->y1 > b->y1)
-			    b->y1 = rect->y1;
-			if (rect->x2 < b->x2)
-			    b->x2 = rect->x2;
-			if (rect->y2 < b->y2)
-			    b->y2 = rect->y2;
-
-			if (b->x1 >= b->x2 || b->y1 >= b->y2)
-			    continue;
-		    }
-
-		    b++;
-		    n++;
-		}
-		radeon->sarea->nbox = n;
-
-		if (!n)
-		   continue;
-
-		ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_SWAP);
-
-		if (ret) {
-			fprintf(stderr, "DRM_RADEON_SWAP: return = %d\n",
-				ret);
-			UNLOCK_HARDWARE(radeon);
-			exit(1);
-		}
-	}
-
-	UNLOCK_HARDWARE(radeon);
-	if (!rect)
-	{
-	    ((r300ContextPtr)radeon)->hw.all_dirty = GL_TRUE;
-
-	    radeon->swap_count++;
-	    (*psp->systemTime->getUST) (&ust);
-	    if (missed_target) {
-		radeon->swap_missed_count++;
-		radeon->swap_missed_ust = ust - radeon->swap_ust;
-	    }
-
-	    radeon->swap_ust = ust;
-
-	    sched_yield();
-	}
-}
-
-void radeonPageFlip(__DRIdrawablePrivate * dPriv)
-{
-	radeonContextPtr radeon;
-	GLint ret;
-	GLboolean missed_target;
-	__DRIscreenPrivate *psp = dPriv->driScreenPriv;
-
-	assert(dPriv);
-	assert(dPriv->driContextPriv);
-	assert(dPriv->driContextPriv->driverPrivate);
-
-	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-
-	if (RADEON_DEBUG & DEBUG_IOCTL) {
-		fprintf(stderr, "%s: pfCurrentPage: %d\n", __FUNCTION__,
-			radeon->sarea->pfCurrentPage);
-	}
-
-	r300Flush(radeon->glCtx);
-	LOCK_HARDWARE(radeon);
-
-	if (!dPriv->numClipRects) {
-		UNLOCK_HARDWARE(radeon);
-		usleep(10000);	/* throttle invisible client 10ms */
-		return;
-	}
-
-	/* Need to do this for the perf box placement:
-	 */
-	{
-		drm_clip_rect_t *box = dPriv->pClipRects;
-		drm_clip_rect_t *b = radeon->sarea->boxes;
-		b[0] = box[0];
-		radeon->sarea->nbox = 1;
-	}
-
-	/* Throttle the frame rate -- only allow a few pending swap buffers
-	 * request at a time.
-	 */
-	radeonWaitForFrameCompletion(radeon);
-	UNLOCK_HARDWARE(radeon);
-	driWaitForVBlank(dPriv, &missed_target);
-	if (missed_target) {
-		radeon->swap_missed_count++;
-		(void)(*psp->systemTime->getUST) (&radeon->swap_missed_ust);
-	}
-	LOCK_HARDWARE(radeon);
-
-	ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_FLIP);
-
-	UNLOCK_HARDWARE(radeon);
-
-	if (ret) {
-		fprintf(stderr, "DRM_RADEON_FLIP: return = %d\n", ret);
-		exit(1);
-	}
-
-	radeon->swap_count++;
-	(void)(*psp->systemTime->getUST) (&radeon->swap_ust);
-
-        driFlipRenderbuffers(radeon->glCtx->WinSysDrawBuffer, 
-                             radeon->sarea->pfCurrentPage);
-
-	if (radeon->sarea->pfCurrentPage == 1) {
-		radeon->state.color.drawOffset = radeon->radeonScreen->frontOffset;
-		radeon->state.color.drawPitch = radeon->radeonScreen->frontPitch;
-	} else {
-		radeon->state.color.drawOffset = radeon->radeonScreen->backOffset;
-		radeon->state.color.drawPitch = radeon->radeonScreen->backPitch;
-	}
-
-	if (IS_R300_CLASS(radeon->radeonScreen)) {
-		r300ContextPtr r300 = (r300ContextPtr)radeon;
-		R300_STATECHANGE(r300, cb);
-		r300->hw.cb.cmd[R300_CB_OFFSET] = r300->radeon.state.color.drawOffset + 
-						r300->radeon.radeonScreen->fbLocation;
-		r300->hw.cb.cmd[R300_CB_PITCH] = r300->radeon.state.color.drawPitch;
-		
-		if (r300->radeon.radeonScreen->cpp == 4)
-			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
-		else
-			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
-	
-		if (r300->radeon.sarea->tiling_enabled)
-			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
-	}
-}
-
-void radeonWaitForIdleLocked(radeonContextPtr radeon)
-{
-	int ret;
-	int i = 0;
-
-	do {
-		ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_CP_IDLE);
-		if (ret)
-			DO_USLEEP(1);
-	} while (ret && ++i < 100);
-
-	if (ret < 0) {
-		UNLOCK_HARDWARE(radeon);
-		fprintf(stderr, "Error: R300 timed out... exiting\n");
-		exit(-1);
-	}
-}
-
-static void radeonWaitForIdle(radeonContextPtr radeon)
-{
-	LOCK_HARDWARE(radeon);
-	radeonWaitForIdleLocked(radeon);
-	UNLOCK_HARDWARE(radeon);
-}
-
-void radeonFlush(GLcontext * ctx)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-
-	if (IS_R300_CLASS(radeon->radeonScreen))
-		r300Flush(ctx);
-}
-
-
-/* Make sure all commands have been sent to the hardware and have
- * completed processing.
- */
-void radeonFinish(GLcontext * ctx)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-
-	radeonFlush(ctx);
-
-	if (radeon->do_irqs) {
-		LOCK_HARDWARE(radeon);
-		radeonEmitIrqLocked(radeon);
-		UNLOCK_HARDWARE(radeon);
-		radeonWaitIrq(radeon);
-	} else
-		radeonWaitForIdle(radeon);
-}
diff --git a/src/mesa/drivers/dri/r300/radeon_ioctl.h b/src/mesa/drivers/dri/r300/radeon_ioctl.h
deleted file mode 100644
index 3add775b82..0000000000
--- a/src/mesa/drivers/dri/r300/radeon_ioctl.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell <keith@tungstengraphics.com>
- */
-
-#ifndef __RADEON_IOCTL_H__
-#define __RADEON_IOCTL_H__
-
-#include "main/simple_list.h"
-#include "radeon_dri.h"
-#include "radeon_lock.h"
-
-#include "xf86drm.h"
-#include "drm.h"
-#if 0
-#include "r200context.h"
-#endif
-#include "radeon_drm.h"
-
-extern void radeonCopyBuffer(__DRIdrawablePrivate * drawable,
-			     const drm_clip_rect_t	* rect);
-extern void radeonPageFlip(__DRIdrawablePrivate * drawable);
-extern void radeonFlush(GLcontext * ctx);
-extern void radeonFinish(GLcontext * ctx);
-extern void radeonWaitForIdleLocked(radeonContextPtr radeon);
-extern uint32_t radeonGetAge(radeonContextPtr radeon);
-
-#endif				/* __RADEON_IOCTL_H__ */
diff --git a/src/mesa/drivers/dri/r300/radeon_lock.c b/src/mesa/drivers/dri/r300/radeon_lock.c
deleted file mode 100644
index 4f47afd5dc..0000000000
--- a/src/mesa/drivers/dri/r300/radeon_lock.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/**************************************************************************
-
-Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
-                     VA Linux Systems Inc., Fremont, California.
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-All Rights Reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Gareth Hughes <gareth@valinux.com>
- *   Keith Whitwell <keith@tungstengraphics.com>
- *   Kevin E. Martin <martin@valinux.com>
- */
-
-#include "radeon_lock.h"
-#include "radeon_ioctl.h"
-#include "radeon_state.h"
-#include "r300_context.h"
-#include "r300_state.h"
-
-#include "main/framebuffer.h"
-
-#include "drirenderbuffer.h"
-
-#if DEBUG_LOCKING
-char *prevLockFile = NULL;
-int prevLockLine = 0;
-#endif
-
-/* Turn on/off page flipping according to the flags in the sarea:
- */
-void radeonUpdatePageFlipping(radeonContextPtr rmesa)
-{
-	int use_back;
-
-	rmesa->doPageFlip = rmesa->sarea->pfState;
-	if (rmesa->glCtx->WinSysDrawBuffer) {
-		driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
-				     rmesa->sarea->pfCurrentPage);
-		r300UpdateDrawBuffer(rmesa->glCtx);
-	}
-
-	use_back = rmesa->glCtx->DrawBuffer ?
-	    (rmesa->glCtx->DrawBuffer->_ColorDrawBufferIndexes[0] ==
-	     BUFFER_BACK_LEFT) : 1;
-	use_back ^= (rmesa->sarea->pfCurrentPage == 1);
-
-	if (use_back) {
-		rmesa->state.color.drawOffset =
-		    rmesa->radeonScreen->backOffset;
-		rmesa->state.color.drawPitch = rmesa->radeonScreen->backPitch;
-	} else {
-		rmesa->state.color.drawOffset =
-		    rmesa->radeonScreen->frontOffset;
-		rmesa->state.color.drawPitch =
-		    rmesa->radeonScreen->frontPitch;
-	}
-}
-
-/* Update the hardware state.  This is called if another context has
- * grabbed the hardware lock, which includes the X server.  This
- * function also updates the driver's window state after the X server
- * moves, resizes or restacks a window -- the change will be reflected
- * in the drawable position and clip rects.  Since the X server grabs
- * the hardware lock when it changes the window state, this routine will
- * automatically be called after such a change.
- */
-void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
-{
-	__DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
-	__DRIdrawablePrivate *const readable = rmesa->dri.readable;
-	__DRIscreenPrivate *sPriv = rmesa->dri.screen;
-	drm_radeon_sarea_t *sarea = rmesa->sarea;
-	r300ContextPtr r300 = (r300ContextPtr) rmesa;
-
-	assert(drawable != NULL);
-
-	drmGetLock(rmesa->dri.fd, rmesa->dri.hwContext, flags);
-
-	/* The window might have moved, so we might need to get new clip
-	 * rects.
-	 *
-	 * NOTE: This releases and regrabs the hw lock to allow the X server
-	 * to respond to the DRI protocol request for new drawable info.
-	 * Since the hardware state depends on having the latest drawable
-	 * clip rects, all state checking must be done _after_ this call.
-	 */
-	DRI_VALIDATE_DRAWABLE_INFO(sPriv, drawable);
-	if (drawable != readable) {
-		DRI_VALIDATE_DRAWABLE_INFO(sPriv, readable);
-	}
-
-	if (rmesa->lastStamp != drawable->lastStamp) {
-		radeonUpdatePageFlipping(rmesa);
-		radeonSetCliprects(rmesa);
-		r300UpdateViewportOffset(rmesa->glCtx);
-		driUpdateFramebufferSize(rmesa->glCtx, drawable);
-	}
-
-	if (sarea->ctx_owner != rmesa->dri.hwContext) {
-		int i;
-
-		sarea->ctx_owner = rmesa->dri.hwContext;
-		for (i = 0; i < r300->nr_heaps; i++) {
-			DRI_AGE_TEXTURES(r300->texture_heaps[i]);
-		}
-	}
-
-	rmesa->lost_context = GL_TRUE;
-}
diff --git a/src/mesa/drivers/dri/r300/radeon_lock.h b/src/mesa/drivers/dri/r300/radeon_lock.h
deleted file mode 100644
index a344837f47..0000000000
--- a/src/mesa/drivers/dri/r300/radeon_lock.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/**************************************************************************
-
-Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
-                     VA Linux Systems Inc., Fremont, California.
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-All Rights Reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Gareth Hughes <gareth@valinux.com>
- *   Keith Whitwell <keith@tungstengraphics.com>
- *   Kevin E. Martin <martin@valinux.com>
- */
-
-#ifndef __RADEON_LOCK_H__
-#define __RADEON_LOCK_H__
-
-#include "radeon_context.h"
-
-extern void radeonGetLock(radeonContextPtr rmesa, GLuint flags);
-extern void radeonUpdatePageFlipping(radeonContextPtr rmesa);
-
-/* Turn DEBUG_LOCKING on to find locking conflicts.
- */
-#define DEBUG_LOCKING	0
-
-#if DEBUG_LOCKING
-extern char *prevLockFile;
-extern int prevLockLine;
-
-#define DEBUG_LOCK()							\
-   do {									\
-      prevLockFile = (__FILE__);					\
-      prevLockLine = (__LINE__);					\
-   } while (0)
-
-#define DEBUG_RESET()							\
-   do {									\
-      prevLockFile = 0;							\
-      prevLockLine = 0;							\
-   } while (0)
-
-#define DEBUG_CHECK_LOCK()						\
-   do {									\
-      if (prevLockFile) {						\
-	 fprintf(stderr,						\
-		  "LOCK SET!\n\tPrevious %s:%d\n\tCurrent: %s:%d\n",	\
-		  prevLockFile, prevLockLine, __FILE__, __LINE__);	\
-	 exit(1);							\
-      }									\
-   } while (0)
-
-#else
-
-#define DEBUG_LOCK()
-#define DEBUG_RESET()
-#define DEBUG_CHECK_LOCK()
-
-#endif
-
-/*
- * !!! We may want to separate locks from locks with validation.  This
- * could be used to improve performance for those things commands that
- * do not do any drawing !!!
- */
-
-/* Lock the hardware and validate our state.
- */
-#define LOCK_HARDWARE( rmesa )						\
-	do {								\
-		char __ret = 0;						\
-		DEBUG_CHECK_LOCK();					\
-		DRM_CAS((rmesa)->dri.hwLock, (rmesa)->dri.hwContext,	\
-			(DRM_LOCK_HELD | (rmesa)->dri.hwContext), __ret); \
-		if (__ret)						\
-			radeonGetLock((rmesa), 0);			\
-		DEBUG_LOCK();						\
-	} while (0)
-
-#define UNLOCK_HARDWARE( rmesa )					\
-	do {								\
-		DRM_UNLOCK((rmesa)->dri.fd,				\
-			(rmesa)->dri.hwLock,				\
-			(rmesa)->dri.hwContext);			\
-		DEBUG_RESET();						\
-	} while (0)
-
-#endif				/* __RADEON_LOCK_H__ */
diff --git a/src/mesa/drivers/dri/r300/radeon_program_pair.c b/src/mesa/drivers/dri/r300/radeon_program_pair.c
index 2e21f7bf66..906d36e522 100644
--- a/src/mesa/drivers/dri/r300/radeon_program_pair.c
+++ b/src/mesa/drivers/dri/r300/radeon_program_pair.c
@@ -35,7 +35,7 @@
 
 #include "radeon_program_pair.h"
 
-#include "radeon_context.h"
+#include "radeon_common.h"
 
 #include "shader/prog_print.h"
 
@@ -47,7 +47,6 @@
 
 struct pair_state_instruction {
 	GLuint IsTex:1; /**< Is a texture instruction */
-	GLuint IsOutput:1; /**< Is output instruction */
 	GLuint NeedRGB:1; /**< Needs the RGB ALU */
 	GLuint NeedAlpha:1; /**< Needs the Alpha ALU */
 	GLuint IsTranscendent:1; /**< Is a special transcendent instruction */
@@ -124,7 +123,6 @@ struct pair_state {
 	GLboolean Debug;
 	GLboolean Verbose;
 	void *UserData;
-	GLubyte NumKillInsts;
 
 	/**
 	 * Translate Mesa registers to hardware registers
@@ -151,11 +149,6 @@ struct pair_state {
 	struct pair_state_instruction *ReadyTEX;
 
 	/**
-	 * Linked list of deferred instructions
-	 */
-	struct pair_state_instruction *DeferredInsts;
-
-	/**
 	 * Pool of @ref reg_value structures for fast allocation.
 	 */
 	struct reg_value *ValuePool;
@@ -238,9 +231,7 @@ static void instruction_ready(struct pair_state *s, int ip)
 	if (s->Verbose)
 		_mesa_printf("instruction_ready(%i)\n", ip);
 
-	if (s->NumKillInsts > 0 && pairinst->IsOutput)
-		add_pairinst_to_list(&s->DeferredInsts, pairinst);
-	else if (pairinst->IsTex)
+	if (pairinst->IsTex)
 		add_pairinst_to_list(&s->ReadyTEX, pairinst);
 	else if (!pairinst->NeedAlpha)
 		add_pairinst_to_list(&s->ReadyRGB, pairinst);
@@ -348,8 +339,6 @@ static void classify_instruction(struct pair_state *s,
 		error("Unknown opcode %d\n", inst->Opcode);
 		break;
 	}
-
-	pairinst->IsOutput = (inst->DstReg.File == PROGRAM_OUTPUT);
 }
 
 
@@ -613,16 +602,14 @@ static void emit_all_tex(struct pair_state *s)
 		struct prog_instruction *inst = s->Program->Instructions + ip;
 		commit_instruction(s, ip);
 
-		if (inst->Opcode == OPCODE_KIL)
-			--s->NumKillInsts;
-		else
+		if (inst->Opcode != OPCODE_KIL)
 			inst->DstReg.Index = get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index);
-
 		inst->SrcReg[0].Index = get_hw_reg(s, inst->SrcReg[0].File, inst->SrcReg[0].Index);
 
 		if (s->Debug) {
 			_mesa_printf("   ");
 			_mesa_print_instruction(inst);
+			fflush(stdout);
 		}
 		s->Error = s->Error || !s->Handler->EmitTex(s->UserData, inst);
 	}
@@ -875,17 +862,6 @@ static void emit_alu(struct pair_state *s)
 	s->Error = s->Error || !s->Handler->EmitPaired(s->UserData, &pair);
 }
 
-static GLubyte countKillInsts(struct gl_program *prog)
-{
-	GLubyte i, count = 0;
-
-	for (i = 0; i < prog->NumInstructions; ++i) {
-		if (prog->Instructions[i].Opcode == OPCODE_KIL)
-			++count;
-	}
-
-	return count;
-}
 
 GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program,
 	const struct radeon_pair_handler* handler, void *userdata)
@@ -899,7 +875,6 @@ GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program,
 	s.UserData = userdata;
 	s.Debug = (RADEON_DEBUG & DEBUG_PIXEL) ? GL_TRUE : GL_FALSE;
 	s.Verbose = GL_FALSE && s.Debug;
-	s.NumKillInsts = countKillInsts(program);
 
 	s.Instructions = (struct pair_state_instruction*)_mesa_calloc(
 		sizeof(struct pair_state_instruction)*s.Program->NumInstructions);
@@ -918,21 +893,6 @@ GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program,
 		if (s.ReadyTEX)
 			emit_all_tex(&s);
 
-		if (!s.NumKillInsts) {
-			struct pair_state_instruction *pairinst = s.DeferredInsts;
-			while (pairinst) {
-				if (!pairinst->NeedAlpha)
-					add_pairinst_to_list(&s.ReadyRGB, pairinst);
-				else if (!pairinst->NeedRGB)
-					add_pairinst_to_list(&s.ReadyAlpha, pairinst);
-				else
-					add_pairinst_to_list(&s.ReadyFullALU, pairinst);
-
-				pairinst = pairinst->NextReady;
-			}
-			s.DeferredInsts = NULL;
-		}
-
 		while(s.ReadyFullALU || s.ReadyRGB || s.ReadyAlpha)
 			emit_alu(&s);
 	}
diff --git a/src/mesa/drivers/dri/r300/radeon_span.c b/src/mesa/drivers/dri/r300/radeon_span.c
deleted file mode 100644
index 16f9fb99e6..0000000000
--- a/src/mesa/drivers/dri/r300/radeon_span.c
+++ /dev/null
@@ -1,349 +0,0 @@
-/**************************************************************************
-
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
-                     VA Linux Systems Inc., Fremont, California.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-All Rights Reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Kevin E. Martin <martin@valinux.com>
- *   Gareth Hughes <gareth@valinux.com>
- *   Keith Whitwell <keith@tungstengraphics.com>
- *
- */
-
-#include "main/glheader.h"
-#include "swrast/swrast.h"
-
-#include "r300_state.h"
-#include "radeon_ioctl.h"
-#include "r300_ioctl.h"
-#include "radeon_span.h"
-
-#include "drirenderbuffer.h"
-
-#define DBG 0
-
-/*
- * Note that all information needed to access pixels in a renderbuffer
- * should be obtained through the gl_renderbuffer parameter, not per-context
- * information.
- */
-#define LOCAL_VARS						\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;		\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;		\
-   const GLuint bottom = dPriv->h - 1;				\
-   GLubyte *buf = (GLubyte *) drb->flippedData			\
-      + (dPriv->y * drb->flippedPitch + dPriv->x) * drb->cpp;	\
-   GLuint p;							\
-   (void) p;
-
-#define LOCAL_DEPTH_VARS				\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;	\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
-   const GLuint bottom = dPriv->h - 1;			\
-   GLuint xo = dPriv->x;				\
-   GLuint yo = dPriv->y;				\
-   GLubyte *buf = (GLubyte *) drb->Base.Data;
-
-#define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
-
-#define Y_FLIP(Y) (bottom - (Y))
-
-#define HW_LOCK()
-
-#define HW_UNLOCK()
-
-/* ================================================================
- * Color buffer
- */
-
-/* 16 bit, RGB565 color spanline and pixel functions
- */
-#define SPANTMP_PIXEL_FMT GL_RGB
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
-
-#define TAG(x)    radeon##x##_RGB565
-#define TAG2(x,y) radeon##x##_RGB565##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 2)
-#include "spantmp2.h"
-
-/* 32 bit, ARGB8888 color spanline and pixel functions
- */
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    radeon##x##_ARGB8888
-#define TAG2(x,y) radeon##x##_ARGB8888##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 4)
-#include "spantmp2.h"
-
-/* ================================================================
- * Depth buffer
- */
-
-/* The Radeon family has depth tiling on all the time, so we have to convert
- * the x,y coordinates into the memory bus address (mba) in the same
- * manner as the engine.  In each case, the linear block address (ba)
- * is calculated, and then wired with x and y to produce the final
- * memory address.
- * The chip will do address translation on its own if the surface registers
- * are set up correctly. It is not quite enough to get it working with hyperz
- * too...
- */
-
-static GLuint radeon_mba_z32(const driRenderbuffer * drb, GLint x, GLint y)
-{
-	GLuint pitch = drb->pitch;
-	if (drb->depthHasSurface) {
-		return 4 * (x + y * pitch);
-	} else {
-		GLuint ba, address = 0;	/* a[0..1] = 0           */
-
-#ifdef COMPILE_R300
-		ba = (y / 8) * (pitch / 8) + (x / 8);
-#else
-		ba = (y / 16) * (pitch / 16) + (x / 16);
-#endif
-
-		address |= (x & 0x7) << 2;	/* a[2..4] = x[0..2]     */
-		address |= (y & 0x3) << 5;	/* a[5..6] = y[0..1]     */
-		address |= (((x & 0x10) >> 2) ^ (y & 0x4)) << 5;	/* a[7]    = x[4] ^ y[2] */
-		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
-
-		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
-		address |= (((x & 0x8) << 1) ^ (y & 0x10)) << 7;	/* a[11]   = x[3] ^ y[4] */
-		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
-
-		return address;
-	}
-}
-
-static INLINE GLuint
-radeon_mba_z16(const driRenderbuffer * drb, GLint x, GLint y)
-{
-	GLuint pitch = drb->pitch;
-	if (drb->depthHasSurface) {
-		return 2 * (x + y * pitch);
-	} else {
-		GLuint ba, address = 0;	/* a[0]    = 0           */
-
-		ba = (y / 16) * (pitch / 32) + (x / 32);
-
-		address |= (x & 0x7) << 1;	/* a[1..3] = x[0..2]     */
-		address |= (y & 0x7) << 4;	/* a[4..6] = y[0..2]     */
-		address |= (x & 0x8) << 4;	/* a[7]    = x[3]        */
-		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
-		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
-		address |= ((x & 0x10) ^ (y & 0x10)) << 7;	/* a[11]   = x[4] ^ y[4] */
-		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
-
-		return address;
-	}
-}
-
-/* 16-bit depth buffer functions
- */
-#define VALUE_TYPE GLushort
-
-#define WRITE_DEPTH( _x, _y, d )					\
-   *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo )) = d;
-
-#define READ_DEPTH( d, _x, _y )						\
-   d = *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo ));
-
-#define TAG(x) radeon##x##_z16
-#include "depthtmp.h"
-
-/* 24 bit depth, 8 bit stencil depthbuffer functions
- *
- * Careful: It looks like the R300 uses ZZZS byte order while the R200
- * uses SZZZ for 24 bit depth, 8 bit stencil mode.
- */
-#define VALUE_TYPE GLuint
-
-#ifdef COMPILE_R300
-#define WRITE_DEPTH( _x, _y, d )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0x000000ff;							\
-   tmp |= ((d << 8) & 0xffffff00);					\
-   *(GLuint *)(buf + offset) = tmp;					\
-} while (0)
-#else
-#define WRITE_DEPTH( _x, _y, d )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0xff000000;							\
-   tmp |= ((d) & 0x00ffffff);						\
-   *(GLuint *)(buf + offset) = tmp;					\
-} while (0)
-#endif
-
-#ifdef COMPILE_R300
-#define READ_DEPTH( d, _x, _y )						\
-  do { \
-    d = (*(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,		\
-					 _y + yo )) & 0xffffff00) >> 8; \
-  }while(0)
-#else
-#define READ_DEPTH( d, _x, _y )						\
-   d = *(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,			\
-					 _y + yo )) & 0x00ffffff;
-#endif
-
-#define TAG(x) radeon##x##_z24_s8
-#include "depthtmp.h"
-
-/* ================================================================
- * Stencil buffer
- */
-
-/* 24 bit depth, 8 bit stencil depthbuffer functions
- */
-#ifdef COMPILE_R300
-#define WRITE_STENCIL( _x, _y, d )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0xffffff00;							\
-   tmp |= (d) & 0xff;							\
-   *(GLuint *)(buf + offset) = tmp;					\
-} while (0)
-#else
-#define WRITE_STENCIL( _x, _y, d )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0x00ffffff;							\
-   tmp |= (((d) & 0xff) << 24);						\
-   *(GLuint *)(buf + offset) = tmp;					\
-} while (0)
-#endif
-
-#ifdef COMPILE_R300
-#define READ_STENCIL( d, _x, _y )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   d = tmp & 0x000000ff;						\
-} while (0)
-#else
-#define READ_STENCIL( d, _x, _y )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   d = (tmp & 0xff000000) >> 24;					\
-} while (0)
-#endif
-
-#define TAG(x) radeon##x##_z24_s8
-#include "stenciltmp.h"
-
-/* Move locking out to get reasonable span performance (10x better
- * than doing this in HW_LOCK above).  WaitForIdle() is the main
- * culprit.
- */
-
-static void radeonSpanRenderStart(GLcontext * ctx)
-{
-	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-#ifdef COMPILE_R300
-	r300ContextPtr r300 = (r300ContextPtr) rmesa;
-	R300_FIREVERTICES(r300);
-#else
-	RADEON_FIREVERTICES(rmesa);
-#endif
-	LOCK_HARDWARE(rmesa);
-	radeonWaitForIdleLocked(rmesa);
-
-	/* Read the first pixel in the frame buffer.  This should
-	 * be a noop, right?  In fact without this conform fails as reading
-	 * from the framebuffer sometimes produces old results -- the
-	 * on-card read cache gets mixed up and doesn't notice that the
-	 * framebuffer has been updated.
-	 *
-	 * Note that we should probably be reading some otherwise unused
-	 * region of VRAM, otherwise we might get incorrect results when
-	 * reading pixels from the top left of the screen.
-	 *
-	 * I found this problem on an R420 with glean's texCube test.
-	 * Note that the R200 span code also *writes* the first pixel in the
-	 * framebuffer, but I've found this to be unnecessary.
-	 *  -- Nicolai Hähnle, June 2008
-	 */
-	{
-		int p;
-		driRenderbuffer *drb =
-			(driRenderbuffer *) ctx->WinSysDrawBuffer->_ColorDrawBuffers[0];
-		volatile int *buf =
-			(volatile int *)(rmesa->dri.screen->pFB + drb->offset);
-		p = *buf;
-	}
-}
-
-static void radeonSpanRenderFinish(GLcontext * ctx)
-{
-	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-	_swrast_flush(ctx);
-	UNLOCK_HARDWARE(rmesa);
-}
-
-void radeonInitSpanFuncs(GLcontext * ctx)
-{
-	struct swrast_device_driver *swdd =
-	    _swrast_GetDeviceDriverReference(ctx);
-	swdd->SpanRenderStart = radeonSpanRenderStart;
-	swdd->SpanRenderFinish = radeonSpanRenderFinish;
-}
-
-/**
- * Plug in the Get/Put routines for the given driRenderbuffer.
- */
-void radeonSetSpanFunctions(driRenderbuffer * drb, const GLvisual * vis)
-{
-	if (drb->Base.InternalFormat == GL_RGBA) {
-		if (vis->redBits == 5 && vis->greenBits == 6
-		    && vis->blueBits == 5) {
-			radeonInitPointers_RGB565(&drb->Base);
-		} else {
-			radeonInitPointers_ARGB8888(&drb->Base);
-		}
-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT16) {
-		radeonInitDepthPointers_z16(&drb->Base);
-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT24) {
-		radeonInitDepthPointers_z24_s8(&drb->Base);
-	} else if (drb->Base.InternalFormat == GL_STENCIL_INDEX8_EXT) {
-		radeonInitStencilPointers_z24_s8(&drb->Base);
-	}
-}
diff --git a/src/mesa/drivers/dri/r300/radeon_state.c b/src/mesa/drivers/dri/r300/radeon_state.c
deleted file mode 100644
index c401da6c54..0000000000
--- a/src/mesa/drivers/dri/r300/radeon_state.c
+++ /dev/null
@@ -1,244 +0,0 @@
-/**************************************************************************
-
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell <keith@tungstengraphics.com>
- */
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/api_arrayelt.h"
-#include "main/enums.h"
-#include "main/framebuffer.h"
-#include "main/colormac.h"
-#include "main/light.h"
-
-#include "swrast/swrast.h"
-#include "vbo/vbo.h"
-#include "tnl/tnl.h"
-#include "tnl/t_pipeline.h"
-#include "swrast_setup/swrast_setup.h"
-
-#include "radeon_ioctl.h"
-#include "radeon_state.h"
-#include "r300_ioctl.h"
-
-
-/* =============================================================
- * Scissoring
- */
-
-static GLboolean intersect_rect(drm_clip_rect_t * out,
-				drm_clip_rect_t * a, drm_clip_rect_t * b)
-{
-	*out = *a;
-	if (b->x1 > out->x1)
-		out->x1 = b->x1;
-	if (b->y1 > out->y1)
-		out->y1 = b->y1;
-	if (b->x2 < out->x2)
-		out->x2 = b->x2;
-	if (b->y2 < out->y2)
-		out->y2 = b->y2;
-	if (out->x1 >= out->x2)
-		return GL_FALSE;
-	if (out->y1 >= out->y2)
-		return GL_FALSE;
-	return GL_TRUE;
-}
-
-void radeonRecalcScissorRects(radeonContextPtr radeon)
-{
-	drm_clip_rect_t *out;
-	int i;
-
-	/* Grow cliprect store?
-	 */
-	if (radeon->state.scissor.numAllocedClipRects < radeon->numClipRects) {
-		while (radeon->state.scissor.numAllocedClipRects <
-		       radeon->numClipRects) {
-			radeon->state.scissor.numAllocedClipRects += 1;	/* zero case */
-			radeon->state.scissor.numAllocedClipRects *= 2;
-		}
-
-		if (radeon->state.scissor.pClipRects)
-			FREE(radeon->state.scissor.pClipRects);
-
-		radeon->state.scissor.pClipRects =
-		    MALLOC(radeon->state.scissor.numAllocedClipRects *
-			   sizeof(drm_clip_rect_t));
-
-		if (radeon->state.scissor.pClipRects == NULL) {
-			radeon->state.scissor.numAllocedClipRects = 0;
-			return;
-		}
-	}
-
-	out = radeon->state.scissor.pClipRects;
-	radeon->state.scissor.numClipRects = 0;
-
-	for (i = 0; i < radeon->numClipRects; i++) {
-		if (intersect_rect(out,
-				   &radeon->pClipRects[i],
-				   &radeon->state.scissor.rect)) {
-			radeon->state.scissor.numClipRects++;
-			out++;
-		}
-	}
-}
-
-void radeonUpdateScissor(GLcontext* ctx)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-
-	if (radeon->dri.drawable) {
-		__DRIdrawablePrivate *dPriv = radeon->dri.drawable;
-		int x1 = dPriv->x + ctx->Scissor.X;
-		int y1 = dPriv->y + dPriv->h - (ctx->Scissor.Y + ctx->Scissor.Height);
-
-		radeon->state.scissor.rect.x1 = x1;
-		radeon->state.scissor.rect.y1 = y1;
-		radeon->state.scissor.rect.x2 = x1 + ctx->Scissor.Width;
-		radeon->state.scissor.rect.y2 = y1 + ctx->Scissor.Height;
-
-		radeonRecalcScissorRects(radeon);
-	}
-}
-
-static void radeonScissor(GLcontext* ctx, GLint x, GLint y, GLsizei w, GLsizei h)
-{
-	if (ctx->Scissor.Enabled) {
-		/* We don't pipeline cliprect changes */
-		r300Flush(ctx);
-		radeonUpdateScissor(ctx);
-	}
-}
-
-
-/**
- * Update cliprects and scissors.
- */
-void radeonSetCliprects(radeonContextPtr radeon)
-{
-	__DRIdrawablePrivate *const drawable = radeon->dri.drawable;
-	__DRIdrawablePrivate *const readable = radeon->dri.readable;
-	GLframebuffer *const draw_fb = (GLframebuffer*)drawable->driverPrivate;
-	GLframebuffer *const read_fb = (GLframebuffer*)readable->driverPrivate;
-
-	if (draw_fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
-		/* Can't ignore 2d windows if we are page flipping. */
-		if (drawable->numBackClipRects == 0 || radeon->doPageFlip ||
-		    radeon->sarea->pfCurrentPage == 1) {
-			radeon->numClipRects = drawable->numClipRects;
-			radeon->pClipRects = drawable->pClipRects;
-		} else {
-			radeon->numClipRects = drawable->numBackClipRects;
-			radeon->pClipRects = drawable->pBackClipRects;
-		}
-	} else {
-		/* front buffer (or none, or multiple buffers */
-		radeon->numClipRects = drawable->numClipRects;
-		radeon->pClipRects = drawable->pClipRects;
-	}
-
-	if ((draw_fb->Width != drawable->w) ||
-	    (draw_fb->Height != drawable->h)) {
-		_mesa_resize_framebuffer(radeon->glCtx, draw_fb,
-					 drawable->w, drawable->h);
-		draw_fb->Initialized = GL_TRUE;
-	}
-
-	if (drawable != readable) {
-		if ((read_fb->Width != readable->w) ||
-		    (read_fb->Height != readable->h)) {
-			_mesa_resize_framebuffer(radeon->glCtx, read_fb,
-						 readable->w, readable->h);
-			read_fb->Initialized = GL_TRUE;
-		}
-	}
-
-	if (radeon->state.scissor.enabled)
-		radeonRecalcScissorRects(radeon);
-
-	radeon->lastStamp = drawable->lastStamp;
-}
-
-
-/**
- * Handle common enable bits.
- * Called as a fallback by r200Enable/r300Enable.
- */
-void radeonEnable(GLcontext* ctx, GLenum cap, GLboolean state)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-
-	switch(cap) {
-	case GL_SCISSOR_TEST:
-		/* We don't pipeline cliprect & scissor changes */
-		r300Flush(ctx);
-
-		radeon->state.scissor.enabled = state;
-		radeonUpdateScissor(ctx);
-		break;
-
-	default:
-		return;
-	}
-}
-
-
-/**
- * Initialize default state.
- * This function is called once at context init time from
- * r200InitState/r300InitState
- */
-void radeonInitState(radeonContextPtr radeon)
-{
-	radeon->Fallback = 0;
-
-	if (radeon->glCtx->Visual.doubleBufferMode && radeon->sarea->pfCurrentPage == 0) {
-		radeon->state.color.drawOffset = radeon->radeonScreen->backOffset;
-		radeon->state.color.drawPitch = radeon->radeonScreen->backPitch;
-	} else {
-		radeon->state.color.drawOffset = radeon->radeonScreen->frontOffset;
-		radeon->state.color.drawPitch = radeon->radeonScreen->frontPitch;
-	}
-}
-
-
-/**
- * Initialize common state functions.
- * Called by r200InitStateFuncs/r300InitStateFuncs
- */
-void radeonInitStateFuncs(struct dd_function_table *functions)
-{
-	functions->Scissor = radeonScissor;
-}
diff --git a/src/mesa/drivers/dri/r300/radeon_state.h b/src/mesa/drivers/dri/r300/radeon_state.h
deleted file mode 100644
index 821cb40c7e..0000000000
--- a/src/mesa/drivers/dri/r300/radeon_state.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-Copyright (C) 2004 Nicolai Haehnle.  All Rights Reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Nicolai Haehnle <prefect_@gmx.net>
- */
-
-#ifndef __RADEON_STATE_H__
-#define __RADEON_STATE_H__
-
-extern void radeonRecalcScissorRects(radeonContextPtr radeon);
-extern void radeonSetCliprects(radeonContextPtr radeon);
-extern void radeonUpdateScissor(GLcontext* ctx);
-
-extern void radeonEnable(GLcontext* ctx, GLenum cap, GLboolean state);
-
-extern void radeonInitState(radeonContextPtr radeon);
-extern void radeonInitStateFuncs(struct dd_function_table* functions);
-
-#endif