9 files changed, 367 insertions, 159 deletions
diff --git a/src/mesa/drivers/dri/radeon/Makefile b/src/mesa/drivers/dri/radeon/Makefile
index 6bf67d2ea5..8fe9d98a0b 100644
--- a/src/mesa/drivers/dri/radeon/Makefile
+++ b/src/mesa/drivers/dri/radeon/Makefile
@@ -46,7 +46,8 @@ C_SOURCES = \
 	$(DRIVER_SOURCES) \
 	$(CS_SOURCES)
 
-DRIVER_DEFINES = -DRADEON_COMMON=0
+DRIVER_DEFINES = -DRADEON_COMMON=0 \
+				 -Wall
 
 DRI_LIB_DEPS += $(RADEON_LDFLAGS)
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_cmdbuf.h b/src/mesa/drivers/dri/radeon/radeon_cmdbuf.h
index c31421c253..410df4d665 100644
--- a/src/mesa/drivers/dri/radeon/radeon_cmdbuf.h
+++ b/src/mesa/drivers/dri/radeon/radeon_cmdbuf.h
@@ -3,7 +3,7 @@
 
 #include "radeon_bocs_wrapper.h"
 
-void rcommonEnsureCmdBufSpace(radeonContextPtr rmesa, int dwords, const char *caller);
+GLboolean rcommonEnsureCmdBufSpace(radeonContextPtr rmesa, int dwords, const char *caller);
 int rcommonFlushCmdBuf(radeonContextPtr rmesa, const char *caller);
 int rcommonFlushCmdBufLocked(radeonContextPtr rmesa, const char *caller);
 void rcommonInitCmdBuf(radeonContextPtr rmesa);
@@ -71,7 +71,7 @@ void rcommonBeginBatch(radeonContextPtr rmesa,
  */
 #define OUT_BATCH_TABLE(ptr,n) \
 	do { \
-		radeon_cs_write_table(b_l_rmesa->cmdbuf.cs, ptr, n);	\
+		radeon_cs_write_table(b_l_rmesa->cmdbuf.cs, (ptr), (n));\
 	} while(0)
 
 /**
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
index b5b4fed8fa..8f34fbf6bc 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -895,7 +895,7 @@ void radeon_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei he
 	ctx->Driver.Viewport = old_viewport;
 }
 
-static void radeon_print_state_atom(radeonContextPtr radeon, struct radeon_state_atom *state)
+static void radeon_print_state_atom_prekmm(radeonContextPtr radeon, struct radeon_state_atom *state)
 {
 	int i, j, reg;
 	int dwords = (*state->check) (radeon->glCtx, state);
@@ -904,6 +904,9 @@ static void radeon_print_state_atom(radeonContextPtr radeon, struct radeon_state
 	fprintf(stderr, "  emit %s %d/%d\n", state->name, dwords, state->cmd_size);
 
 	if (RADEON_DEBUG & DEBUG_VERBOSE) {
+		if (dwords > state->cmd_size)
+			dwords = state->cmd_size;
+
 		for (i = 0; i < dwords;) {
 			cmd = *((drm_r300_cmd_header_t *) &state->cmd[i]);
 			reg = (cmd.packet0.reghi << 8) | cmd.packet0.reglo;
@@ -920,16 +923,27 @@ static void radeon_print_state_atom(radeonContextPtr radeon, struct radeon_state
 	}
 }
 
-static void radeon_print_state_atom_kmm(radeonContextPtr radeon, struct radeon_state_atom *state)
+static void radeon_print_state_atom(radeonContextPtr radeon, struct radeon_state_atom *state)
 {
 	int i, j, reg, count;
-	int dwords = (*state->check) (radeon->glCtx, state);
+	int dwords;
 	uint32_t packet0;
+	if (! (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) )
+		return;
+
+	if (!radeon->radeonScreen->kernel_mm) {
+		radeon_print_state_atom_prekmm(radeon, state);
+		return;
+	}
+
+	dwords = (*state->check) (radeon->glCtx, state);
 
 	fprintf(stderr, "  emit %s %d/%d\n", state->name, dwords, state->cmd_size);
 
 	if (RADEON_DEBUG & DEBUG_VERBOSE) {
-		for (i = 0; i < dwords;) {
+		if (dwords > state->cmd_size)
+			dwords = state->cmd_size;
+		for (i = 0; i < state->cmd_size;) {
 			packet0 = state->cmd[i];
 			reg = (packet0 & 0x1FFF) << 2;
 			count = ((packet0 & 0x3FFF0000) >> 16) + 1;
@@ -946,40 +960,84 @@ static void radeon_print_state_atom_kmm(radeonContextPtr radeon, struct radeon_s
 	}
 }
 
-static INLINE void radeonEmitAtoms(radeonContextPtr radeon, GLboolean dirty)
+/**
+ * Count total size for next state emit.
+ **/
+GLuint radeonCountStateEmitSize(radeonContextPtr radeon)
 {
-	BATCH_LOCALS(radeon);
 	struct radeon_state_atom *atom;
+	GLuint dwords = 0;
+	/* check if we are going to emit full state */
+	if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_VERBOSE)
+		fprintf(stderr, "%s\n", __func__);
+
+	if (radeon->cmdbuf.cs->cdw && !radeon->hw.all_dirty) {
+		if (!radeon->hw.is_dirty)
+			return dwords;
+		foreach(atom, &radeon->hw.atomlist) {
+			if (atom->dirty) {
+				const GLuint atom_size = atom->check(radeon->glCtx, atom);
+				dwords += atom_size;
+				if (DEBUG_CMDBUF && atom_size) {
+					radeon_print_state_atom(radeon, atom);
+				}
+			}
+		}
+	} else {
+		foreach(atom, &radeon->hw.atomlist) {
+			const GLuint atom_size = atom->check(radeon->glCtx, atom);
+			dwords += atom_size;
+			if (DEBUG_CMDBUF && atom_size) {
+				radeon_print_state_atom(radeon, atom);
+			}
+
+		}
+	}
+	return dwords;
+}
+
+static INLINE void radeon_emit_atom(radeonContextPtr radeon, struct radeon_state_atom *atom)
+{
+	BATCH_LOCALS(radeon);
 	int dwords;
 
+	dwords = (*atom->check) (radeon->glCtx, atom);
+	if (dwords) {
+
+		radeon_print_state_atom(radeon, atom);
+
+		if (atom->emit) {
+			(*atom->emit)(radeon->glCtx, atom);
+		} else {
+			BEGIN_BATCH_NO_AUTOSTATE(dwords);
+			OUT_BATCH_TABLE(atom->cmd, dwords);
+			END_BATCH();
+		}
+	} else {
+		if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
+			fprintf(stderr, "  skip state %s\n",
+					atom->name);
+		}
+	}
+	atom->dirty = GL_FALSE;
+
+}
+
+static INLINE void radeonEmitAtoms(radeonContextPtr radeon, GLboolean emitAll)
+{
+	struct radeon_state_atom *atom;
+
 	if (radeon->vtbl.pre_emit_atoms)
 		radeon->vtbl.pre_emit_atoms(radeon);
 
 	/* Emit actual atoms */
-	foreach(atom, &radeon->hw.atomlist) {
-		if ((atom->dirty || radeon->hw.all_dirty) == dirty) {
-			dwords = (*atom->check) (radeon->glCtx, atom);
-			if (dwords) {
-				if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
-					if (radeon->radeonScreen->kernel_mm)
-						radeon_print_state_atom_kmm(radeon, atom);
-					else
-						radeon_print_state_atom(radeon, atom);
-				}
-				if (atom->emit) {
-					(*atom->emit)(radeon->glCtx, atom);
-				} else {
-					BEGIN_BATCH_NO_AUTOSTATE(dwords);
-					OUT_BATCH_TABLE(atom->cmd, dwords);
-					END_BATCH();
-				}
-				atom->dirty = GL_FALSE;
-			} else {
-				if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
-					fprintf(stderr, "  skip state %s\n",
-						atom->name);
-				}
-			}
+	if (radeon->hw.all_dirty || emitAll) {
+		foreach(atom, &radeon->hw.atomlist)
+			radeon_emit_atom( radeon, atom );
+	} else {
+		foreach(atom, &radeon->hw.atomlist) {
+			if ( atom->dirty )
+				radeon_emit_atom( radeon, atom );
 		}
 	}
 
@@ -1009,26 +1067,21 @@ void radeonEmitState(radeonContextPtr radeon)
 	if (radeon->cmdbuf.cs->cdw && !radeon->hw.is_dirty && !radeon->hw.all_dirty)
 		return;
 
-	/* To avoid going across the entire set of states multiple times, just check
-	 * for enough space for the case of emitting all state, and inline the
-	 * radeonAllocCmdBuf code here without all the checks.
-	 */
-	rcommonEnsureCmdBufSpace(radeon, radeon->hw.max_state_size, __FUNCTION__);
-
 	if (!radeon->cmdbuf.cs->cdw) {
 		if (RADEON_DEBUG & DEBUG_STATE)
 			fprintf(stderr, "Begin reemit state\n");
 
+		radeonEmitAtoms(radeon, GL_TRUE);
+	} else {
+
+		if (RADEON_DEBUG & DEBUG_STATE)
+			fprintf(stderr, "Begin dirty state\n");
+
 		radeonEmitAtoms(radeon, GL_FALSE);
 	}
 
-	if (RADEON_DEBUG & DEBUG_STATE)
-		fprintf(stderr, "Begin dirty state\n");
-
-	radeonEmitAtoms(radeon, GL_TRUE);
 	radeon->hw.is_dirty = GL_FALSE;
 	radeon->hw.all_dirty = GL_FALSE;
-
 }
 
 
@@ -1172,12 +1225,16 @@ int rcommonFlushCmdBuf(radeonContextPtr rmesa, const char *caller)
  *
  * \param dwords The number of dwords we need to be free on the command buffer
  */
-void rcommonEnsureCmdBufSpace(radeonContextPtr rmesa, int dwords, const char *caller)
+GLboolean rcommonEnsureCmdBufSpace(radeonContextPtr rmesa, int dwords, const char *caller)
 {
-	if ((rmesa->cmdbuf.cs->cdw + dwords + 128) > rmesa->cmdbuf.size ||
-	    radeon_cs_need_flush(rmesa->cmdbuf.cs)) {
-		rcommonFlushCmdBuf(rmesa, caller);
-	}
+   if ((rmesa->cmdbuf.cs->cdw + dwords + 128) > rmesa->cmdbuf.size
+	 || radeon_cs_need_flush(rmesa->cmdbuf.cs)) {
+      /* If we try to flush empty buffer there is too big rendering operation. */
+      assert(rmesa->cmdbuf.cs->cdw);
+      rcommonFlushCmdBuf(rmesa, caller);
+      return GL_TRUE;
+   }
+   return GL_FALSE;
 }
 
 void rcommonInitCmdBuf(radeonContextPtr rmesa)
@@ -1252,7 +1309,6 @@ void rcommonBeginBatch(radeonContextPtr rmesa, int n,
 		       const char *function,
 		       int line)
 {
-	rcommonEnsureCmdBufSpace(rmesa, n, function);
 	if (!rmesa->cmdbuf.cs->cdw && dostate) {
 		if (RADEON_DEBUG & DEBUG_IOCTL)
 			fprintf(stderr, "Reemit state after flush (from %s)\n", function);
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.h b/src/mesa/drivers/dri/radeon/radeon_common.h
index cebae18b2d..a9e1ca49eb 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.h
+++ b/src/mesa/drivers/dri/radeon/radeon_common.h
@@ -24,6 +24,7 @@ void radeonUpdatePageFlipping(radeonContextPtr rmesa);
 void radeonFlush(GLcontext *ctx);
 void radeonFinish(GLcontext * ctx);
 void radeonEmitState(radeonContextPtr radeon);
+GLuint radeonCountStateEmitSize(radeonContextPtr radeon);
 
 void radeon_clear_tris(GLcontext *ctx, GLbitfield mask);
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_dma.c b/src/mesa/drivers/dri/radeon/radeon_dma.c
index a1835427f1..f306befec4 100644
--- a/src/mesa/drivers/dri/radeon/radeon_dma.c
+++ b/src/mesa/drivers/dri/radeon/radeon_dma.c
@@ -438,7 +438,7 @@ restart:
         if (!rmesa->dma.flush) {
 		/* make sure we have enough space to use this in cmdbuf */
    		rcommonEnsureCmdBufSpace(rmesa,
-			      rmesa->hw.max_state_size + (20*sizeof(int)),
+			      radeonCountStateEmitSize( rmesa ) + (20*sizeof(int)),
 			      __FUNCTION__);
 		/* if cmdbuf flushed DMA restart */
 		if (is_empty_list(&rmesa->dma.reserved))
diff --git a/src/mesa/drivers/dri/radeon/radeon_ioctl.h b/src/mesa/drivers/dri/radeon/radeon_ioctl.h
index 18805d4c57..deb53ae313 100644
--- a/src/mesa/drivers/dri/radeon/radeon_ioctl.h
+++ b/src/mesa/drivers/dri/radeon/radeon_ioctl.h
@@ -132,16 +132,18 @@ static INLINE int RADEON_DB_STATECHANGE(r100ContextPtr rmesa,
  * r200EmitState is called from within r200EmitVbufPrim and r200FlushElts.
  */
 #if RADEON_OLD_PACKETS
-#define AOS_BUFSZ(nr)	((3 + ((nr / 2) * 3) + ((nr & 1) * 2)) * sizeof(int))
+#define AOS_BUFSZ(nr)	((3 + ((nr / 2) * 3) + ((nr & 1) * 2))+nr*2)
 #define VERT_AOS_BUFSZ	(0)
 #define ELTS_BUFSZ(nr)	(24 + nr * 2)
-#define VBUF_BUFSZ	(6 * sizeof(int))
+#define VBUF_BUFSZ	(8)
 #else
-#define AOS_BUFSZ(nr)	((3 + ((nr / 2) * 3) + ((nr & 1) * 2)) * sizeof(int))
-#define VERT_AOS_BUFSZ	(5 * sizeof(int))
+#define AOS_BUFSZ(nr)	((3 + ((nr / 2) * 3) + ((nr & 1) * 2) + nr*2))
+#define VERT_AOS_BUFSZ	(5)
 #define ELTS_BUFSZ(nr)	(16 + nr * 2)
-#define VBUF_BUFSZ	(4 * sizeof(int))
+#define VBUF_BUFSZ	(4)
 #endif
+#define SCISSOR_BUFSZ	(8)
+#define INDEX_BUFSZ	(7)
 
 
 static inline uint32_t cmdpacket3(int cmd_type)
diff --git a/src/mesa/drivers/dri/radeon/radeon_state_init.c b/src/mesa/drivers/dri/radeon/radeon_state_init.c
index 501ea0b66b..b34a4e803d 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state_init.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state_init.c
@@ -197,55 +197,85 @@ static int cmdscl( int offset, int stride, int count )
    return h.i;
 }
 
-#define CHECK( NM, FLAG )				\
+#define CHECK( NM, FLAG, ADD )				\
 static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom )	\
 {							\
-   return FLAG ? atom->cmd_size : 0;			\
+   return FLAG ? atom->cmd_size + (ADD) : 0;			\
 }
 
-#define TCL_CHECK( NM, FLAG )				\
+#define TCL_CHECK( NM, FLAG, ADD )				\
 static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom )	\
 {							\
    r100ContextPtr rmesa = R100_CONTEXT(ctx);	\
-   return (!rmesa->radeon.TclFallback && (FLAG)) ? atom->cmd_size : 0;	\
+   return (!rmesa->radeon.TclFallback && (FLAG)) ? atom->cmd_size + (ADD) : 0;	\
 }
 
 
-CHECK( always, GL_TRUE )
-CHECK( never, GL_FALSE )
-CHECK( tex0, ctx->Texture.Unit[0]._ReallyEnabled )
-CHECK( tex1, ctx->Texture.Unit[1]._ReallyEnabled )
+CHECK( always, GL_TRUE, 0 )
+CHECK( always_add2, GL_TRUE, 2 )
+CHECK( always_add4, GL_TRUE, 4 )
+CHECK( never, GL_FALSE, 0 )
+CHECK( tex0_mm, ctx->Texture.Unit[0]._ReallyEnabled, 3 )
+CHECK( tex1_mm, ctx->Texture.Unit[1]._ReallyEnabled, 3 )
 /* need this for the cubic_map on disabled unit 2 bug, maybe r100 only? */
-CHECK( tex2, ctx->Texture._EnabledUnits )
-CHECK( cube0, (ctx->Texture.Unit[0]._ReallyEnabled & TEXTURE_CUBE_BIT))
-CHECK( cube1, (ctx->Texture.Unit[1]._ReallyEnabled & TEXTURE_CUBE_BIT))
-CHECK( cube2, (ctx->Texture.Unit[2]._ReallyEnabled & TEXTURE_CUBE_BIT))
-CHECK( fog, ctx->Fog.Enabled )
-TCL_CHECK( tcl, GL_TRUE )
-TCL_CHECK( tcl_tex0, ctx->Texture.Unit[0]._ReallyEnabled )
-TCL_CHECK( tcl_tex1, ctx->Texture.Unit[1]._ReallyEnabled )
-TCL_CHECK( tcl_tex2, ctx->Texture.Unit[2]._ReallyEnabled )
-TCL_CHECK( tcl_lighting, ctx->Light.Enabled )
-TCL_CHECK( tcl_eyespace_or_lighting, ctx->_NeedEyeCoords || ctx->Light.Enabled )
-TCL_CHECK( tcl_lit0, ctx->Light.Enabled && ctx->Light.Light[0].Enabled )
-TCL_CHECK( tcl_lit1, ctx->Light.Enabled && ctx->Light.Light[1].Enabled )
-TCL_CHECK( tcl_lit2, ctx->Light.Enabled && ctx->Light.Light[2].Enabled )
-TCL_CHECK( tcl_lit3, ctx->Light.Enabled && ctx->Light.Light[3].Enabled )
-TCL_CHECK( tcl_lit4, ctx->Light.Enabled && ctx->Light.Light[4].Enabled )
-TCL_CHECK( tcl_lit5, ctx->Light.Enabled && ctx->Light.Light[5].Enabled )
-TCL_CHECK( tcl_lit6, ctx->Light.Enabled && ctx->Light.Light[6].Enabled )
-TCL_CHECK( tcl_lit7, ctx->Light.Enabled && ctx->Light.Light[7].Enabled )
-TCL_CHECK( tcl_ucp0, (ctx->Transform.ClipPlanesEnabled & 0x1) )
-TCL_CHECK( tcl_ucp1, (ctx->Transform.ClipPlanesEnabled & 0x2) )
-TCL_CHECK( tcl_ucp2, (ctx->Transform.ClipPlanesEnabled & 0x4) )
-TCL_CHECK( tcl_ucp3, (ctx->Transform.ClipPlanesEnabled & 0x8) )
-TCL_CHECK( tcl_ucp4, (ctx->Transform.ClipPlanesEnabled & 0x10) )
-TCL_CHECK( tcl_ucp5, (ctx->Transform.ClipPlanesEnabled & 0x20) )
-TCL_CHECK( tcl_eyespace_or_fog, ctx->_NeedEyeCoords || ctx->Fog.Enabled ) 
-
-CHECK( txr0, (ctx->Texture.Unit[0]._ReallyEnabled & TEXTURE_RECT_BIT))
-CHECK( txr1, (ctx->Texture.Unit[1]._ReallyEnabled & TEXTURE_RECT_BIT))
-CHECK( txr2, (ctx->Texture.Unit[2]._ReallyEnabled & TEXTURE_RECT_BIT))
+CHECK( tex2_mm, ctx->Texture._EnabledUnits, 3 )
+CHECK( tex0, ctx->Texture.Unit[0]._ReallyEnabled, 2 )
+CHECK( tex1, ctx->Texture.Unit[1]._ReallyEnabled, 2 )
+CHECK( tex2, ctx->Texture._EnabledUnits, 2 )
+CHECK( cube0, (ctx->Texture.Unit[0]._ReallyEnabled & TEXTURE_CUBE_BIT), 3 + 3*5 - CUBE_STATE_SIZE )
+CHECK( cube1, (ctx->Texture.Unit[1]._ReallyEnabled & TEXTURE_CUBE_BIT), 3 + 3*5 - CUBE_STATE_SIZE )
+CHECK( cube2, (ctx->Texture.Unit[2]._ReallyEnabled & TEXTURE_CUBE_BIT), 3 + 3*5 - CUBE_STATE_SIZE )
+CHECK( cube0_mm, (ctx->Texture.Unit[0]._ReallyEnabled & TEXTURE_CUBE_BIT), 2 + 4*5 - CUBE_STATE_SIZE )
+CHECK( cube1_mm, (ctx->Texture.Unit[1]._ReallyEnabled & TEXTURE_CUBE_BIT), 2 + 4*5 - CUBE_STATE_SIZE )
+CHECK( cube2_mm, (ctx->Texture.Unit[2]._ReallyEnabled & TEXTURE_CUBE_BIT), 2 + 4*5 - CUBE_STATE_SIZE )
+CHECK( fog, ctx->Fog.Enabled, 0 )
+CHECK( fog_add4, ctx->Fog.Enabled, 4 )
+TCL_CHECK( tcl, GL_TRUE, 0 )
+TCL_CHECK( tcl_add4, GL_TRUE, 4 )
+TCL_CHECK( tcl_tex0, ctx->Texture.Unit[0]._ReallyEnabled, 0 )
+TCL_CHECK( tcl_tex1, ctx->Texture.Unit[1]._ReallyEnabled, 0 )
+TCL_CHECK( tcl_tex2, ctx->Texture.Unit[2]._ReallyEnabled, 0 )
+TCL_CHECK( tcl_tex0_add4, ctx->Texture.Unit[0]._ReallyEnabled, 4 )
+TCL_CHECK( tcl_tex1_add4, ctx->Texture.Unit[1]._ReallyEnabled, 4 )
+TCL_CHECK( tcl_tex2_add4, ctx->Texture.Unit[2]._ReallyEnabled, 4 )
+TCL_CHECK( tcl_lighting, ctx->Light.Enabled, 0 )
+TCL_CHECK( tcl_lighting_add4, ctx->Light.Enabled, 4 )
+TCL_CHECK( tcl_eyespace_or_lighting, ctx->_NeedEyeCoords || ctx->Light.Enabled, 0 )
+TCL_CHECK( tcl_eyespace_or_lighting_add4, ctx->_NeedEyeCoords || ctx->Light.Enabled, 4 )
+TCL_CHECK( tcl_lit0, ctx->Light.Enabled && ctx->Light.Light[0].Enabled, 0 )
+TCL_CHECK( tcl_lit1, ctx->Light.Enabled && ctx->Light.Light[1].Enabled, 0 )
+TCL_CHECK( tcl_lit2, ctx->Light.Enabled && ctx->Light.Light[2].Enabled, 0 )
+TCL_CHECK( tcl_lit3, ctx->Light.Enabled && ctx->Light.Light[3].Enabled, 0 )
+TCL_CHECK( tcl_lit4, ctx->Light.Enabled && ctx->Light.Light[4].Enabled, 0 )
+TCL_CHECK( tcl_lit5, ctx->Light.Enabled && ctx->Light.Light[5].Enabled, 0 )
+TCL_CHECK( tcl_lit6, ctx->Light.Enabled && ctx->Light.Light[6].Enabled, 0 )
+TCL_CHECK( tcl_lit7, ctx->Light.Enabled && ctx->Light.Light[7].Enabled, 0 )
+TCL_CHECK( tcl_lit0_add6, ctx->Light.Enabled && ctx->Light.Light[0].Enabled, 6 )
+TCL_CHECK( tcl_lit1_add6, ctx->Light.Enabled && ctx->Light.Light[1].Enabled, 6 )
+TCL_CHECK( tcl_lit2_add6, ctx->Light.Enabled && ctx->Light.Light[2].Enabled, 6 )
+TCL_CHECK( tcl_lit3_add6, ctx->Light.Enabled && ctx->Light.Light[3].Enabled, 6 )
+TCL_CHECK( tcl_lit4_add6, ctx->Light.Enabled && ctx->Light.Light[4].Enabled, 6 )
+TCL_CHECK( tcl_lit5_add6, ctx->Light.Enabled && ctx->Light.Light[5].Enabled, 6 )
+TCL_CHECK( tcl_lit6_add6, ctx->Light.Enabled && ctx->Light.Light[6].Enabled, 6 )
+TCL_CHECK( tcl_lit7_add6, ctx->Light.Enabled && ctx->Light.Light[7].Enabled, 6 )
+TCL_CHECK( tcl_ucp0, (ctx->Transform.ClipPlanesEnabled & 0x1), 0 )
+TCL_CHECK( tcl_ucp1, (ctx->Transform.ClipPlanesEnabled & 0x2), 0 )
+TCL_CHECK( tcl_ucp2, (ctx->Transform.ClipPlanesEnabled & 0x4), 0 )
+TCL_CHECK( tcl_ucp3, (ctx->Transform.ClipPlanesEnabled & 0x8), 0 )
+TCL_CHECK( tcl_ucp4, (ctx->Transform.ClipPlanesEnabled & 0x10), 0 )
+TCL_CHECK( tcl_ucp5, (ctx->Transform.ClipPlanesEnabled & 0x20), 0 )
+TCL_CHECK( tcl_ucp0_add4, (ctx->Transform.ClipPlanesEnabled & 0x1), 4 )
+TCL_CHECK( tcl_ucp1_add4, (ctx->Transform.ClipPlanesEnabled & 0x2), 4 )
+TCL_CHECK( tcl_ucp2_add4, (ctx->Transform.ClipPlanesEnabled & 0x4), 4 )
+TCL_CHECK( tcl_ucp3_add4, (ctx->Transform.ClipPlanesEnabled & 0x8), 4 )
+TCL_CHECK( tcl_ucp4_add4, (ctx->Transform.ClipPlanesEnabled & 0x10), 4 )
+TCL_CHECK( tcl_ucp5_add4, (ctx->Transform.ClipPlanesEnabled & 0x20), 4 )
+TCL_CHECK( tcl_eyespace_or_fog, ctx->_NeedEyeCoords || ctx->Fog.Enabled, 0 )
+TCL_CHECK( tcl_eyespace_or_fog_add4, ctx->_NeedEyeCoords || ctx->Fog.Enabled, 4 )
+
+CHECK( txr0, (ctx->Texture.Unit[0]._ReallyEnabled & TEXTURE_RECT_BIT), 0 )
+CHECK( txr1, (ctx->Texture.Unit[1]._ReallyEnabled & TEXTURE_RECT_BIT), 0 )
+CHECK( txr2, (ctx->Texture.Unit[2]._ReallyEnabled & TEXTURE_RECT_BIT), 0 )
 
 #define OUT_VEC(hdr, data) do {			\
     drm_radeon_cmd_header_t h;					\
@@ -271,9 +301,8 @@ static void scl_emit(GLcontext *ctx, struct radeon_state_atom *atom)
 {
    r100ContextPtr r100 = R100_CONTEXT(ctx);
    BATCH_LOCALS(&r100->radeon);
-   uint32_t dwords = atom->cmd_size;
+   uint32_t dwords = atom->check(ctx, atom);
    
-   dwords += 2;
    BEGIN_BATCH_NO_AUTOSTATE(dwords);
    OUT_SCL(atom->cmd[0], atom->cmd+1);
    END_BATCH();
@@ -284,9 +313,8 @@ static void vec_emit(GLcontext *ctx, struct radeon_state_atom *atom)
 {
    r100ContextPtr r100 = R100_CONTEXT(ctx);
    BATCH_LOCALS(&r100->radeon);
-   uint32_t dwords = atom->cmd_size;
+   uint32_t dwords = atom->check(ctx, atom);
 
-   dwords += 4;
    BEGIN_BATCH_NO_AUTOSTATE(dwords);
    OUT_VEC(atom->cmd[0], atom->cmd+1);
    END_BATCH();
@@ -297,9 +325,8 @@ static void lit_emit(GLcontext *ctx, struct radeon_state_atom *atom)
 {
    r100ContextPtr r100 = R100_CONTEXT(ctx);
    BATCH_LOCALS(&r100->radeon);
-   uint32_t dwords = atom->cmd_size;
+   uint32_t dwords = atom->check(ctx, atom);
 
-   dwords += 6;
    BEGIN_BATCH_NO_AUTOSTATE(dwords);
    OUT_VEC(atom->cmd[LIT_CMD_0], atom->cmd+1);
    OUT_SCL(atom->cmd[LIT_CMD_1], atom->cmd+LIT_CMD_1+1);
@@ -313,10 +340,10 @@ static void ctx_emit(GLcontext *ctx, struct radeon_state_atom *atom)
    struct radeon_renderbuffer *rrb;
    uint32_t cbpitch;
    uint32_t zbpitch, depth_fmt;
-   uint32_t dwords = atom->cmd_size;
+   uint32_t dwords = atom->check(ctx, atom);
 
    /* output the first 7 bytes of context */
-   BEGIN_BATCH_NO_AUTOSTATE(dwords + 4);
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
    OUT_BATCH_TABLE(atom->cmd, 5);
 
    rrb = radeon_get_depthbuffer(&r100->radeon);
@@ -371,6 +398,28 @@ static void ctx_emit(GLcontext *ctx, struct radeon_state_atom *atom)
    END_BATCH();
 }
 
+static int check_always_ctx( GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   struct radeon_renderbuffer *rrb, *drb;
+   uint32_t dwords;
+
+   rrb = radeon_get_colorbuffer(&r100->radeon);
+   if (!rrb || !rrb->bo) {
+      return 0;
+   }
+
+   drb = radeon_get_depthbuffer(&r100->radeon);
+
+   dwords = 10;
+   if (drb)
+     dwords += 6;
+   if (rrb)
+     dwords += 8;
+
+   return dwords;
+}
+
 static void ctx_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
 {
    r100ContextPtr r100 = R100_CONTEXT(ctx);
@@ -378,7 +427,7 @@ static void ctx_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
    struct radeon_renderbuffer *rrb, *drb;
    uint32_t cbpitch = 0;
    uint32_t zbpitch = 0;
-   uint32_t dwords = atom->cmd_size;
+   uint32_t dwords = atom->check(ctx, atom);
    uint32_t depth_fmt;
 
    rrb = radeon_get_colorbuffer(&r100->radeon);
@@ -418,12 +467,6 @@ static void ctx_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
      
    }
 
-   /* output the first 7 bytes of context */
-   dwords = 10;
-   if (drb)
-     dwords += 6;
-   if (rrb)
-     dwords += 8;
    BEGIN_BATCH_NO_AUTOSTATE(dwords);
 
    /* In the CS case we need to split this up */
@@ -474,7 +517,7 @@ static void cube_emit(GLcontext *ctx, struct radeon_state_atom *atom)
 {
    r100ContextPtr r100 = R100_CONTEXT(ctx);
    BATCH_LOCALS(&r100->radeon);
-   uint32_t dwords = 3;
+   uint32_t dwords = atom->check(ctx, atom);
    int i = atom->idx, j;
    radeonTexObj *t = r100->state.texture.unit[i].texobj;
    radeon_mipmap_level *lvl;
@@ -488,7 +531,7 @@ static void cube_emit(GLcontext *ctx, struct radeon_state_atom *atom)
    if (!t->mt)
 	return;
 
-   BEGIN_BATCH_NO_AUTOSTATE(dwords + (5 * 3));
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
    OUT_BATCH_TABLE(atom->cmd, 3);
    lvl = &t->mt->levels[0];
    for (j = 0; j < 5; j++) {
@@ -502,7 +545,7 @@ static void cube_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
 {
    r100ContextPtr r100 = R100_CONTEXT(ctx);
    BATCH_LOCALS(&r100->radeon);
-   uint32_t dwords = 2;
+   uint32_t dwords = atom->check(ctx, atom);
    int i = atom->idx, j;
    radeonTexObj *t = r100->state.texture.unit[i].texobj;
    radeon_mipmap_level *lvl;
@@ -523,7 +566,7 @@ static void cube_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
 	default:
 	case 0: base_reg = RADEON_PP_CUBIC_OFFSET_T0_0; break;
    };
-   BEGIN_BATCH_NO_AUTOSTATE(dwords + (5 * 4));
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
    OUT_BATCH_TABLE(atom->cmd, 2);
    lvl = &t->mt->levels[0];
    for (j = 0; j < 5; j++) {
@@ -665,10 +708,11 @@ void radeonInitState( r100ContextPtr rmesa )
 
    /* Allocate state buffers:
     */
-   ALLOC_STATE( ctx, always, CTX_STATE_SIZE, "CTX/context", 0 );
-   if (rmesa->radeon.radeonScreen->kernel_mm)
+   ALLOC_STATE( ctx, always_add4, CTX_STATE_SIZE, "CTX/context", 0 );
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
      rmesa->hw.ctx.emit = ctx_emit_cs;
-   else
+     rmesa->hw.ctx.check = check_always_ctx;
+   } else
      rmesa->hw.ctx.emit = ctx_emit;
    ALLOC_STATE( lin, always, LIN_STATE_SIZE, "LIN/line", 0 );
    ALLOC_STATE( msk, always, MSK_STATE_SIZE, "MSK/mask", 0 );
@@ -678,13 +722,63 @@ void radeonInitState( r100ContextPtr rmesa )
    ALLOC_STATE( zbs, always, ZBS_STATE_SIZE, "ZBS/zbias", 0 );
    ALLOC_STATE( tcl, always, TCL_STATE_SIZE, "TCL/tcl", 1 );
    ALLOC_STATE( mtl, tcl_lighting, MTL_STATE_SIZE, "MTL/material", 1 );
-   ALLOC_STATE( grd, always, GRD_STATE_SIZE, "GRD/guard-band", 1 );
-   ALLOC_STATE( fog, fog, FOG_STATE_SIZE, "FOG/fog", 1 );
-   ALLOC_STATE( glt, tcl_lighting, GLT_STATE_SIZE, "GLT/light-global", 1 );
-   ALLOC_STATE( eye, tcl_lighting, EYE_STATE_SIZE, "EYE/eye-vector", 1 );
-   ALLOC_STATE_IDX( tex[0], tex0, TEX_STATE_SIZE, "TEX/tex-0", 0, 0);
-   ALLOC_STATE_IDX( tex[1], tex1, TEX_STATE_SIZE, "TEX/tex-1", 0, 1);
-   ALLOC_STATE_IDX( tex[2], tex2, TEX_STATE_SIZE, "TEX/tex-2", 0, 2);
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+      ALLOC_STATE( grd, always_add2, GRD_STATE_SIZE, "GRD/guard-band", 1 );
+      ALLOC_STATE( fog, fog_add4, FOG_STATE_SIZE, "FOG/fog", 1 );
+      ALLOC_STATE( glt, tcl_lighting_add4, GLT_STATE_SIZE, "GLT/light-global", 1 );
+      ALLOC_STATE( eye, tcl_lighting_add4, EYE_STATE_SIZE, "EYE/eye-vector", 1 );
+      ALLOC_STATE_IDX( tex[0], tex0_mm, TEX_STATE_SIZE, "TEX/tex-0", 0, 0);
+      ALLOC_STATE_IDX( tex[1], tex1_mm, TEX_STATE_SIZE, "TEX/tex-1", 0, 1);
+      ALLOC_STATE_IDX( tex[2], tex2_mm, TEX_STATE_SIZE, "TEX/tex-2", 0, 2);
+      ALLOC_STATE( mat[0], tcl_add4, MAT_STATE_SIZE, "MAT/modelproject", 1 );
+      ALLOC_STATE( mat[1], tcl_eyespace_or_fog_add4, MAT_STATE_SIZE, "MAT/modelview", 1 );
+      ALLOC_STATE( mat[2], tcl_eyespace_or_lighting_add4, MAT_STATE_SIZE, "MAT/it-modelview", 1 );
+      ALLOC_STATE( mat[3], tcl_tex0_add4, MAT_STATE_SIZE, "MAT/texmat0", 1 );
+      ALLOC_STATE( mat[4], tcl_tex1_add4, MAT_STATE_SIZE, "MAT/texmat1", 1 );
+      ALLOC_STATE( mat[5], tcl_tex2_add4, MAT_STATE_SIZE, "MAT/texmat2", 1 );
+      ALLOC_STATE( lit[0], tcl_lit0_add6, LIT_STATE_SIZE, "LIT/light-0", 1 );
+      ALLOC_STATE( lit[1], tcl_lit1_add6, LIT_STATE_SIZE, "LIT/light-1", 1 );
+      ALLOC_STATE( lit[2], tcl_lit2_add6, LIT_STATE_SIZE, "LIT/light-2", 1 );
+      ALLOC_STATE( lit[3], tcl_lit3_add6, LIT_STATE_SIZE, "LIT/light-3", 1 );
+      ALLOC_STATE( lit[4], tcl_lit4_add6, LIT_STATE_SIZE, "LIT/light-4", 1 );
+      ALLOC_STATE( lit[5], tcl_lit5_add6, LIT_STATE_SIZE, "LIT/light-5", 1 );
+      ALLOC_STATE( lit[6], tcl_lit6_add6, LIT_STATE_SIZE, "LIT/light-6", 1 );
+      ALLOC_STATE( lit[7], tcl_lit7_add6, LIT_STATE_SIZE, "LIT/light-7", 1 );
+      ALLOC_STATE( ucp[0], tcl_ucp0_add4, UCP_STATE_SIZE, "UCP/userclip-0", 1 );
+      ALLOC_STATE( ucp[1], tcl_ucp1_add4, UCP_STATE_SIZE, "UCP/userclip-1", 1 );
+      ALLOC_STATE( ucp[2], tcl_ucp2_add4, UCP_STATE_SIZE, "UCP/userclip-2", 1 );
+      ALLOC_STATE( ucp[3], tcl_ucp3_add4, UCP_STATE_SIZE, "UCP/userclip-3", 1 );
+      ALLOC_STATE( ucp[4], tcl_ucp4_add4, UCP_STATE_SIZE, "UCP/userclip-4", 1 );
+      ALLOC_STATE( ucp[5], tcl_ucp5_add4, UCP_STATE_SIZE, "UCP/userclip-5", 1 );
+   } else {
+      ALLOC_STATE( grd, always, GRD_STATE_SIZE, "GRD/guard-band", 1 );
+      ALLOC_STATE( fog, fog, FOG_STATE_SIZE, "FOG/fog", 1 );
+      ALLOC_STATE( glt, tcl_lighting, GLT_STATE_SIZE, "GLT/light-global", 1 );
+      ALLOC_STATE( eye, tcl_lighting, EYE_STATE_SIZE, "EYE/eye-vector", 1 );
+      ALLOC_STATE_IDX( tex[0], tex0, TEX_STATE_SIZE, "TEX/tex-0", 0, 0);
+      ALLOC_STATE_IDX( tex[1], tex1, TEX_STATE_SIZE, "TEX/tex-1", 0, 1);
+      ALLOC_STATE_IDX( tex[2], tex2, TEX_STATE_SIZE, "TEX/tex-2", 0, 2);
+      ALLOC_STATE( mat[0], tcl, MAT_STATE_SIZE, "MAT/modelproject", 1 );
+      ALLOC_STATE( mat[1], tcl_eyespace_or_fog, MAT_STATE_SIZE, "MAT/modelview", 1 );
+      ALLOC_STATE( mat[2], tcl_eyespace_or_lighting, MAT_STATE_SIZE, "MAT/it-modelview", 1 );
+      ALLOC_STATE( mat[3], tcl_tex0, MAT_STATE_SIZE, "MAT/texmat0", 1 );
+      ALLOC_STATE( mat[4], tcl_tex1, MAT_STATE_SIZE, "MAT/texmat1", 1 );
+      ALLOC_STATE( mat[5], tcl_tex2, MAT_STATE_SIZE, "MAT/texmat2", 1 );
+      ALLOC_STATE( lit[0], tcl_lit0, LIT_STATE_SIZE, "LIT/light-0", 1 );
+      ALLOC_STATE( lit[1], tcl_lit1, LIT_STATE_SIZE, "LIT/light-1", 1 );
+      ALLOC_STATE( lit[2], tcl_lit2, LIT_STATE_SIZE, "LIT/light-2", 1 );
+      ALLOC_STATE( lit[3], tcl_lit3, LIT_STATE_SIZE, "LIT/light-3", 1 );
+      ALLOC_STATE( lit[4], tcl_lit4, LIT_STATE_SIZE, "LIT/light-4", 1 );
+      ALLOC_STATE( lit[5], tcl_lit5, LIT_STATE_SIZE, "LIT/light-5", 1 );
+      ALLOC_STATE( lit[6], tcl_lit6, LIT_STATE_SIZE, "LIT/light-6", 1 );
+      ALLOC_STATE( lit[7], tcl_lit7, LIT_STATE_SIZE, "LIT/light-7", 1 );
+      ALLOC_STATE( ucp[0], tcl_ucp0, UCP_STATE_SIZE, "UCP/userclip-0", 1 );
+      ALLOC_STATE( ucp[1], tcl_ucp1, UCP_STATE_SIZE, "UCP/userclip-1", 1 );
+      ALLOC_STATE( ucp[2], tcl_ucp2, UCP_STATE_SIZE, "UCP/userclip-2", 1 );
+      ALLOC_STATE( ucp[3], tcl_ucp3, UCP_STATE_SIZE, "UCP/userclip-3", 1 );
+      ALLOC_STATE( ucp[4], tcl_ucp4, UCP_STATE_SIZE, "UCP/userclip-4", 1 );
+      ALLOC_STATE( ucp[5], tcl_ucp5, UCP_STATE_SIZE, "UCP/userclip-5", 1 );
+   }
 
    for (i = 0; i < 3; i++) {
       if (rmesa->radeon.radeonScreen->kernel_mm)
@@ -694,14 +788,19 @@ void radeonInitState( r100ContextPtr rmesa )
    }
    if (rmesa->radeon.radeonScreen->drmSupportsCubeMapsR100)
    {
-      ALLOC_STATE_IDX( cube[0], cube0, CUBE_STATE_SIZE, "CUBE/cube-0", 0, 0 );
-      ALLOC_STATE_IDX( cube[1], cube1, CUBE_STATE_SIZE, "CUBE/cube-1", 0, 1 );
-      ALLOC_STATE_IDX( cube[2], cube2, CUBE_STATE_SIZE, "CUBE/cube-2", 0, 2 );
-      for (i = 0; i < 3; i++)
-          if (rmesa->radeon.radeonScreen->kernel_mm)
-              rmesa->hw.cube[i].emit = cube_emit_cs;
-          else
-              rmesa->hw.cube[i].emit = cube_emit;
+      if (rmesa->radeon.radeonScreen->kernel_mm) {
+         ALLOC_STATE_IDX( cube[0], cube0_mm, CUBE_STATE_SIZE, "CUBE/cube-0", 0, 0 );
+         ALLOC_STATE_IDX( cube[1], cube1_mm, CUBE_STATE_SIZE, "CUBE/cube-1", 0, 1 );
+         ALLOC_STATE_IDX( cube[2], cube2_mm, CUBE_STATE_SIZE, "CUBE/cube-2", 0, 2 );
+         for (i = 0; i < 3; i++)
+            rmesa->hw.cube[i].emit = cube_emit_cs;
+      } else {
+         ALLOC_STATE_IDX( cube[0], cube0, CUBE_STATE_SIZE, "CUBE/cube-0", 0, 0 );
+         ALLOC_STATE_IDX( cube[1], cube1, CUBE_STATE_SIZE, "CUBE/cube-1", 0, 1 );
+         ALLOC_STATE_IDX( cube[2], cube2, CUBE_STATE_SIZE, "CUBE/cube-2", 0, 2 );
+         for (i = 0; i < 3; i++)
+            rmesa->hw.cube[i].emit = cube_emit;
+      }
    }
    else
    {
@@ -709,26 +808,6 @@ void radeonInitState( r100ContextPtr rmesa )
       ALLOC_STATE_IDX( cube[1], never, CUBE_STATE_SIZE, "CUBE/cube-1", 0, 1 );
       ALLOC_STATE_IDX( cube[2], never, CUBE_STATE_SIZE, "CUBE/cube-2", 0, 2 );
    }
-   ALLOC_STATE( mat[0], tcl, MAT_STATE_SIZE, "MAT/modelproject", 1 );
-   ALLOC_STATE( mat[1], tcl_eyespace_or_fog, MAT_STATE_SIZE, "MAT/modelview", 1 );
-   ALLOC_STATE( mat[2], tcl_eyespace_or_lighting, MAT_STATE_SIZE, "MAT/it-modelview", 1 );
-   ALLOC_STATE( mat[3], tcl_tex0, MAT_STATE_SIZE, "MAT/texmat0", 1 );
-   ALLOC_STATE( mat[4], tcl_tex1, MAT_STATE_SIZE, "MAT/texmat1", 1 );
-   ALLOC_STATE( mat[5], tcl_tex2, MAT_STATE_SIZE, "MAT/texmat2", 1 );
-   ALLOC_STATE( ucp[0], tcl_ucp0, UCP_STATE_SIZE, "UCP/userclip-0", 1 );
-   ALLOC_STATE( ucp[1], tcl_ucp1, UCP_STATE_SIZE, "UCP/userclip-1", 1 );
-   ALLOC_STATE( ucp[2], tcl_ucp2, UCP_STATE_SIZE, "UCP/userclip-2", 1 );
-   ALLOC_STATE( ucp[3], tcl_ucp3, UCP_STATE_SIZE, "UCP/userclip-3", 1 );
-   ALLOC_STATE( ucp[4], tcl_ucp4, UCP_STATE_SIZE, "UCP/userclip-4", 1 );
-   ALLOC_STATE( ucp[5], tcl_ucp5, UCP_STATE_SIZE, "UCP/userclip-5", 1 );
-   ALLOC_STATE( lit[0], tcl_lit0, LIT_STATE_SIZE, "LIT/light-0", 1 );
-   ALLOC_STATE( lit[1], tcl_lit1, LIT_STATE_SIZE, "LIT/light-1", 1 );
-   ALLOC_STATE( lit[2], tcl_lit2, LIT_STATE_SIZE, "LIT/light-2", 1 );
-   ALLOC_STATE( lit[3], tcl_lit3, LIT_STATE_SIZE, "LIT/light-3", 1 );
-   ALLOC_STATE( lit[4], tcl_lit4, LIT_STATE_SIZE, "LIT/light-4", 1 );
-   ALLOC_STATE( lit[5], tcl_lit5, LIT_STATE_SIZE, "LIT/light-5", 1 );
-   ALLOC_STATE( lit[6], tcl_lit6, LIT_STATE_SIZE, "LIT/light-6", 1 );
-   ALLOC_STATE( lit[7], tcl_lit7, LIT_STATE_SIZE, "LIT/light-7", 1 );
    ALLOC_STATE_IDX( txr[0], txr0, TXR_STATE_SIZE, "TXR/txr-0", 0, 0 );
    ALLOC_STATE_IDX( txr[1], txr1, TXR_STATE_SIZE, "TXR/txr-1", 0, 1 );
    ALLOC_STATE_IDX( txr[2], txr2, TXR_STATE_SIZE, "TXR/txr-2", 0, 2 );
diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.c b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
index 58b3be9391..20ce6c470b 100644
--- a/src/mesa/drivers/dri/radeon/radeon_swtcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
@@ -285,7 +285,7 @@ void r100_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
    r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    rcommonEnsureCmdBufSpace(&rmesa->radeon,
-			    rmesa->radeon.hw.max_state_size + (12*sizeof(int)),
+			    radeonCountStateEmitSize( &rmesa->radeon ) + (12*sizeof(int)),
 			    __FUNCTION__);
 
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_tcl.c b/src/mesa/drivers/dri/radeon/radeon_tcl.c
index df6708f05e..2404f28450 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tcl.c
@@ -50,6 +50,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_tcl.h"
 #include "radeon_swtcl.h"
 #include "radeon_maos.h"
+#include "radeon_common_context.h"
 
 
 
@@ -149,9 +150,6 @@ static GLushort *radeonAllocElts( r100ContextPtr rmesa, GLuint nr )
       if (rmesa->radeon.dma.flush)
 	 rmesa->radeon.dma.flush( rmesa->radeon.glCtx );
 
-      rcommonEnsureCmdBufSpace(&rmesa->radeon, rmesa->radeon.hw.max_state_size + ELTS_BUFSZ(nr) + 
-			       AOS_BUFSZ(rmesa->radeon.tcl.aos_count), __FUNCTION__);
-
       radeonEmitAOS( rmesa,
 		     rmesa->radeon.tcl.aos_count, 0 );
 
@@ -176,10 +174,6 @@ static void radeonEmitPrim( GLcontext *ctx,
    r100ContextPtr rmesa = R100_CONTEXT( ctx );
    radeonTclPrimitive( ctx, prim, hwprim );
    
-   rcommonEnsureCmdBufSpace( &rmesa->radeon,
-			     AOS_BUFSZ(rmesa->radeon.tcl.aos_count) +
-			     rmesa->radeon.hw.max_state_size + VBUF_BUFSZ, __FUNCTION__ );
-
    radeonEmitAOS( rmesa,
 		  rmesa->radeon.tcl.aos_count,
 		  start );
@@ -196,6 +190,8 @@ static void radeonEmitPrim( GLcontext *ctx,
    radeonEmitPrim( ctx, prim, hwprim, start, count );           \
    (void) rmesa; } while (0)
 
+#define MAX_CONVERSION_SIZE 40
+
 /* Try & join small primitives
  */
 #if 0
@@ -360,6 +356,73 @@ radeonComputeFogBlendFactor( GLcontext *ctx, GLfloat fogcoord )
    }
 }
 
+/**
+ * Predict total emit size for next rendering operation so there is no flush in middle of rendering
+ * Prediction has to aim towards the best possible value that is worse than worst case scenario
+ */
+static GLuint radeonEnsureEmitSize( GLcontext * ctx , GLuint inputs )
+{
+  r100ContextPtr rmesa = R100_CONTEXT(ctx);
+  TNLcontext *tnl = TNL_CONTEXT(ctx);
+  struct vertex_buffer *VB = &tnl->vb;
+  GLuint space_required;
+  GLuint state_size;
+  GLuint nr_aos = 1; /* radeonEmitArrays does always emit one */
+  int i;
+  /* list of flags that are allocating aos object */
+  const GLuint flags_to_check[] = {
+    VERT_BIT_NORMAL,
+    VERT_BIT_COLOR0,
+    VERT_BIT_COLOR1,
+    VERT_BIT_FOG
+  };
+  /* predict number of aos to emit */
+  for (i=0; i < sizeof(flags_to_check)/sizeof(flags_to_check[0]); ++i)
+  {
+    if (inputs & flags_to_check[i])
+      ++nr_aos;
+  }
+  for (i = 0; i < ctx->Const.MaxTextureUnits; ++i)
+  {
+    if (inputs & VERT_BIT_TEX(i))
+      ++nr_aos;
+  }
+
+  {
+    /* count the prediction for state size */
+    space_required = 0;
+    state_size = radeonCountStateEmitSize( &rmesa->radeon );
+    /* tcl may be changed in radeonEmitArrays so account for it if not dirty */
+    if (!rmesa->hw.tcl.dirty)
+      state_size += rmesa->hw.tcl.check( rmesa->radeon.glCtx, &rmesa->hw.tcl );
+    /* predict size for elements */
+    for (i = 0; i < VB->PrimitiveCount; ++i)
+    {
+      if (!VB->Primitive[i].count)
+	continue;
+      /* If primitive.count is less than MAX_CONVERSION_SIZE
+	 rendering code may decide convert to elts.
+	 In that case we have to make pessimistic prediction.
+	 and use larger of 2 paths. */
+      const GLuint elts = ELTS_BUFSZ(nr_aos);
+      const GLuint index = INDEX_BUFSZ;
+      const GLuint vbuf = VBUF_BUFSZ;
+      if ( (!VB->Elts && VB->Primitive[i].count >= MAX_CONVERSION_SIZE)
+	  || vbuf > index + elts)
+	space_required += vbuf;
+      else
+	space_required += index + elts;
+      space_required += AOS_BUFSZ(nr_aos);
+    }
+    space_required += SCISSOR_BUFSZ;
+  }
+  /* flush the buffer in case we need more than is left. */
+  if (rcommonEnsureCmdBufSpace(&rmesa->radeon, space_required, __FUNCTION__))
+    return space_required + radeonCountStateEmitSize( &rmesa->radeon );
+  else
+    return space_required + state_size;
+}
+
 /**********************************************************************/
 /*                          Render pipeline stage                     */
 /**********************************************************************/
@@ -410,6 +473,8 @@ static GLboolean radeon_run_tcl_render( GLcontext *ctx,
    }
 
    radeonReleaseArrays( ctx, ~0 );
+   GLuint emit_end = radeonEnsureEmitSize( ctx, inputs )
+     + rmesa->radeon.cmdbuf.cs->cdw;
    radeonEmitArrays( ctx, inputs );
 
    rmesa->tcl.Elts = VB->Elts;
@@ -429,6 +494,10 @@ static GLboolean radeon_run_tcl_render( GLcontext *ctx,
 	 radeonEmitPrimitive( ctx, start, start+length, prim );
    }
 
+   if (emit_end < rmesa->radeon.cmdbuf.cs->cdw)
+      WARN_ONCE("Rendering was %d commands larger than predicted size."
+	  " We might overflow  command buffer.\n", rmesa->radeon.cmdbuf.cs->cdw - emit_end);
+
    return GL_FALSE;		/* finished the pipe */
 }