4 files changed, 89 insertions, 59 deletions
diff --git a/src/mesa/drivers/dri/r300/Makefile b/src/mesa/drivers/dri/r300/Makefile
index cfe81760f2..fdd5b732f2 100644
--- a/src/mesa/drivers/dri/r300/Makefile
+++ b/src/mesa/drivers/dri/r300/Makefile
@@ -32,7 +32,6 @@ DRIVER_SOURCES = \
 		 r300_cmdbuf.c \
 		 r300_state.c \
 		 r300_render.c \
-		 r300_lib.c \
 		 r300_texmem.c \
 		 r300_tex.c \
 		 r300_texstate.c \
diff --git a/src/mesa/drivers/dri/r300/r300_emit.h b/src/mesa/drivers/dri/r300/r300_emit.h
index e0c77373ad..6433944e19 100644
--- a/src/mesa/drivers/dri/r300/r300_emit.h
+++ b/src/mesa/drivers/dri/r300/r300_emit.h
@@ -202,4 +202,45 @@ cmd=(drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, \
 cmd[0].i=cmdcpdelay(count);
 }
 
+/* Fire the vertex buffer: emit a 3D_DRAW_VBUF_2 packet followed by one VAP_VF_CNTL word */
+static inline void fire_AOS(PREFIX int vertex_count, int type)
+{
+LOCAL_VARS
+check_space(9);
+
+start_packet3(RADEON_CP_PACKET3_3D_DRAW_VBUF_2, 0);
+	/* control word: vertex-list walk, vertex_count shifted to bit 16 and up, OR'ed with type */
+	e32(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (vertex_count<<16) | type);
+}
+
+/* Indexed draw headers: each of these is followed by the corresponding index data */
+#define start_index32_packet(vertex_count, type) \
+	{\
+	int _vc; /* holds vertex_count so the argument is evaluated exactly once */ \
+	_vc=(vertex_count); \
+	start_packet3(RADEON_CP_PACKET3_3D_DRAW_INDX_2, _vc); \
+		e32(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (_vc<<16) | (type) \
+		    | R300_VAP_VF_CNTL__INDEX_SIZE_32bit); \
+	}
+
+#define start_index16_packet(vertex_count, type) \
+	{\
+	int _vc, _n;\
+	_vc=(vertex_count); /* evaluate the argument exactly once */ \
+	_n=(_vc+1)>>1; /* half the index count, rounded up (presumably two 16-bit indices per dword) */ \
+	start_packet3(RADEON_CP_PACKET3_3D_DRAW_INDX_2, _n); \
+		e32(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (_vc<<16) | (type)); \
+	}
+	
+/* Interestingly enough, this one needs the call to setup_AOS, even though
+   some of the data set up that way is not needed and some is not as arbitrary
+   as when used by DRAW_VBUF_2 or DRAW_INDX_2 */
+#define start_immediate_packet(vertex_count, type, vertex_size) \
+	{\
+	int _vc; /* holds vertex_count so the argument is evaluated exactly once */ \
+	_vc=(vertex_count); \
+	start_packet3(RADEON_CP_PACKET3_3D_DRAW_IMMD_2, _vc*(vertex_size)); \
+	e32(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_EMBEDDED | (_vc<<16) | (type)); \
+	}
+
 #endif
diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c
index 93cef4eb6d..4e33781fc4 100644
--- a/src/mesa/drivers/dri/r300/r300_render.c
+++ b/src/mesa/drivers/dri/r300/r300_render.c
@@ -56,8 +56,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_program.h"
 #include "r300_tex.h"
 
-#include "r300_lib.h"
-
+#include "r300_emit.h"
 
 /**********************************************************************
 *                     Hardware rasterization
@@ -335,64 +334,57 @@ static GLboolean r300_run_immediate_render(GLcontext *ctx,
 /* We use the start part of GART texture buffer for vertices */
 
 
-static void upload_vertex_buffer(r300ContextPtr rmesa, 
-	GLcontext *ctx, AOS_DATA *array, int *n_arrays)
+static void upload_vertex_buffer(r300ContextPtr rmesa, GLcontext *ctx) /* copies vertex arrays into GART texture space and records each offset in rmesa->state.aos[] */
 {
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   struct vertex_buffer *VB = &tnl->vb;
-   int idx=0;
-   int i,j,k;
-   radeonScreenPtr rsp=rmesa->radeon.radeonScreen;
-   
-   /* A hack - we don't want to overwrite vertex buffers, so we
-      just use AGP space for them.. Fix me ! */
-   static int offset=0;
-   if(offset>2*1024*1024){
-   	//fprintf(stderr, "Wrapping agp vertex buffer offset\n");
-   	offset=0;
-	}
-   /* Not the most efficient implementation, but, for now, I just want something that
-      works */
-      /* to do - make single memcpy per column (is it possible ?) */
-      /* to do - use dirty flags to avoid redundant copies */
-#define UPLOAD_VECTOR(v, r, f)\
-	{ \
-	 /* Is the data dirty ? */ \
-	if (v->flags & ((1<<v->size)-1)) { \
-		/* fprintf(stderr, "size=%d vs stride=%d\n", v->size, v->stride); */ \
-		if(v->size*4==v->stride){\
-			/* fast path */  \
-			memcpy(rsp->gartTextures.map+offset, v->data, v->stride*VB->Count); \
-			} else { \
-			for(i=0;i<VB->Count;i++){ \
-				/* copy one vertex at a time*/ \
-				memcpy(rsp->gartTextures.map+offset+i*v->size*4, VEC_ELT(v, GLfloat, i), v->size*4); \
+	TNLcontext *tnl = TNL_CONTEXT(ctx);
+	struct vertex_buffer *VB = &tnl->vb;
+	int idx=0;
+	int i,j,k;
+	radeonScreenPtr rsp=rmesa->radeon.radeonScreen;
+	
+	/* HACK: so that vertex buffers are not overwritten, place them in
+	AGP (GART texture) space at a running static offset. FIXME */
+	static int offset=0;
+	if(offset>2*1024*1024){
+		//fprintf(stderr, "Wrapping agp vertex buffer offset\n");
+		offset=0;
+		}
+	/* Not the most efficient implementation, but, for now, I just want something that
+	works */
+	/* TODO: make single memcpy per column (is it possible ?) */
+	/* TODO: use dirty flags to avoid redundant copies */
+	#define UPLOAD_VECTOR(v) /* copy one vertex array to GART space, store its offset in aos[idx], advance idx */ \
+		{ \
+		/* Copy only if a flag bit for any of the v->size components is set */ \
+		if (v->flags & ((1<<v->size)-1)) { \
+			/* fprintf(stderr, "size=%d vs stride=%d\n", v->size, v->stride); */ \
+			if(v->size*4==v->stride){\
+				/* fast path: stride equals size*4, so one memcpy covers all VB->Count vertices */  \
+				memcpy(rsp->gartTextures.map+offset, v->data, v->stride*VB->Count); \
+				} else { \
+				for(i=0;i<VB->Count;i++){ \
+					/* strided source: copy v->size*4 bytes per vertex into a packed destination */ \
+					memcpy(rsp->gartTextures.map+offset+i*v->size*4, VEC_ELT(v, GLfloat, i), v->size*4); \
+					} \
 				} \
+			/* v->flags &= ~((1<<v->size)-1);*/ \
 			} \
-		/* v->flags &= ~((1<<v->size)-1);*/ \
-		} \
-	array[idx].element_size=v->size; \
-	array[idx].stride=v->size; \
-	array[idx].format=(f); \
-	array[idx].ncomponents=v->size; \
-	array[idx].offset=rsp->gartTextures.handle+offset; \
-	array[idx].reg=r; \
-	offset+=v->size*4*VB->Count; \
-	idx++; \
-	}
-	
-UPLOAD_VECTOR(VB->ObjPtr, REG_COORDS, AOS_FORMAT_FLOAT);
-UPLOAD_VECTOR(VB->ColorPtr[0], REG_COLOR0, AOS_FORMAT_FLOAT_COLOR);
+		rmesa->state.aos[idx].offset=rsp->gartTextures.handle+offset; /* NOTE(review): idx is range-checked only after all uploads */ \
+		offset+=v->size*4*VB->Count; /* advanced whether or not the copy above ran */ \
+		idx++; \
+		}
+		
+	UPLOAD_VECTOR(VB->ObjPtr);
+	UPLOAD_VECTOR(VB->ColorPtr[0]);
 	/* texture coordinates */
 	for(k=0;k < ctx->Const.MaxTextureUnits;k++)
 		if(ctx->Texture.Unit[k].Enabled)
-			UPLOAD_VECTOR(VB->TexCoordPtr[k], REG_TEX0+i, AOS_FORMAT_FLOAT);
+			UPLOAD_VECTOR(VB->TexCoordPtr[k]);
 
-*n_arrays=idx;
-if(idx>=R300_MAX_AOS_ARRAYS){
-	fprintf(stderr, "Aieee ! Maximum AOS arrays count exceeded.. \n");
-	exit(-1);
-	}
+	if(idx>=R300_MAX_AOS_ARRAYS){ /* NOTE(review): runs after the aos[idx] writes above and also trips when idx equals the maximum - confirm intent */
+		fprintf(stderr, "Aieee ! Maximum AOS arrays count exceeded.. \n");
+		exit(-1);
+		}
 }
 
 static void r300_render_vb_primitive(r300ContextPtr rmesa, 
@@ -418,8 +410,7 @@ static GLboolean r300_run_vb_render(GLcontext *ctx,
    r300ContextPtr rmesa = R300_CONTEXT(ctx);
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    struct vertex_buffer *VB = &tnl->vb;
-   int i, j, n_arrays;
-   AOS_DATA vb_arrays[R300_MAX_AOS_ARRAYS];
+   int i, j;
    LOCAL_VARS
 	
 	if (RADEON_DEBUG == DEBUG_PRIMS)
@@ -439,10 +430,8 @@ static GLboolean r300_run_vb_render(GLcontext *ctx,
    /* setup array of structures data */
    LOCK_HARDWARE(&(rmesa->radeon));
 
-   upload_vertex_buffer(rmesa, ctx, vb_arrays, &n_arrays);
+   upload_vertex_buffer(rmesa, ctx);
    //fprintf(stderr, "Using %d AOS arrays\n", n_arrays);
-   for(i=0;i<n_arrays;i++)
-   	rmesa->state.aos[i].offset=vb_arrays[i].offset;
    
    for(i=0; i < VB->PrimitiveCount; i++){
        GLuint prim = VB->Primitive[i].mode;
diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c
index dcaf122238..bf5685aa7d 100644
--- a/src/mesa/drivers/dri/r300/r300_state.c
+++ b/src/mesa/drivers/dri/r300/r300_state.c
@@ -56,6 +56,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_state.h"
 #include "r300_reg.h"
 #include "r300_program.h"
+#include "r300_emit.h"
 #include "r300_fixed_pipelines.h"
 
 static void r300AlphaFunc(GLcontext * ctx, GLenum func, GLfloat ref)