146 files changed, 9805 insertions, 1575 deletions
diff --git a/.gitignore b/.gitignore
index 033e6e10bd..b5e59dfc3e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,6 @@ depend
 depend.bak
 lib
 lib64
+.sconsign*
+config.py
+build
diff --git a/SConstruct b/SConstruct
new file mode 100644
index 0000000000..22a4072c93
--- /dev/null
+++ b/SConstruct
@@ -0,0 +1,228 @@
+#######################################################################
+# Top-level SConstruct
+
+import os
+import os.path
+import sys
+
+
+#######################################################################
+# Configuration options
+#
+# For example, invoke scons as 
+#
+#   scons debug=1 dri=0 machine=x86
+#
+# to set configuration variables. Or you can write those options to a file
+# named config.py:
+#
+#   # config.py
+#   debug=1
+#   dri=0
+#   machine='x86'
+# 
+# Invoke
+#
+#   scons -h
+#
+# to get the full list of options. See scons manpage for more info.
+#  
+
+# TODO: auto-detect defaults
+opts = Options('config.py')
+opts.Add(BoolOption('debug', 'build debug version', False))
+opts.Add(BoolOption('dri', 'build dri drivers', False))
+opts.Add(EnumOption('machine', 'use machine-specific assembly code', 'x86',
+                     allowed_values=('generic', 'x86', 'x86-64')))
+
+env = Environment(
+	options = opts, 
+	ENV = os.environ)
+Help(opts.GenerateHelpText(env))
+
+# for debugging
+#print env.Dump()
+
+if 0:
+	# platform will be typically 'posix' or 'win32' 
+	platform = env['PLATFORM']
+else:
+	# platform will be one of 'linux', 'freebsd', 'win32', 'darwin', etc.
+	platform = sys.platform
+	if platform == 'linux2':
+		platform = 'linux' 
+
+# replicate options values in local variables
+debug = env['debug']
+dri = env['dri']
+machine = env['machine']
+
+# derived options
+x86 = machine == 'x86'
+gcc = platform in ('posix', 'linux', 'freebsd', 'darwin')
+msvc = platform == 'win32'
+
+Export([
+	'debug', 
+	'x86', 
+	'dri', 
+	'platform',
+	'gcc',
+	'msvc',
+])
+
+
+#######################################################################
+# Environment setup
+#
+# TODO: put the compiler specific settings in seperate files
+# TODO: auto-detect as much as possible
+
+         
+# Optimization flags
+if gcc:
+	if debug:
+		env.Append(CFLAGS = '-O0 -g3')
+		env.Append(CXXFLAGS = '-O0 -g3')
+	else:
+		env.Append(CFLAGS = '-O3 -g3')
+		env.Append(CXXFLAGS = '-O3 -g3')
+
+	env.Append(CFLAGS = '-Wall -Wmissing-prototypes -std=c99 -ffast-math -pedantic')
+	env.Append(CXXFLAGS = '-Wall -pedantic')
+	
+	# Be nice to Eclipse
+	env.Append(CFLAGS = '-fmessage-length=0')
+	env.Append(CXXFLAGS = '-fmessage-length=0')
+
+
+# Defines
+if debug:
+	env.Append(CPPDEFINES = ['DEBUG'])
+else:
+	env.Append(CPPDEFINES = ['NDEBUG'])
+
+
+# Includes
+env.Append(CPPPATH = [
+	'#/include',
+	'#/src/mesa',
+	'#/src/mesa/main',
+	'#/src/mesa/pipe',
+])
+
+
+# x86 assembly
+if x86:
+	env.Append(CPPDEFINES = [
+		'USE_X86_ASM', 
+		'USE_MMX_ASM',
+		'USE_3DNOW_ASM',
+		'USE_SSE_ASM',
+	])
+	if gcc:	
+		env.Append(CFLAGS = '-m32')
+		env.Append(CXXFLAGS = '-m32')
+
+
+# Posix
+if platform in ('posix', 'linux', 'freebsd', 'darwin'):
+	env.Append(CPPDEFINES = [
+		'_POSIX_SOURCE',
+		('_POSIX_C_SOURCE', '199309L'), 
+		'_SVID_SOURCE',
+		'_BSD_SOURCE', 
+		'_GNU_SOURCE',
+		
+		'PTHREADS',
+		'HAVE_POSIX_MEMALIGN',
+	])
+	env.Append(CPPPATH = ['/usr/X11R6/include'])
+	env.Append(LIBPATH = ['/usr/X11R6/lib'])
+	env.Append(LIBS = [
+		'm',
+		'pthread',
+		'expat',
+		'dl',
+	])
+
+
+# DRI
+if dri:
+	env.ParseConfig('pkg-config --cflags --libs libdrm')
+	env.Append(CPPDEFINES = [
+		('USE_EXTERNAL_DXTN_LIB', '1'), 
+		'IN_DRI_DRIVER',
+		'GLX_DIRECT_RENDERING',
+		'GLX_INDIRECT_RENDERING',
+	])
+
+# libGL
+if 1:
+	env.Append(LIBS = [
+		'X11',
+		'Xext',
+		'Xxf86vm',
+		'Xdamage',
+		'Xfixes',
+	])
+
+Export('env')
+
+
+#######################################################################
+# Convenience Library Builder
+# based on the stock StaticLibrary and SharedLibrary builders
+
+def createConvenienceLibBuilder(env):
+    """This is a utility function that creates the ConvenienceLibrary
+    Builder in an Environment if it is not there already.
+
+    If it is already there, we return the existing one.
+    """
+
+    try:
+        convenience_lib = env['BUILDERS']['ConvenienceLibrary']
+    except KeyError:
+        action_list = [ Action("$ARCOM", "$ARCOMSTR") ]
+        if env.Detect('ranlib'):
+            ranlib_action = Action("$RANLIBCOM", "$RANLIBCOMSTR")
+            action_list.append(ranlib_action)
+
+        convenience_lib = Builder(action = action_list,
+                                  emitter = '$LIBEMITTER',
+                                  prefix = '$LIBPREFIX',
+                                  suffix = '$LIBSUFFIX',
+                                  src_suffix = '$SHOBJSUFFIX',
+                                  src_builder = 'SharedObject')
+        env['BUILDERS']['ConvenienceLibrary'] = convenience_lib
+        env['BUILDERS']['Library'] = convenience_lib
+
+    return convenience_lib
+
+createConvenienceLibBuilder(env)
+
+
+#######################################################################
+# Invoke SConscripts
+
+# Put build output in a separate dir, which depends on the current configuration
+# See also http://www.scons.org/wiki/AdvancedBuildExample
+build_topdir = 'build'
+build_subdir = platform
+if dri:
+	build_subdir += "-dri"
+if x86:
+	build_subdir += "-x86"
+if debug:
+	build_subdir += "-debug"
+build_dir = os.path.join(build_topdir, build_subdir)
+
+# TODO: Build several variants at the same time?
+# http://www.scons.org/wiki/SimultaneousVariantBuilds
+
+SConscript(
+	'src/mesa/SConscript',
+	build_dir = build_dir,
+	duplicate = 0 # http://www.scons.org/doc/0.97/HTML/scons-user/x2261.html
+)
diff --git a/configs/linux-cell b/configs/linux-cell
index 4f0086cc1f..3d874491e4 100644
--- a/configs/linux-cell
+++ b/configs/linux-cell
@@ -10,11 +10,13 @@ CC = ppu32-gcc
 CXX = ppu32-g++
 HOST_CC = gcc
 
+OPT_FLAGS = -g
+
 # Cell SDK location
 SDK = /opt/ibm/cell-sdk/prototype/sysroot/usr
 
 
-CFLAGS = -g -Wall -Winline -fPIC -m32 -mabi=altivec -maltivec -I. -I$(SDK)/include -DGALLIUM_CELL
+CFLAGS = $(OPT_FLAGS) -Wall -Winline -fPIC -m32 -mabi=altivec -maltivec -I. -I$(SDK)/include -DGALLIUM_CELL
 
 CXXFLAGS = $(CFLAGS)
 
@@ -34,7 +36,7 @@ GL_LIB_DEPS = $(EXTRA_LIB_PATH) -lX11 -lXext -lm -lpthread \
 
 SPU_CC = spu-gcc
 
-SPU_CFLAGS = -g -W -Wall -Winline -Wmissing-prototypes -Wno-main -I. -I $(SDK)/spu/include -include spu_intrinsics.h -I $(TOP)/src/mesa/
+SPU_CFLAGS = $(OPT_FLAGS) -W -Wall -Winline -Wmissing-prototypes -Wno-main -I. -I $(SDK)/spu/include -include spu_intrinsics.h -I $(TOP)/src/mesa/
 
 SPU_LFLAGS = -L$(SDK)/spu/lib -Wl,-N -lmisc
 
diff --git a/progs/demos/gears.c b/progs/demos/gears.c
index ab9bc00742..2a9fefefb5 100644
--- a/progs/demos/gears.c
+++ b/progs/demos/gears.c
@@ -27,6 +27,9 @@ static GLint T0 = 0;
 static GLint Frames = 0;
 static GLint autoexit = 0;
 static GLint win = 0;
+static GLboolean Visible = GL_TRUE;
+static GLboolean Animate = GL_TRUE;
+static GLfloat viewDist = 40.0;
 
 
 /**
@@ -179,6 +182,9 @@ draw(void)
   glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
 
   glPushMatrix();
+
+    glTranslatef(0.0, 0.0, -viewDist);
+
     glRotatef(view_rotx, 1.0, 0.0, 0.0);
     glRotatef(view_roty, 0.0, 1.0, 0.0);
     glRotatef(view_rotz, 0.0, 0.0, 1.0);
@@ -240,6 +246,15 @@ idle(void)
   glutPostRedisplay();
 }
 
+static void
+update_idle_func(void)
+{
+  if (Visible && Animate)
+    glutIdleFunc(idle);
+  else
+    glutIdleFunc(NULL);
+}
+
 /* change view angle, exit upon ESC */
 /* ARGSUSED1 */
 static void
@@ -252,6 +267,16 @@ key(unsigned char k, int x, int y)
   case 'Z':
     view_rotz -= 5.0;
     break;
+  case 'd':
+     viewDist += 1.0;
+     break;
+  case 'D':
+     viewDist -= 1.0;
+     break;
+  case 'a':
+     Animate = !Animate;
+     update_idle_func();
+     break;
   case 27:  /* Escape */
     cleanup();
     exit(0);
@@ -295,10 +320,8 @@ reshape(int width, int height)
   glViewport(0, 0, (GLint) width, (GLint) height);
   glMatrixMode(GL_PROJECTION);
   glLoadIdentity();
-  glFrustum(-1.0, 1.0, -h, h, 5.0, 60.0);
+  glFrustum(-1.0, 1.0, -h, h, 5.0, 200.0);
   glMatrixMode(GL_MODELVIEW);
-  glLoadIdentity();
-  glTranslatef(0.0, 0.0, -40.0);
 }
 
 static void
@@ -351,13 +374,12 @@ init(int argc, char *argv[])
   }
 }
 
+
 static void 
 visible(int vis)
 {
-  if (vis == GLUT_VISIBLE)
-    glutIdleFunc(idle);
-  else
-    glutIdleFunc(NULL);
+   Visible = vis;
+   update_idle_func();
 }
 
 int main(int argc, char *argv[])
@@ -375,6 +397,7 @@ int main(int argc, char *argv[])
   glutKeyboardFunc(key);
   glutSpecialFunc(special);
   glutVisibilityFunc(visible);
+  update_idle_func();
 
   glutMainLoop();
   return 0;             /* ANSI C requires main to return int. */
diff --git a/src/mesa/Makefile b/src/mesa/Makefile
index b16d74bf49..720f1b2e02 100644
--- a/src/mesa/Makefile
+++ b/src/mesa/Makefile
@@ -125,24 +125,25 @@ osmesa-only: depend subdirs $(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME)
 # Make the GL library
 $(TOP)/$(LIB_DIR)/$(GL_LIB_NAME): $(STAND_ALONE_OBJECTS) $(PIPE_LIB) $(CELL_LIB) $(CELL_LIB_SPU) $(LLVM_LIB)
 	@ $(TOP)/bin/mklib -o $(GL_LIB) \
-		-linker $(CC) \
+		-linker "$(CC)" \
 		-major $(GL_MAJOR) -minor $(GL_MINOR) -patch $(GL_TINY) \
 		-install $(TOP)/$(LIB_DIR) \
 		$(MKLIB_OPTIONS) $(STAND_ALONE_OBJECTS) \
 		$(PIPE_LIB) $(CELL_LIB) $(CELL_LIB_SPU) $(LLVM_LIB) $(GL_LIB_DEPS)
 
 # Make the OSMesa library
-$(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME): $(OSMESA_DRIVER_OBJECTS) $(OSMESA16_OBJECTS)
+$(TOP)/$(LIB_DIR)/$(OSMESA_LIB_NAME): $(OSMESA_DRIVER_OBJECTS) \
+		$(OSMESA16_OBJECTS) $(TOP)/$(LIB_DIR)/$(GL_LIB_NAME)
 	@ if [ "${DRIVER_DIRS}" = "osmesa" ] ; then \
 		$(TOP)/bin/mklib -o $(OSMESA_LIB) \
-			-linker $(CC) \
+			-linker "$(CC)" \
 			-major $(MESA_MAJOR) \
 			-minor $(MESA_MINOR) -patch $(MESA_TINY) \
 			-install $(TOP)/$(LIB_DIR) $(MKLIB_OPTIONS) \
 			$(OSMESA_LIB_DEPS) $(OSMESA16_OBJECTS) ; \
 	else \
 		$(TOP)/bin/mklib -o $(OSMESA_LIB) \
-			-linker $(CC) \
+			-linker "$(CC)" \
 			-major $(MESA_MAJOR) \
 			-minor $(MESA_MINOR) -patch $(GL_TINY) \
 			-install $(TOP)/$(LIB_DIR) $(MKLIB_OPTIONS) \
diff --git a/src/mesa/SConscript b/src/mesa/SConscript
new file mode 100644
index 0000000000..faf8c84872
--- /dev/null
+++ b/src/mesa/SConscript
@@ -0,0 +1,436 @@
+#######################################################################
+# SConscript for mesa
+#
+# TODO: Split this into per-module SConscripts 
+
+
+Import('*')
+
+
+#######################################################################
+# Core sources
+
+MAIN_SOURCES = [
+	'main/api_arrayelt.c',
+	'main/api_loopback.c',
+	'main/api_noop.c',
+	'main/api_validate.c',
+	'main/accum.c',
+	'main/attrib.c',
+	'main/arrayobj.c',
+	'main/blend.c',
+	'main/bufferobj.c',
+	'main/buffers.c',
+	'main/clip.c',
+	'main/colortab.c',
+	'main/context.c',
+	'main/convolve.c',
+	'main/debug.c',
+	'main/depth.c',
+	'main/depthstencil.c',
+	'main/dlist.c',
+	'main/drawpix.c',
+	'main/enable.c',
+	'main/enums.c',
+	'main/eval.c',
+	'main/execmem.c',
+	'main/extensions.c',
+	'main/fbobject.c',
+	'main/feedback.c',
+	'main/ffvertex_prog.c',
+	'main/fog.c',
+	'main/framebuffer.c',
+	'main/get.c',
+	'main/getstring.c',
+	'main/hash.c',
+	'main/hint.c',
+	'main/histogram.c',
+	'main/image.c',
+	'main/imports.c',
+	'main/light.c',
+	'main/lines.c',
+	'main/matrix.c',
+	'main/mipmap.c',
+	'main/mm.c',
+	'main/pixel.c',
+	'main/points.c',
+	'main/polygon.c',
+	'main/queryobj.c',
+	'main/rastpos.c',
+	'main/rbadaptors.c',
+	'main/renderbuffer.c',
+	'main/shaders.c',
+	'main/state.c',
+	'main/stencil.c',
+	'main/texcompress.c',
+	'main/texcompress_s3tc.c',
+	'main/texcompress_fxt1.c',
+	'main/texenvprogram.c',
+	'main/texformat.c',
+	'main/teximage.c',
+	'main/texobj.c',
+	'main/texrender.c',
+	'main/texstate.c',
+	'main/texstore.c',
+	'main/varray.c',
+	'main/vtxfmt.c',
+]
+
+GLAPI_SOURCES = [
+	'main/dispatch.c',
+	'glapi/glapi.c',
+	'glapi/glthread.c',
+]
+
+MATH_SOURCES = [
+	'math/m_debug_clip.c',
+	'math/m_debug_norm.c',
+	'math/m_debug_xform.c',
+	'math/m_eval.c',
+	'math/m_matrix.c',
+	'math/m_translate.c',
+	'math/m_vector.c',
+	'math/m_xform.c',
+]
+
+VBO_SOURCES = [
+	'vbo/vbo_context.c',
+	'vbo/vbo_exec.c',
+	'vbo/vbo_exec_api.c',
+	'vbo/vbo_exec_array.c',
+	'vbo/vbo_exec_draw.c',
+	'vbo/vbo_exec_eval.c',
+	'vbo/vbo_rebase.c',
+	'vbo/vbo_split.c',
+	'vbo/vbo_split_copy.c',
+	'vbo/vbo_split_inplace.c',
+	'vbo/vbo_save.c',
+	'vbo/vbo_save_api.c',
+	'vbo/vbo_save_draw.c',
+	'vbo/vbo_save_loopback.c',
+]
+
+VF_SOURCES = [
+	'vf/vf.c',
+	'vf/vf_generic.c',
+	'vf/vf_sse.c',
+]
+
+DRAW_SOURCES = [
+	'pipe/draw/draw_clip.c',
+	'pipe/draw/draw_context.c',
+	'pipe/draw/draw_cull.c',
+	'pipe/draw/draw_debug.c',
+	'pipe/draw/draw_flatshade.c',
+	'pipe/draw/draw_offset.c',
+	'pipe/draw/draw_prim.c',
+	'pipe/draw/draw_stipple.c',
+	'pipe/draw/draw_twoside.c',
+	'pipe/draw/draw_unfilled.c',
+	'pipe/draw/draw_validate.c',
+	'pipe/draw/draw_vbuf.c',
+	'pipe/draw/draw_vertex.c',
+	'pipe/draw/draw_vertex_cache.c',
+	'pipe/draw/draw_vertex_fetch.c',
+	'pipe/draw/draw_vertex_shader.c',
+	'pipe/draw/draw_vertex_shader_llvm.c',
+	'pipe/draw/draw_vf.c',
+	'pipe/draw/draw_vf_generic.c',
+	'pipe/draw/draw_vf_sse.c',
+	'pipe/draw/draw_wide_prims.c',
+]
+
+TGSIEXEC_SOURCES = [
+	'pipe/tgsi/exec/tgsi_exec.c',
+	'pipe/tgsi/exec/tgsi_sse2.c',
+]
+
+TGSIUTIL_SOURCES = [
+	'pipe/tgsi/util/tgsi_build.c',
+	'pipe/tgsi/util/tgsi_dump.c',
+	'pipe/tgsi/util/tgsi_parse.c',
+	'pipe/tgsi/util/tgsi_util.c',
+]
+
+STATECACHE_SOURCES = [
+	'pipe/cso_cache/cso_hash.c',
+	'pipe/cso_cache/cso_cache.c',
+]
+
+PIPEUTIL_SOURCES = [
+	'pipe/util/p_debug.c',
+	'pipe/util/p_tile.c',
+	'pipe/util/p_util.c',
+]
+
+STATETRACKER_SOURCES = [
+	'state_tracker/st_atom.c',
+	'state_tracker/st_atom_blend.c',
+	'state_tracker/st_atom_clip.c',
+	'state_tracker/st_atom_constbuf.c',
+	'state_tracker/st_atom_depth.c',
+	'state_tracker/st_atom_framebuffer.c',
+	'state_tracker/st_atom_pixeltransfer.c',
+	'state_tracker/st_atom_sampler.c',
+	'state_tracker/st_atom_scissor.c',
+	'state_tracker/st_atom_shader.c',
+	'state_tracker/st_atom_rasterizer.c',
+	'state_tracker/st_atom_stipple.c',
+	'state_tracker/st_atom_texture.c',
+	'state_tracker/st_atom_viewport.c',
+	'state_tracker/st_cb_accum.c',
+	'state_tracker/st_cb_bufferobjects.c',
+	'state_tracker/st_cb_clear.c',
+	'state_tracker/st_cb_flush.c',
+	'state_tracker/st_cb_drawpixels.c',
+	'state_tracker/st_cb_fbo.c',
+	'state_tracker/st_cb_feedback.c',
+	'state_tracker/st_cb_program.c',
+	'state_tracker/st_cb_queryobj.c',
+	'state_tracker/st_cb_rasterpos.c',
+	'state_tracker/st_cb_readpixels.c',
+	'state_tracker/st_cb_strings.c',
+	'state_tracker/st_cb_texture.c',
+	'state_tracker/st_cache.c',
+	'state_tracker/st_context.c',
+	'state_tracker/st_debug.c',
+	'state_tracker/st_draw.c',
+	'state_tracker/st_extensions.c',
+	'state_tracker/st_format.c',
+	'state_tracker/st_framebuffer.c',
+	'state_tracker/st_mesa_to_tgsi.c',
+	'state_tracker/st_program.c',
+	'state_tracker/st_texture.c',
+]
+
+SHADER_SOURCES = [
+	'shader/arbprogparse.c',
+	'shader/arbprogram.c',
+	'shader/atifragshader.c',
+	'shader/grammar/grammar_mesa.c',
+	'shader/nvfragparse.c',
+	'shader/nvprogram.c',
+	'shader/nvvertparse.c',
+	'shader/program.c',
+	'shader/prog_cache.c',
+	'shader/prog_debug.c',
+	'shader/prog_execute.c',
+	'shader/prog_instruction.c',
+	'shader/prog_parameter.c',
+	'shader/prog_print.c',
+	'shader/prog_statevars.c',
+	'shader/programopt.c',
+	'shader/shader_api.c',
+]
+
+SLANG_SOURCES = [
+	'shader/slang/slang_builtin.c',
+	'shader/slang/slang_codegen.c',
+	'shader/slang/slang_compile.c',
+	'shader/slang/slang_compile_function.c',
+	'shader/slang/slang_compile_operation.c',
+	'shader/slang/slang_compile_struct.c',
+	'shader/slang/slang_compile_variable.c',
+	'shader/slang/slang_emit.c',
+	'shader/slang/slang_ir.c',
+	'shader/slang/slang_label.c',
+	'shader/slang/slang_library_noise.c',
+	'shader/slang/slang_link.c',
+	'shader/slang/slang_log.c',
+	'shader/slang/slang_mem.c',
+	'shader/slang/slang_preprocess.c',
+	'shader/slang/slang_print.c',
+	'shader/slang/slang_simplify.c',
+	'shader/slang/slang_storage.c',
+	'shader/slang/slang_typeinfo.c',
+	'shader/slang/slang_vartable.c',
+	'shader/slang/slang_utility.c',
+]
+
+
+#######################################################################
+# Assembly sources
+
+ASM_C_SOURCES = [
+	'x86/common_x86.c',
+	'x86/x86.c',
+	'x86/3dnow.c',
+	'x86/sse.c',
+	'x86/rtasm/x86sse.c',
+	'sparc/sparc.c',
+	'ppc/common_ppc.c',
+	'x86-64/x86-64.c',
+]
+
+X86_SOURCES = [
+	'x86/common_x86_asm.S',
+	'x86/x86_xform2.S',
+	'x86/x86_xform3.S',
+	'x86/x86_xform4.S',
+	'x86/x86_cliptest.S',
+	'x86/mmx_blend.S',
+	'x86/3dnow_xform1.S',
+	'x86/3dnow_xform2.S',
+	'x86/3dnow_xform3.S',
+	'x86/3dnow_xform4.S',
+	'x86/3dnow_normal.S',
+	'x86/sse_xform1.S',
+	'x86/sse_xform2.S',
+	'x86/sse_xform3.S',
+	'x86/sse_xform4.S',
+	'x86/sse_normal.S',
+	'x86/read_rgba_span_x86.S',
+]
+
+X86_API = [
+	'x86/glapi_x86.S',
+]
+
+X86_64_SOURCES = [
+	'x86-64/xform4.S',
+]
+
+X86_64_API = [
+	'x86-64/glapi_x86-64.S',
+]
+
+SPARC_SOURCES = [
+	'sparc/clip.S',
+	'sparc/norm.S',
+	'sparc/xform.S',
+]
+
+SPARC_API = [
+	'sparc/glapi_sparc.S',
+]
+
+if x86:
+	ASM_SOURCES = ASM_C_SOURCES + X86_SOURCES 
+	API_SOURCES = X86_API
+else:
+	ASM_SOURCES = []
+	API_SOURCES = []
+
+
+#######################################################################
+# Driver sources
+
+
+X11_DRIVER_SOURCES = [
+	'pipe/xlib/glxapi.c',
+	'pipe/xlib/fakeglx.c',
+	'pipe/xlib/xfonts.c',
+	'pipe/xlib/xm_api.c',
+	'pipe/xlib/xm_winsys.c',
+	'pipe/xlib/xm_winsys_aub.c',
+	'pipe/xlib/brw_aub.c',
+]
+
+OSMESA_DRIVER_SOURCES = [
+	'drivers/osmesa/osmesa.c',
+]
+
+GLIDE_DRIVER_SOURCES = [
+	'drivers/glide/fxapi.c',
+	'drivers/glide/fxdd.c',
+	'drivers/glide/fxddspan.c',
+	'drivers/glide/fxddtex.c',
+	'drivers/glide/fxsetup.c',
+	'drivers/glide/fxtexman.c',
+	'drivers/glide/fxtris.c',
+	'drivers/glide/fxvb.c',
+	'drivers/glide/fxglidew.c',
+	'drivers/glide/fxg.c',
+]
+
+SVGA_DRIVER_SOURCES = [
+	'drivers/svga/svgamesa.c',
+	'drivers/svga/svgamesa8.c',
+	'drivers/svga/svgamesa15.c',
+	'drivers/svga/svgamesa16.c',
+	'drivers/svga/svgamesa24.c',
+	'drivers/svga/svgamesa32.c',
+]
+
+FBDEV_DRIVER_SOURCES = [
+	'drivers/fbdev/glfbdev.c',
+]
+
+
+### All the core C sources
+
+SOLO_SOURCES = \
+	MAIN_SOURCES + \
+	MATH_SOURCES + \
+	VBO_SOURCES + \
+	VF_SOURCES + \
+	DRAW_SOURCES + \
+	TGSIEXEC_SOURCES + \
+	TGSIUTIL_SOURCES + \
+	PIPEUTIL_SOURCES + \
+	STATECACHE_SOURCES + \
+	STATETRACKER_SOURCES + \
+	SHADER_SOURCES + \
+	ASM_SOURCES + \
+	SLANG_SOURCES
+
+CORE_SOURCES = \
+	GLAPI_SOURCES + API_SOURCES + \
+	SOLO_SOURCES
+
+ALL_SOURCES = \
+	GLAPI_SOURCES + API_SOURCES + \
+	SOLO_SOURCES + \
+	ASM_SOURCES + \
+	X11_DRIVER_SOURCES + \
+	FBDEV_DRIVER_SOURCES + \
+	OSMESA_DRIVER_SOURCES
+
+
+######################################################################
+# Gallium sources
+
+SConscript([
+	'pipe/SConscript',
+])
+
+
+######################################################################
+# libGL
+
+if not dri:
+	STAND_ALONE_DRIVER_SOURCES = \
+		CORE_SOURCES + \
+		X11_DRIVER_SOURCES
+	
+	Import(
+		'softpipe', 
+		'i915simple',
+		'i965simple'
+	)
+	
+	pipe_drivers = [
+		softpipe,
+		i965simple
+	]
+	
+	env.SharedLibrary(
+		target ='GL',
+		source = STAND_ALONE_DRIVER_SOURCES,
+		LIBS = [softpipe, i965simple] + env['LIBS'],
+	)
+
+
+######################################################################
+# Driver sources
+
+if dri:
+	mesa = env.ConvenienceLibrary(
+		target = 'mesa',
+		source = SOLO_SOURCES,
+	)
+	env.Prepend(LIBS = [mesa])
+
+	SConscript([
+		'drivers/dri/SConscript',
+	])
diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index 33caf7dae1..b5b383b4e4 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -28,6 +28,7 @@
 #include "buffers.h"
 #include "context.h"
 #include "framebuffer.h"
+#include "mipmap.h"
 #include "program.h"
 #include "prog_execute.h"
 #include "queryobj.h"
@@ -99,6 +100,7 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->CopyTexSubImage1D = _swrast_copy_texsubimage1d;
    driver->CopyTexSubImage2D = _swrast_copy_texsubimage2d;
    driver->CopyTexSubImage3D = _swrast_copy_texsubimage3d;
+   driver->GenerateMipmap = _mesa_generate_mipmap;
    driver->TestProxyTexImage = _mesa_test_proxy_teximage;
    driver->CompressedTexImage1D = _mesa_store_compressed_teximage1d;
    driver->CompressedTexImage2D = _mesa_store_compressed_teximage2d;
diff --git a/src/mesa/drivers/dri/SConscript b/src/mesa/drivers/dri/SConscript
new file mode 100644
index 0000000000..d32bd08669
--- /dev/null
+++ b/src/mesa/drivers/dri/SConscript
@@ -0,0 +1,48 @@
+Import('*')
+
+drienv = env.Clone()
+
+drienv.Replace(CPPPATH = [
+	'#src/mesa/drivers/dri/common',
+	'#include',
+	'#include/GL/internal',
+	'#src/mesa',
+	'#src/mesa/main',
+	'#src/mesa/glapi',
+	'#src/mesa/math',
+	'#src/mesa/transform',
+	'#src/mesa/shader',
+	'#src/mesa/swrast',
+	'#src/mesa/swrast_setup',
+	'#src/egl/main',
+	'#src/egl/drivers/dri',
+])
+
+drienv.ParseConfig('pkg-config --cflags --libs libdrm')
+
+COMMON_GALLIUM_SOURCES = [
+	'../common/utils.c',
+	'../common/vblank.c',
+	'../common/dri_util.c',
+	'../common/xmlconfig.c',
+]
+
+COMMON_BM_SOURCES = [
+	'../common/dri_bufmgr.c',
+	'../common/dri_drmpool.c',
+]
+
+Export([
+	'drienv',
+	'COMMON_GALLIUM_SOURCES',
+	'COMMON_BM_SOURCES',
+])
+
+# TODO: Installation
+#install: $(LIBNAME)
+#	$(INSTALL) -d $(DRI_DRIVER_INSTALL_DIR)
+#	$(INSTALL) -m 755 $(LIBNAME) $(DRI_DRIVER_INSTALL_DIR)
+
+SConscript([
+	'intel_winsys/SConscript',
+])
diff --git a/src/mesa/drivers/dri/intel_winsys/SConscript b/src/mesa/drivers/dri/intel_winsys/SConscript
new file mode 100644
index 0000000000..a7cc10450e
--- /dev/null
+++ b/src/mesa/drivers/dri/intel_winsys/SConscript
@@ -0,0 +1,41 @@
+Import('*')
+
+env = drienv.Clone()
+
+env.Append(CPPPATH = [
+	'../intel',
+	'server'
+])
+
+#MINIGLX_SOURCES = server/intel_dri.c
+
+pipe_drivers = [
+	softpipe,
+	i915simple
+]
+
+DRIVER_SOURCES = [
+	'intel_winsys_pipe.c',
+	'intel_winsys_softpipe.c',
+	'intel_winsys_i915.c',
+	'intel_batchbuffer.c',
+	'intel_swapbuffers.c',
+	'intel_context.c',
+	'intel_lock.c',
+	'intel_screen.c',
+	'intel_batchpool.c',
+]
+
+sources = \
+	COMMON_GALLIUM_SOURCES + \
+	COMMON_BM_SOURCES + \
+	DRIVER_SOURCES
+
+# DRIVER_DEFINES = -I../intel $(shell pkg-config libdrm --atleast-version=2.3.1 \
+#				&& echo "-DDRM_VBLANK_FLIP=DRM_VBLANK_FLIP")
+
+env.SharedLibrary(
+	target ='i915tex_dri.so',
+	source = sources,
+	LIBS = pipe_drivers + env['LIBS'],
+)
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/intel_winsys/intel_winsys_pipe.c b/src/mesa/drivers/dri/intel_winsys/intel_winsys_pipe.c
index 910c0d2cc5..789a386500 100644
--- a/src/mesa/drivers/dri/intel_winsys/intel_winsys_pipe.c
+++ b/src/mesa/drivers/dri/intel_winsys/intel_winsys_pipe.c
@@ -224,11 +224,6 @@ intel_i915_surface_alloc_storage(struct pipe_winsys *winsys,
    if(!surf->buffer)
       return -1;
 
-   if(ret) {
-      pipe_buffer_reference(winsys, &surf->buffer, NULL);
-      return ret;
-   }
-   
    return 0;
 }
 
diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index 3bec3bd433..37ef2a865b 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -333,6 +333,12 @@ struct dd_function_table {
                               GLsizei width, GLsizei height );
 
    /**
+    * Called by glGenerateMipmap() or when GL_GENERATE_MIPMAP_SGIS is enabled.
+    */
+   void (*GenerateMipmap)(GLcontext *ctx, GLenum target,
+                          struct gl_texture_object *texObj);
+
+   /**
     * Called by glTexImage[123]D when user specifies a proxy texture
     * target.  
     *
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index 963e35d678..6a8cba4d8a 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -1560,7 +1560,7 @@ _mesa_GenerateMipmapEXT(GLenum target)
 
    /* XXX this might not handle cube maps correctly */
    _mesa_lock_texture(ctx, texObj);
-   _mesa_generate_mipmap(ctx, target, texUnit, texObj);
+   ctx->Driver.GenerateMipmap(ctx, target, texObj);
    _mesa_unlock_texture(ctx, texObj);
 }
 
diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 9f3db22b75..1e61829e8f 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -933,7 +933,6 @@ make_2d_stack_mipmap(const struct gl_texture_format *format, GLint border,
  */
 void
 _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
-                      const struct gl_texture_unit *texUnit,
                       struct gl_texture_object *texObj)
 {
    const struct gl_texture_image *srcImage;
diff --git a/src/mesa/main/mipmap.h b/src/mesa/main/mipmap.h
index df78603283..46e16902c8 100644
--- a/src/mesa/main/mipmap.h
+++ b/src/mesa/main/mipmap.h
@@ -30,7 +30,6 @@
 
 extern void
 _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
-                      const struct gl_texture_unit *texUnit,
                       struct gl_texture_object *texObj);
 
 
diff --git a/src/mesa/main/texstore.c b/src/mesa/main/texstore.c
index 30be65525e..a6a18910fc 100644
--- a/src/mesa/main/texstore.c
+++ b/src/mesa/main/texstore.c
@@ -2917,9 +2917,7 @@ _mesa_store_teximage1d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3003,9 +3001,7 @@ _mesa_store_teximage2d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3079,9 +3075,7 @@ _mesa_store_teximage3d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3127,9 +3121,7 @@ _mesa_store_texsubimage1d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3182,9 +3174,7 @@ _mesa_store_texsubimage2d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3237,9 +3227,7 @@ _mesa_store_texsubimage3d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, packing);
@@ -3313,9 +3301,7 @@ _mesa_store_compressed_teximage2d(GLcontext *ctx, GLenum target, GLint level,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, &ctx->Unpack);
@@ -3425,9 +3411,7 @@ _mesa_store_compressed_texsubimage2d(GLcontext *ctx, GLenum target,
 
    /* GL_SGIS_generate_mipmap */
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      _mesa_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 
    _mesa_unmap_teximage_pbo(ctx, &ctx->Unpack);
diff --git a/src/mesa/pipe/README.portability b/src/mesa/pipe/README.portability
new file mode 100644
index 0000000000..c70ca774da
--- /dev/null
+++ b/src/mesa/pipe/README.portability
@@ -0,0 +1,43 @@
+	      CROSS-PLATFORM PORTABILITY GUIDELINES FOR GALLIUM3D 
+
+
+= General Considerations =
+
+The state tracker and winsys driver support a rather limited number of
+platforms. However, the pipe drivers are meant to run in a wide number of
+platforms. Hence the pipe drivers, the auxiliary modules, and all public
+headers in general, should stricly follow these guidelines to ensure
+
+
+= Compiler Support =
+
+* Include the p_compiler.h.
+
+* Don't use the 'inline' keyword, use the INLINE macro in p_compiler.h instead.
+
+* Cast explicitly when converting to integer types of smaller sizes.
+
+* Cast explicitly when converting between float, double and integral types.
+
+* Don't use named struct initializers.
+
+* Don't use variable number of macro arguments. Use static inline functions
+instead.
+
+
+= Standard Library =
+
+* Avoid including standard library headers. Most standard library functions are
+not available in Windows Kernel Mode. Use the appropriate p_*.h include.
+
+== Memory Allocation ==
+
+* Use MALLOC, CALLOC, FREE instead of the malloc, calloc, free functions.
+
+* Use align_pointer() function defined in p_util.h for aligning pointers in a
+portable way.
+
+== Debugging ==
+
+TODO
+
diff --git a/src/mesa/pipe/SConscript b/src/mesa/pipe/SConscript
new file mode 100644
index 0000000000..d9c20e0100
--- /dev/null
+++ b/src/mesa/pipe/SConscript
@@ -0,0 +1,9 @@
+Import('*')
+
+#env = env.Clone()
+
+SConscript([
+	'softpipe/SConscript',
+	'i915simple/SConscript',
+	'i965simple/SConscript',
+])
diff --git a/src/mesa/pipe/cell/common.h b/src/mesa/pipe/cell/common.h
index 0b63ed39be..4de514c358 100644
--- a/src/mesa/pipe/cell/common.h
+++ b/src/mesa/pipe/cell/common.h
@@ -51,16 +51,21 @@
 
 /** for sanity checking */
 #define ASSERT_ALIGN16(ptr) \
-   assert((((unsigned long) (ptr)) & 0xf) == 0);
+  ASSERT((((unsigned long) (ptr)) & 0xf) == 0);
 
 
 /** round up value to next multiple of 4 */
 #define ROUNDUP4(k)  (((k) + 0x3) & ~0x3)
 
+/** round up value to next multiple of 8 */
+#define ROUNDUP8(k)  (((k) + 0x7) & ~0x7)
+
 /** round up value to next multiple of 16 */
 #define ROUNDUP16(k)  (((k) + 0xf) & ~0xf)
 
 
+#define CELL_MAX_SPUS 6
+
 #define TILE_SIZE 32
 
 
@@ -68,21 +73,27 @@
  * The low byte of a mailbox word contains the command opcode.
  * Remaining higher bytes are command specific.
  */
-#define CELL_CMD_OPCODE_MASK 0xf
+#define CELL_CMD_OPCODE_MASK 0xff
 
 #define CELL_CMD_EXIT                 1
 #define CELL_CMD_CLEAR_SURFACE        2
 #define CELL_CMD_FINISH               3
 #define CELL_CMD_RENDER               4
 #define CELL_CMD_BATCH                5
+#define CELL_CMD_RELEASE_VERTS        6
 #define CELL_CMD_STATE_FRAMEBUFFER   10
 #define CELL_CMD_STATE_DEPTH_STENCIL 11
 #define CELL_CMD_STATE_SAMPLER       12
-#define CELL_CMD_STATE_VERTEX_INFO   13
+#define CELL_CMD_STATE_TEXTURE       13
+#define CELL_CMD_STATE_VERTEX_INFO   14
+#define CELL_CMD_STATE_VIEWPORT      15
+#define CELL_CMD_STATE_VS_ARRAY_INFO 16
+#define CELL_CMD_STATE_BLEND         17
+#define CELL_CMD_VS_EXECUTE          18
 
 
-#define CELL_NUM_BATCH_BUFFERS 3
-#define CELL_BATCH_BUFFER_SIZE 1024  /**< 16KB would be the max */
+#define CELL_NUM_BUFFERS 4
+#define CELL_BUFFER_SIZE (4*1024)  /**< 16KB would be the max */
 
 #define CELL_BUFFER_STATUS_FREE 10
 #define CELL_BUFFER_STATUS_USED 20
@@ -94,11 +105,11 @@
  */
 struct cell_command_framebuffer
 {
-   uint opcode;
+   uint64_t opcode;     /**< CELL_CMD_FRAMEBUFFER */
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
-} ALIGN16_ATTRIB;
+};
 
 
 /**
@@ -106,38 +117,90 @@ struct cell_command_framebuffer
  */
 struct cell_command_clear_surface
 {
-   uint opcode;
+   uint64_t opcode;     /**< CELL_CMD_CLEAR_SURFACE */
    uint surface; /**< Temporary: 0=color, 1=Z */
    uint value;
+};
+
+
+/**
+ * Array info used by the vertex shader's vertex puller.
+ */
+struct cell_array_info
+{
+    uint64_t base;      /**< Base address of the 0th element. */
+    uint attr;          /**< Attribute that this state is for. */
+    uint pitch;         /**< Byte pitch from one entry to the next. */
+    uint format;        /**< Pipe format of each entry. */
 } ALIGN16_ATTRIB;
 
 
-#define CELL_MAX_VBUF_SIZE    (16 * 1024)
-#define CELL_MAX_VBUF_INDEXES 1024
+struct cell_shader_info
+{
+   unsigned num_outputs;
+
+   uint64_t declarations;
+   unsigned num_declarations;
+   uint64_t instructions;
+   unsigned num_instructions;
+   uint64_t uniforms;
+   uint64_t  immediates;
+   unsigned num_immediates;
+} ALIGN16_ATTRIB;
+
+
+#define SPU_VERTS_PER_BATCH 64
+struct cell_command_vs
+{
+   uint64_t opcode;       /**< CELL_CMD_VS_EXECUTE */
+   struct cell_shader_info   shader;
+   unsigned num_elts;
+   unsigned elts[SPU_VERTS_PER_BATCH];
+   uint64_t vOut[SPU_VERTS_PER_BATCH];
+   float plane[12][4];
+   unsigned nr_planes;
+   unsigned nr_attrs;
+} ALIGN16_ATTRIB;
 
 
 struct cell_command_render
 {
-   uint opcode;       /**< CELL_CMD_RENDER */
+   uint64_t opcode;   /**< CELL_CMD_RENDER */
    uint prim_type;    /**< PIPE_PRIM_x */
    uint num_verts;
    uint vertex_size;  /**< bytes per vertex */
-   uint dummy;        /* XXX this dummy field works around a compiler bug */
    uint num_indexes;
-   const void *vertex_data;
-   const ushort *index_data;
-   float xmin, ymin, xmax, ymax;
-   boolean inline_indexes;
+   uint vertex_buf;  /**< which cell->buffer[] contains the vertex data */
+   float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
+   uint min_index;
    boolean inline_verts;
-} ALIGN16_ATTRIB;
+};
+
+
+struct cell_command_release_verts
+{
+   uint64_t opcode;         /**< CELL_CMD_RELEASE_VERTS */
+   uint vertex_buf;    /**< in [0, CELL_NUM_BUFFERS-1] */
+};
+
+
+struct cell_command_texture
+{
+   void *start;         /**< Address in main memory */
+   uint width, height;
+};
 
 
 /** XXX unions don't seem to work */
+/* XXX this should go away; all commands should be placed in batch buffers */
 struct cell_command
 {
+#if 0
    struct cell_command_framebuffer fb;
    struct cell_command_clear_surface clear;
    struct cell_command_render render;
+#endif
+   struct cell_command_vs vs;
 } ALIGN16_ATTRIB;
 
 
@@ -147,7 +210,9 @@ struct cell_init_info
    unsigned id;
    unsigned num_spus;
    struct cell_command *cmd;
-   ubyte *batch_buffers[CELL_NUM_BATCH_BUFFERS];
+
+   /** Buffers for command batches, vertex/index data */
+   ubyte *buffers[CELL_NUM_BUFFERS];
    uint *buffer_status;  /**< points at cell_context->buffer_status */
 } ALIGN16_ATTRIB;
 
diff --git a/src/mesa/pipe/cell/ppu/Makefile b/src/mesa/pipe/cell/ppu/Makefile
index e7f2562da7..50060f5cd3 100644
--- a/src/mesa/pipe/cell/ppu/Makefile
+++ b/src/mesa/pipe/cell/ppu/Makefile
@@ -34,6 +34,7 @@ SOURCES = \
 	cell_surface.c \
 	cell_texture.c \
 	cell_vbuf.c \
+	cell_vertex_shader.c \
 	cell_winsys.c
 
 
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.c b/src/mesa/pipe/cell/ppu/cell_batch.c
index c894ef8608..f45e5f25b6 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.c
+++ b/src/mesa/pipe/cell/ppu/cell_batch.c
@@ -31,12 +31,55 @@
 #include "cell_spu.h"
 
 
+
+uint
+cell_get_empty_buffer(struct cell_context *cell)
+{
+   uint buf = 0, tries = 0;
+
+   /* Find a buffer that's marked as free by all SPUs */
+   while (1) {
+      uint spu, num_free = 0;
+
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         if (cell->buffer_status[spu][buf][0] == CELL_BUFFER_STATUS_FREE) {
+            num_free++;
+
+            if (num_free == cell->num_spus) {
+               /* found a free buffer, now mark status as used */
+               for (spu = 0; spu < cell->num_spus; spu++) {
+                  cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
+               }
+               /*
+               printf("PPU: ALLOC BUFFER %u\n", buf);
+               */
+               return buf;
+            }
+         }
+         else {
+            break;
+         }
+      }
+
+      /* try next buf */
+      buf = (buf + 1) % CELL_NUM_BUFFERS;
+
+      tries++;
+      if (tries == 100) {
+         /*
+         printf("PPU WAITING for buffer...\n");
+         */
+      }
+   }
+}
+
+
 void
 cell_batch_flush(struct cell_context *cell)
 {
    static boolean flushing = FALSE;
    uint batch = cell->cur_batch;
-   const uint size = cell->batch_buffer_size[batch];
+   const uint size = cell->buffer_size[batch];
    uint spu, cmd_word;
 
    assert(!flushing);
@@ -46,7 +89,7 @@ cell_batch_flush(struct cell_context *cell)
 
    flushing = TRUE;
 
-   assert(batch < CELL_NUM_BATCH_BUFFERS);
+   assert(batch < CELL_NUM_BUFFERS);
 
    /*
    printf("cell_batch_dispatch: buf %u at %p, size %u\n",
@@ -68,28 +111,9 @@ cell_batch_flush(struct cell_context *cell)
     * array indicating that the PPU can re-use the buffer.
     */
 
+   batch = cell_get_empty_buffer(cell);
 
-   /* Find a buffer that's marked as free by all SPUs */
-   while (1) {
-      uint num_free = 0;
-
-      batch = (batch + 1) % CELL_NUM_BATCH_BUFFERS;
-
-      for (spu = 0; spu < cell->num_spus; spu++) {
-         if (cell->buffer_status[spu][batch][0] == CELL_BUFFER_STATUS_FREE)
-            num_free++;
-      }
-
-      if (num_free == cell->num_spus) {
-         /* found a free buffer, now mark status as used */
-         for (spu = 0; spu < cell->num_spus; spu++) {
-            cell->buffer_status[spu][batch][0] = CELL_BUFFER_STATUS_USED;
-         }
-         break;
-      }
-   }
-
-   cell->batch_buffer_size[batch] = 0;  /* empty */
+   cell->buffer_size[batch] = 0;  /* empty */
    cell->cur_batch = batch;
 
    flushing = FALSE;
@@ -99,61 +123,95 @@ cell_batch_flush(struct cell_context *cell)
 uint
 cell_batch_free_space(const struct cell_context *cell)
 {
-   uint free = CELL_BATCH_BUFFER_SIZE
-      - cell->batch_buffer_size[cell->cur_batch];
+   uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch];
    return free;
 }
 
 
 /**
- * \param cmd  command to append
- * \param length  command size in bytes
+ * Append data to current batch.
  */
 void
-cell_batch_append(struct cell_context *cell, const void *cmd, uint length)
+cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
 {
    uint size;
 
-   assert(length % 4 == 0);
-   assert(cell->cur_batch >= 0);
+   ASSERT(bytes % 8 == 0);
+   ASSERT(bytes <= CELL_BUFFER_SIZE);
+   ASSERT(cell->cur_batch >= 0);
+
+#ifdef ASSERT
+   {
+      uint spu;
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         ASSERT(cell->buffer_status[spu][cell->cur_batch][0]
+                 == CELL_BUFFER_STATUS_USED);
+      }
+   }
+#endif
 
-   size = cell->batch_buffer_size[cell->cur_batch];
+   size = cell->buffer_size[cell->cur_batch];
 
-   if (size + length > CELL_BATCH_BUFFER_SIZE) {
+   if (size + bytes > CELL_BUFFER_SIZE) {
       cell_batch_flush(cell);
       size = 0;
    }
 
-   assert(size + length <= CELL_BATCH_BUFFER_SIZE);
+   ASSERT(size + bytes <= CELL_BUFFER_SIZE);
 
-   memcpy(cell->batch_buffer[cell->cur_batch] + size, cmd, length);
+   memcpy(cell->buffer[cell->cur_batch] + size, data, bytes);
 
-   cell->batch_buffer_size[cell->cur_batch] = size + length;
+   cell->buffer_size[cell->cur_batch] = size + bytes;
 }
 
 
 void *
 cell_batch_alloc(struct cell_context *cell, uint bytes)
 {
+   return cell_batch_alloc_aligned(cell, bytes, 1);
+}
+
+
+void *
+cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
+                         uint alignment)
+{
    void *pos;
-   uint size;
+   uint size, padbytes;
 
-   ASSERT(bytes % 4 == 0);
+   ASSERT(bytes % 8 == 0);
+   ASSERT(bytes <= CELL_BUFFER_SIZE);
+   ASSERT(alignment > 0);
+   ASSERT(cell->cur_batch >= 0);
 
-   assert(cell->cur_batch >= 0);
+#ifdef ASSERT
+   {
+      uint spu;
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         ASSERT(cell->buffer_status[spu][cell->cur_batch][0]
+                 == CELL_BUFFER_STATUS_USED);
+      }
+   }
+#endif
 
-   size = cell->batch_buffer_size[cell->cur_batch];
+   size = cell->buffer_size[cell->cur_batch];
 
-   if (size + bytes > CELL_BATCH_BUFFER_SIZE) {
+   padbytes = (alignment - (size % alignment)) % alignment;
+
+   if (padbytes + size + bytes > CELL_BUFFER_SIZE) {
       cell_batch_flush(cell);
       size = 0;
    }
+   else {
+      size += padbytes;
+   }
 
-   assert(size + bytes <= CELL_BATCH_BUFFER_SIZE);
+   ASSERT(size % alignment == 0);
+   ASSERT(size + bytes <= CELL_BUFFER_SIZE);
 
-   pos = (void *) (cell->batch_buffer[cell->cur_batch] + size);
+   pos = (void *) (cell->buffer[cell->cur_batch] + size);
 
-   cell->batch_buffer_size[cell->cur_batch] = size + bytes;
+   cell->buffer_size[cell->cur_batch] = size + bytes;
 
    return pos;
 }
diff --git a/src/mesa/pipe/cell/ppu/cell_batch.h b/src/mesa/pipe/cell/ppu/cell_batch.h
index c4ba7feb3d..a6eee0a8b1 100644
--- a/src/mesa/pipe/cell/ppu/cell_batch.h
+++ b/src/mesa/pipe/cell/ppu/cell_batch.h
@@ -35,6 +35,9 @@
 struct cell_context;
 
 
+extern uint
+cell_get_empty_buffer(struct cell_context *cell);
+
 extern void
 cell_batch_flush(struct cell_context *cell);
 
@@ -42,10 +45,14 @@ extern uint
 cell_batch_free_space(const struct cell_context *cell);
 
 extern void
-cell_batch_append(struct cell_context *cell, const void *cmd, uint length);
+cell_batch_append(struct cell_context *cell, const void *data, uint bytes);
 
 extern void *
 cell_batch_alloc(struct cell_context *cell, uint bytes);
 
+extern void *
+cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
+                         uint alignment);
+
 
 #endif /* CELL_BATCH_H */
diff --git a/src/mesa/pipe/cell/ppu/cell_clear.c b/src/mesa/pipe/cell/ppu/cell_clear.c
index e01640b994..07b908eec5 100644
--- a/src/mesa/pipe/cell/ppu/cell_clear.c
+++ b/src/mesa/pipe/cell/ppu/cell_clear.c
@@ -48,9 +48,12 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
                    unsigned clearValue)
 {
    struct cell_context *cell = cell_context(pipe);
-   /*uint i;*/
    uint surfIndex;
 
+   if (cell->dirty)
+      cell_update_derived(cell);
+
+
    if (!cell->cbuf_map[0])
       cell->cbuf_map[0] = pipe_surface_map(ps);
 
@@ -61,29 +64,7 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
       surfIndex = 0;
    }
 
-#if 0
-   for (i = 0; i < cell->num_spus; i++) {
-#if 1
-      uint clr = clearValue;
-      if (surfIndex == 0) {
-         /* XXX debug: clear color varied per-SPU to visualize tiles */
-         if ((clr & 0xff) == 0)
-            clr |= 64 + i * 8;
-         if ((clr & 0xff00) == 0)
-            clr |= (64 + i * 8) << 8;
-         if ((clr & 0xff0000) == 0)
-            clr |= (64 + i * 8) << 16;
-         if ((clr & 0xff000000) == 0)
-            clr |= (64 + i * 8) << 24;
-      }
-      cell_global.command[i].clear.value = clr;
-#else
-      cell_global.command[i].clear.value = clearValue;
-#endif
-      cell_global.command[i].clear.surface = surfIndex;
-      send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_CLEAR_SURFACE);
-   }
-#else
+
    {
       struct cell_command_clear_surface *clr
          = (struct cell_command_clear_surface *)
@@ -92,9 +73,4 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
       clr->surface = surfIndex;
       clr->value = clearValue;
    }
-#endif
-
-   /* XXX temporary */
-   cell_flush(&cell->pipe, 0x0);
-
 }
diff --git a/src/mesa/pipe/cell/ppu/cell_context.c b/src/mesa/pipe/cell/ppu/cell_context.c
index 8cb0c48f40..bbe1fd7a11 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.c
+++ b/src/mesa/pipe/cell/ppu/cell_context.c
@@ -39,6 +39,7 @@
 #include "pipe/p_winsys.h"
 #include "pipe/cell/common.h"
 #include "pipe/draw/draw_context.h"
+#include "pipe/draw/draw_private.h"
 #include "cell_clear.h"
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
@@ -156,6 +157,19 @@ cell_destroy_context( struct pipe_context *pipe )
 }
 
 
+static struct draw_context *
+cell_draw_create(struct cell_context *cell)
+{
+   struct draw_context *draw = draw_create();
+
+   if (getenv("GALLIUM_CELL_VS")) {
+      /* plug in SPU-based vertex transformation code */
+      draw->shader_queue_flush = cell_vertex_shader_queue_flush;
+      draw->driver_private = cell;
+   }
+
+   return draw;
+}
 
 
 struct pipe_context *
@@ -242,7 +256,7 @@ cell_create_context(struct pipe_winsys *winsys, struct cell_winsys *cws)
 
    cell_init_surface_functions(cell);
 
-   cell->draw = draw_create();
+   cell->draw = cell_draw_create(cell);
 
    cell_init_vbuf(cell);
    draw_set_rasterize_stage(cell->draw, cell->vbuf);
@@ -254,8 +268,9 @@ cell_create_context(struct pipe_winsys *winsys, struct cell_winsys *cws)
 
    cell_start_spus(cell);
 
-   for (buf = 0; buf < CELL_NUM_BATCH_BUFFERS; buf++) {
-      cell->batch_buffer_size[buf] = 0;
+   /* init command, vertex/index buffer info */
+   for (buf = 0; buf < CELL_NUM_BUFFERS; buf++) {
+      cell->buffer_size[buf] = 0;
 
       /* init batch buffer status values,
        * mark 0th buffer as used, rest as free.
diff --git a/src/mesa/pipe/cell/ppu/cell_context.h b/src/mesa/pipe/cell/ppu/cell_context.h
index 3bd88bfd5b..3b63419b5e 100644
--- a/src/mesa/pipe/cell/ppu/cell_context.h
+++ b/src/mesa/pipe/cell/ppu/cell_context.h
@@ -38,9 +38,6 @@
 #include "pipe/cell/common.h"
 
 
-#define CELL_MAX_SPUS 6
-
-
 struct cell_vbuf_render;
 
 struct cell_vertex_shader_state
@@ -76,7 +73,7 @@ struct cell_context
    struct pipe_framebuffer_state framebuffer;
    struct pipe_poly_stipple poly_stipple;
    struct pipe_scissor_state scissor;
-   struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
+   struct cell_texture *texture[PIPE_MAX_SAMPLERS];
    struct pipe_viewport_state viewport;
    struct pipe_vertex_buffer vertex_buffer[PIPE_ATTRIB_MAX];
    struct pipe_vertex_element vertex_element[PIPE_ATTRIB_MAX];
@@ -84,6 +81,9 @@ struct cell_context
    ubyte *cbuf_map[PIPE_MAX_COLOR_BUFS];
    ubyte *zsbuf_map;
 
+   struct pipe_surface *tex_surf;
+   uint *tex_map;
+
    uint dirty;
 
    /** The primitive drawing context */
@@ -102,12 +102,14 @@ struct cell_context
 
    uint num_spus;
 
-   uint batch_buffer_size[CELL_NUM_BATCH_BUFFERS];
-   ubyte batch_buffer[CELL_NUM_BATCH_BUFFERS][CELL_BATCH_BUFFER_SIZE] ALIGN16_ATTRIB;
-   int cur_batch;  /**< which batch buffer is being filled */
+   /** Buffers for command batches, vertex/index data */
+   uint buffer_size[CELL_NUM_BUFFERS];
+   ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+
+   int cur_batch;  /**< which buffer is being filled w/ commands */
 
    /** [4] to ensure 16-byte alignment for each status word */
-   uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BATCH_BUFFERS][4] ALIGN16_ATTRIB;
+   uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB;
 
 };
 
@@ -124,6 +126,8 @@ cell_context(struct pipe_context *pipe)
 extern struct pipe_context *
 cell_create_context(struct pipe_winsys *ws, struct cell_winsys *cws);
 
+extern void
+cell_vertex_shader_queue_flush(struct draw_context *draw);
 
 
 
diff --git a/src/mesa/pipe/cell/ppu/cell_flush.c b/src/mesa/pipe/cell/ppu/cell_flush.c
index b98bb566b1..f62bc4650c 100644
--- a/src/mesa/pipe/cell/ppu/cell_flush.c
+++ b/src/mesa/pipe/cell/ppu/cell_flush.c
@@ -39,6 +39,9 @@ cell_flush(struct pipe_context *pipe, unsigned flags)
 {
    struct cell_context *cell = cell_context(pipe);
 
+   if (flags & PIPE_FLUSH_SWAPBUFFERS)
+      flags |= PIPE_FLUSH_WAIT;
+
    draw_flush( cell->draw );
    cell_flush_int(pipe, flags);
 }
@@ -56,7 +59,7 @@ cell_flush_int(struct pipe_context *pipe, unsigned flags)
    flushing = TRUE;
 
    if (flags & PIPE_FLUSH_WAIT) {
-      uint *cmd = (uint *) cell_batch_alloc(cell, sizeof(uint));
+      uint64_t *cmd = (uint64_t *) cell_batch_alloc(cell, sizeof(uint64_t));
       *cmd = CELL_CMD_FINISH;
    }
 
diff --git a/src/mesa/pipe/cell/ppu/cell_spu.c b/src/mesa/pipe/cell/ppu/cell_spu.c
index 4627bc8d1f..7c83a47e57 100644
--- a/src/mesa/pipe/cell/ppu/cell_spu.c
+++ b/src/mesa/pipe/cell/ppu/cell_spu.c
@@ -111,8 +111,8 @@ cell_start_spus(struct cell_context *cell)
       cell_global.inits[i].id = i;
       cell_global.inits[i].num_spus = cell->num_spus;
       cell_global.inits[i].cmd = &cell_global.command[i];
-      for (j = 0; j < CELL_NUM_BATCH_BUFFERS; j++) {
-         cell_global.inits[i].batch_buffers[j] = cell->batch_buffer[j];
+      for (j = 0; j < CELL_NUM_BUFFERS; j++) {
+         cell_global.inits[i].buffers[j] = cell->buffer[j];
       }
       cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0];
 
diff --git a/src/mesa/pipe/cell/ppu/cell_state_blend.c b/src/mesa/pipe/cell/ppu/cell_state_blend.c
index 34ae0128ea..4fc60548c8 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_blend.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_blend.c
@@ -29,6 +29,7 @@
  */
 
 #include "pipe/p_util.h"
+#include "pipe/draw/draw_context.h"
 #include "cell_context.h"
 #include "cell_state.h"
 
@@ -38,9 +39,7 @@ void *
 cell_create_blend_state(struct pipe_context *pipe,
                         const struct pipe_blend_state *blend)
 {
-   struct pipe_blend_state *state = MALLOC(sizeof(struct pipe_blend_state));
-   memcpy(state, blend, sizeof(struct pipe_blend_state));
-   return state;
+   return mem_dup(blend, sizeof(*blend));
 }
 
 
@@ -49,6 +48,8 @@ cell_bind_blend_state(struct pipe_context *pipe, void *blend)
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    cell->blend = (const struct pipe_blend_state *)blend;
 
    cell->dirty |= CELL_NEW_BLEND;
@@ -68,6 +69,8 @@ cell_set_blend_color(struct pipe_context *pipe,
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    cell->blend_color = *blend_color;
 
    cell->dirty |= CELL_NEW_BLEND;
@@ -80,10 +83,7 @@ void *
 cell_create_depth_stencil_alpha_state(struct pipe_context *pipe,
                  const struct pipe_depth_stencil_alpha_state *depth_stencil)
 {
-   struct pipe_depth_stencil_alpha_state *state =
-      MALLOC(sizeof(struct pipe_depth_stencil_alpha_state));
-   memcpy(state, depth_stencil, sizeof(struct pipe_depth_stencil_alpha_state));
-   return state;
+   return mem_dup(depth_stencil, sizeof(*depth_stencil));
 }
 
 
@@ -93,6 +93,8 @@ cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe,
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    cell->depth_stencil
       = (const struct pipe_depth_stencil_alpha_state *) depth_stencil;
 
diff --git a/src/mesa/pipe/cell/ppu/cell_state_emit.c b/src/mesa/pipe/cell/ppu/cell_state_emit.c
index dbca900c35..5d2a786449 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_emit.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_emit.c
@@ -30,6 +30,18 @@
 #include "cell_state.h"
 #include "cell_state_emit.h"
 #include "cell_batch.h"
+#include "cell_texture.h"
+
+
+static void
+emit_state_cmd(struct cell_context *cell, uint cmd,
+               const void *state, uint state_size)
+{
+   uint64_t *dst = (uint64_t *) 
+       cell_batch_alloc(cell, ROUNDUP8(sizeof(uint64_t) + state_size));
+   *dst = cmd;
+   memcpy(dst + 1, state, state_size);
+}
 
 
 
@@ -50,23 +62,42 @@ cell_emit_state(struct cell_context *cell)
       fb->height = cell->framebuffer.cbufs[0]->height;
    }
 
+   if (cell->dirty & CELL_NEW_BLEND) {
+      emit_state_cmd(cell, CELL_CMD_STATE_BLEND,
+                     cell->blend,
+                     sizeof(struct pipe_blend_state));
+   }
+
    if (cell->dirty & CELL_NEW_DEPTH_STENCIL) {
-      uint cmd = CELL_CMD_STATE_DEPTH_STENCIL;
-      cell_batch_append(cell, &cmd, 4);
-      cell_batch_append(cell, cell->depth_stencil,
-                        sizeof(struct pipe_depth_stencil_alpha_state));
+      emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL,
+                     cell->depth_stencil,
+                     sizeof(struct pipe_depth_stencil_alpha_state));
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
-      uint cmd = CELL_CMD_STATE_SAMPLER;
-      cell_batch_append(cell, &cmd, 4);
-      cell_batch_append(cell, cell->sampler[0],
-                        sizeof(struct pipe_sampler_state));
+      emit_state_cmd(cell, CELL_CMD_STATE_SAMPLER,
+                     cell->sampler[0], sizeof(struct pipe_sampler_state));
+   }
+
+   if (cell->dirty & CELL_NEW_TEXTURE) {
+      struct cell_command_texture texture;
+      if (cell->texture[0]) {
+         texture.start = cell->texture[0]->tiled_data;
+         texture.width = cell->texture[0]->base.width[0];
+         texture.height = cell->texture[0]->base.height[0];
+      }
+      else {
+         texture.start = NULL;
+         texture.width = 0;
+         texture.height = 0;
+      }
+
+      emit_state_cmd(cell, CELL_CMD_STATE_TEXTURE,
+                     &texture, sizeof(struct cell_command_texture));
    }
 
    if (cell->dirty & CELL_NEW_VERTEX_INFO) {
-      uint cmd = CELL_CMD_STATE_VERTEX_INFO;
-      cell_batch_append(cell, &cmd, 4);
-      cell_batch_append(cell, &cell->vertex_info, sizeof(struct vertex_info));
+      emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO,
+                     &cell->vertex_info, sizeof(struct vertex_info));
    }
 }
diff --git a/src/mesa/pipe/cell/ppu/cell_state_fs.c b/src/mesa/pipe/cell/ppu/cell_state_fs.c
index 81c2ac14dd..96a52273b0 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_fs.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_fs.c
@@ -45,7 +45,7 @@ void *
 cell_create_fs_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
 {
-   struct cell_context *cell = cell_context(pipe);
+   /*struct cell_context *cell = cell_context(pipe);*/
    struct cell_fragment_shader_state *state;
 
    state = CALLOC_STRUCT(cell_fragment_shader_state);
@@ -94,8 +94,6 @@ cell_bind_fs_state(struct pipe_context *pipe, void *fs)
 void
 cell_delete_fs_state(struct pipe_context *pipe, void *fs)
 {
-   struct cell_context *cell = cell_context(pipe);
-
    struct cell_fragment_shader_state *state =
       (struct cell_fragment_shader_state *) fs;
 
diff --git a/src/mesa/pipe/cell/ppu/cell_state_sampler.c b/src/mesa/pipe/cell/ppu/cell_state_sampler.c
index ae1eeb4620..ade6cc8338 100644
--- a/src/mesa/pipe/cell/ppu/cell_state_sampler.c
+++ b/src/mesa/pipe/cell/ppu/cell_state_sampler.c
@@ -30,21 +30,17 @@
  */
 
 #include "pipe/p_util.h"
+#include "pipe/draw/draw_context.h"
 #include "cell_context.h"
 #include "cell_state.h"
-#if 0
 #include "cell_texture.h"
-#include "cell_tile_cache.h"
-#endif
 
 
 void *
 cell_create_sampler_state(struct pipe_context *pipe,
                           const struct pipe_sampler_state *sampler)
 {
-   struct pipe_sampler_state *state = MALLOC( sizeof(struct pipe_sampler_state) );
-   memcpy(state, sampler, sizeof(struct pipe_sampler_state));
-   return state;
+   return mem_dup(sampler, sizeof(*sampler));
 }
 
 void
@@ -53,6 +49,8 @@ cell_bind_sampler_state(struct pipe_context *pipe,
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    assert(unit < PIPE_MAX_SAMPLERS);
    cell->sampler[unit] = (struct pipe_sampler_state *)sampler;
 
@@ -76,7 +74,11 @@ cell_set_sampler_texture(struct pipe_context *pipe,
 {
    struct cell_context *cell = cell_context(pipe);
 
+   draw_flush(cell->draw);
+
    cell->texture[sampler] = texture;
 
+   cell_update_texture_mapping(cell);
+
    cell->dirty |= CELL_NEW_TEXTURE;
 }
diff --git a/src/mesa/pipe/cell/ppu/cell_texture.c b/src/mesa/pipe/cell/ppu/cell_texture.c
index 0a8190d983..df178d9ca2 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.c
+++ b/src/mesa/pipe/cell/ppu/cell_texture.c
@@ -79,31 +79,30 @@ cell_texture_layout(struct cell_texture * spt)
 }
 
 
-void
-cell_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
+struct pipe_texture *
+cell_texture_create(struct pipe_context *pipe, const struct pipe_texture *templat)
 {
-   struct cell_texture *spt = REALLOC(*pt, sizeof(struct pipe_texture),
-					  sizeof(struct cell_texture));
+   struct cell_texture *spt = CALLOC_STRUCT(cell_texture);
+   if (!spt)
+      return NULL;
 
-   if (spt) {
-      memset(&spt->base + 1, 0,
-	     sizeof(struct cell_texture) - sizeof(struct pipe_texture));
+   spt->base = *templat;
 
-      cell_texture_layout(spt);
+   cell_texture_layout(spt);
 
-      spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32,
-                                                PIPE_BUFFER_USAGE_PIXEL,
-                                                spt->buffer_size);
+   spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32,
+                                             PIPE_BUFFER_USAGE_PIXEL,
+                                             spt->buffer_size);
 
-      if (!spt->buffer) {
-	 FREE(spt);
-	 spt = NULL;
-      }
+   if (!spt->buffer) {
+      FREE(spt);
+      return NULL;
    }
 
-   *pt = &spt->base;
+   return &spt->base;
 }
 
+
 void
 cell_texture_release(struct pipe_context *pipe, struct pipe_texture **pt)
 {
@@ -163,3 +162,91 @@ cell_get_tex_surface(struct pipe_context *pipe,
    }
    return ps;
 }
+
+
+
+static void
+tile_copy_data(uint w, uint h, uint tile_size, uint *dst, const uint *src)
+{
+   const uint tile_size2 = tile_size * tile_size;
+   const uint h_t = h / tile_size, w_t = w / tile_size;
+
+   uint it, jt;  /* tile counters */
+   uint i, j;    /* intra-tile counters */
+
+   for (it = 0; it < h_t; it++) {
+      for (jt = 0; jt < w_t; jt++) {
+         /* fill in tile (i, j) */
+         uint *tdst = dst + (it * w_t + jt) * tile_size2;
+         for (i = 0; i < tile_size; i++) {
+            for (j = 0; j < tile_size; j++) {
+               const uint srci = it * tile_size + i;
+               const uint srcj = jt * tile_size + j;
+               *tdst++ = src[srci * h + srcj];
+            }
+         }
+      }
+   }
+}
+
+
+
+/**
+ * Convert linear texture image data to tiled format for SPU usage.
+ */
+static void
+cell_tile_texture(struct cell_context *cell,
+                  struct cell_texture *texture)
+{
+   uint face = 0, level = 0, zslice = 0;
+   struct pipe_surface *surf;
+   const uint w = texture->base.width[0], h = texture->base.height[0];
+   const uint *src;
+
+   /* temporary restrictions: */
+   assert(w >= TILE_SIZE);
+   assert(h >= TILE_SIZE);
+   assert(w % TILE_SIZE == 0);
+   assert(h % TILE_SIZE == 0);
+
+   surf = cell_get_tex_surface(&cell->pipe, &texture->base, face, level, zslice);
+   ASSERT(surf);
+
+   src = (const uint *) pipe_surface_map(surf);
+
+   if (texture->tiled_data) {
+      align_free(texture->tiled_data);
+   }
+   texture->tiled_data = align_malloc(w * h * 4, 16);
+
+   tile_copy_data(w, h, TILE_SIZE, texture->tiled_data, src);
+
+   pipe_surface_unmap(surf);
+
+   pipe_surface_reference(&surf, NULL);
+}
+
+
+
+void
+cell_update_texture_mapping(struct cell_context *cell)
+{
+   uint face = 0, level = 0, zslice = 0;
+
+   if (cell->texture[0])
+      cell_tile_texture(cell, cell->texture[0]);
+#if 0
+   if (cell->tex_surf && cell->tex_map) {
+      pipe_surface_unmap(cell->tex_surf);
+      cell->tex_map = NULL;
+   }
+
+   /* XXX free old surface */
+
+   cell->tex_surf = cell_get_tex_surface(&cell->pipe,
+                                         &cell->texture[0]->base,
+                                         face, level, zslice);
+
+   cell->tex_map = pipe_surface_map(cell->tex_surf);
+#endif
+}
diff --git a/src/mesa/pipe/cell/ppu/cell_texture.h b/src/mesa/pipe/cell/ppu/cell_texture.h
index ef5808c086..0264fed88e 100644
--- a/src/mesa/pipe/cell/ppu/cell_texture.h
+++ b/src/mesa/pipe/cell/ppu/cell_texture.h
@@ -46,6 +46,8 @@ struct cell_texture
     */
    struct pipe_buffer *buffer;
    unsigned long buffer_size;
+
+   void *tiled_data;  /* XXX this may be temporary */ /*ALIGN16*/
 };
 
 
@@ -58,8 +60,9 @@ cell_texture(struct pipe_texture *pt)
 
 
 
-extern void
-cell_texture_create(struct pipe_context *pipe, struct pipe_texture **pt);
+extern struct pipe_texture *
+cell_texture_create(struct pipe_context *pipe,
+                    const struct pipe_texture *templat);
 
 extern void
 cell_texture_release(struct pipe_context *pipe, struct pipe_texture **pt);
@@ -70,4 +73,8 @@ cell_get_tex_surface(struct pipe_context *pipe,
                      unsigned face, unsigned level, unsigned zslice);
 
 
+extern void
+cell_update_texture_mapping(struct cell_context *cell);
+
+
 #endif /* CELL_TEXTURE */
diff --git a/src/mesa/pipe/cell/ppu/cell_vbuf.c b/src/mesa/pipe/cell/ppu/cell_vbuf.c
index ee572b3a51..e9fafe492e 100644
--- a/src/mesa/pipe/cell/ppu/cell_vbuf.c
+++ b/src/mesa/pipe/cell/ppu/cell_vbuf.c
@@ -39,8 +39,7 @@
 #include "pipe/draw/draw_vbuf.h"
 
 
-/** Allow prim indexes, verts to be inlined after RENDER command */
-#define ALLOW_INLINE_INDEXES 1
+/** Allow vertex data to be inlined after RENDER command */
 #define ALLOW_INLINE_VERTS 1
 
 
@@ -52,9 +51,10 @@ struct cell_vbuf_render
 {
    struct vbuf_render base;
    struct cell_context *cell;
-   uint prim;
-   uint vertex_size;
-   void *vertex_buffer;
+   uint prim;            /**< PIPE_PRIM_x */
+   uint vertex_size;     /**< in bytes */
+   void *vertex_buffer;  /**< just for debug, really */
+   uint vertex_buf;      /**< in [0, CELL_NUM_BUFFERS-1] */
 };
 
 
@@ -81,14 +81,46 @@ cell_vbuf_allocate_vertices(struct vbuf_render *vbr,
 {
    struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
    /*printf("Alloc verts %u * %u\n", vertex_size, nr_vertices);*/
-   assert(!cvbr->vertex_buffer);
-   cvbr->vertex_buffer = align_malloc(vertex_size * nr_vertices, 16);
+
+   assert(cvbr->vertex_buf == ~0);
+   cvbr->vertex_buf = cell_get_empty_buffer(cvbr->cell);
+   cvbr->vertex_buffer = cvbr->cell->buffer[cvbr->vertex_buf];
    cvbr->vertex_size = vertex_size;
    return cvbr->vertex_buffer;
 }
 
 
 static void
+cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, 
+                           unsigned vertex_size, unsigned vertices_used)
+{
+   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
+   struct cell_context *cell = cvbr->cell;
+
+   /*
+   printf("%s vertex_buf = %u  count = %u\n",
+          __FUNCTION__, cvbr->vertex_buf, vertices_used);
+   */
+
+   /* Tell SPUs they can release the vert buf */
+   if (cvbr->vertex_buf != ~0U) {
+      struct cell_command_release_verts *release
+         = (struct cell_command_release_verts *)
+         cell_batch_alloc(cell, sizeof(struct cell_command_release_verts));
+      release->opcode = CELL_CMD_RELEASE_VERTS;
+      release->vertex_buf = cvbr->vertex_buf;
+   }
+
+   cvbr->vertex_buf = ~0;
+   cell_flush_int(&cell->pipe, 0x0);
+
+   assert(vertices == cvbr->vertex_buffer);
+   cvbr->vertex_buffer = NULL;
+}
+
+
+
+static void
 cell_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
 {
    struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
@@ -106,17 +138,24 @@ cell_vbuf_draw(struct vbuf_render *vbr,
    struct cell_context *cell = cvbr->cell;
    float xmin, ymin, xmax, ymax;
    uint i;
-   uint nr_vertices = 0;
+   uint nr_vertices = 0, min_index = ~0;
    const void *vertices = cvbr->vertex_buffer;
    const uint vertex_size = cvbr->vertex_size;
 
    for (i = 0; i < nr_indices; i++) {
       if (indices[i] > nr_vertices)
          nr_vertices = indices[i];
+      if (indices[i] < min_index)
+         min_index = indices[i];
    }
    nr_vertices++;
 
 #if 0
+   /*if (min_index > 0)*/
+      printf("%s min_index = %u\n", __FUNCTION__, min_index);
+#endif
+
+#if 0
    printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u\n",
           nr_indices, nr_vertices);
    printf("  ");
@@ -137,7 +176,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
    /* compute x/y bounding box */
    xmin = ymin = 1e50;
    xmax = ymax = -1e50;
-   for (i = 0; i < nr_vertices; i++) {
+   for (i = min_index; i < nr_vertices; i++) {
       const float *v = (float *) ((ubyte *) vertices + i * vertex_size);
       if (v[0] < xmin)
          xmin = v[0];
@@ -148,83 +187,68 @@ cell_vbuf_draw(struct vbuf_render *vbr,
       if (v[1] > ymax)
          ymax = v[1];
    }
+#if 0
+   printf("PPU Bounds %g, %g .. %g, %g\n", xmin, ymin, xmax, ymax);
+   fflush(stdout);
+#endif
 
    if (cvbr->prim != PIPE_PRIM_TRIANGLES)
       return; /* only render tris for now */
 
    /* build/insert batch RENDER command */
    {
-      const uint index_bytes = ROUNDUP4(nr_indices * 2);
+      const uint index_bytes = ROUNDUP8(nr_indices * 2);
       const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size;
+      const uint batch_size = sizeof(struct cell_command_render) + index_bytes;
 
       struct cell_command_render *render
          = (struct cell_command_render *)
-         cell_batch_alloc(cell, sizeof(*render));
+         cell_batch_alloc(cell, batch_size);
+
       render->opcode = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
 
       render->num_indexes = nr_indices;
-      if (ALLOW_INLINE_INDEXES &&
-          index_bytes <= cell_batch_free_space(cell)) {
-         /* indices inlined, right after render cmd */
-         void *dst = cell_batch_alloc(cell, index_bytes);
-         memcpy(dst, indices, nr_indices * 2);
-         render->inline_indexes = TRUE;
-         render->index_data = NULL;
-      }
-      else {
-         /* indices in separate buffer */
-         render->inline_indexes = FALSE;
-         render->index_data = indices;
-         ASSERT_ALIGN16(render->index_data);
-      }
+      render->min_index = min_index;
+
+      /* append indices after render command */
+      memcpy(render + 1, indices, nr_indices * 2);
 
+      /* if there's room, append vertices after the indices, else leave
+       * vertices in the original/separate buffer.
+       */
       render->vertex_size = 4 * cell->vertex_info.size;
       render->num_verts = nr_vertices;
       if (ALLOW_INLINE_VERTS &&
-         render->inline_indexes &&
-          vertex_bytes <= cell_batch_free_space(cell)) {
-         /* vertex data inlined, after indices */
-         void *dst = cell_batch_alloc(cell, vertex_bytes);
+          min_index == 0 &&
+          vertex_bytes + 16 <= cell_batch_free_space(cell)) {
+         /* vertex data inlined, after indices, at 16-byte boundary */
+         void *dst = cell_batch_alloc_aligned(cell, vertex_bytes, 16);
          memcpy(dst, vertices, vertex_bytes);
          render->inline_verts = TRUE;
-         render->vertex_data = NULL;
+         render->vertex_buf = ~0;
       }
       else {
+         /* vertex data in separate buffer */
          render->inline_verts = FALSE;
-         render->vertex_data = vertices;
-         ASSERT_ALIGN16(render->vertex_data);
+         ASSERT(cvbr->vertex_buf >= 0);
+         render->vertex_buf = cvbr->vertex_buf;
       }
 
-
       render->xmin = xmin;
       render->ymin = ymin;
       render->xmax = xmax;
       render->ymax = ymax;
    }
 
-#if 01
-   /* XXX this is temporary */
+#if 0
+   /* helpful for debug */
    cell_flush_int(&cell->pipe, PIPE_FLUSH_WAIT);
 #endif
 }
 
 
 static void
-cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, 
-                           unsigned vertex_size, unsigned vertices_used)
-{
-   struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
-
-   /*printf("Free verts %u * %u\n", vertex_size, vertices_used);*/
-   align_free(vertices);
-
-   assert(vertices == cvbr->vertex_buffer);
-   cvbr->vertex_buffer = NULL;
-}
-
-
-static void
 cell_vbuf_destroy(struct vbuf_render *vbr)
 {
    struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
@@ -244,8 +268,15 @@ cell_init_vbuf(struct cell_context *cell)
 
    cell->vbuf_render = CALLOC_STRUCT(cell_vbuf_render);
 
-   cell->vbuf_render->base.max_indices = CELL_MAX_VBUF_INDEXES;
-   cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_MAX_VBUF_SIZE;
+   /* The max number of indexes is what can fix into a batch buffer,
+    * minus the render and release-verts commands.
+    */
+   cell->vbuf_render->base.max_indices
+      = (CELL_BUFFER_SIZE
+         - sizeof(struct cell_command_render)
+         - sizeof(struct cell_command_release_verts))
+      / sizeof(ushort);
+   cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_BUFFER_SIZE;
 
    cell->vbuf_render->base.get_vertex_info = cell_vbuf_get_vertex_info;
    cell->vbuf_render->base.allocate_vertices = cell_vbuf_allocate_vertices;
@@ -255,6 +286,9 @@ cell_init_vbuf(struct cell_context *cell)
    cell->vbuf_render->base.destroy = cell_vbuf_destroy;
 
    cell->vbuf_render->cell = cell;
+#if 1
+   cell->vbuf_render->vertex_buf = ~0;
+#endif
 
    cell->vbuf = draw_vbuf_stage(cell->draw, &cell->vbuf_render->base);
 }
diff --git a/src/mesa/pipe/cell/ppu/cell_vertex_shader.c b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
new file mode 100644
index 0000000000..80dd500b34
--- /dev/null
+++ b/src/mesa/pipe/cell/ppu/cell_vertex_shader.c
@@ -0,0 +1,120 @@
+/*
+ * (C) Copyright IBM Corporation 2008
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file cell_vertex_shader.c
+ * Vertex shader interface routines for Cell.
+ *
+ * \author Ian Romanick <idr@us.ibm.com>
+ */
+
+#include "pipe/p_defines.h"
+#include "pipe/p_context.h"
+#include "pipe/p_winsys.h"
+
+#include "cell_context.h"
+#include "cell_draw_arrays.h"
+#include "cell_spu.h"
+#include "cell_batch.h"
+
+#include "pipe/cell/common.h"
+#include "pipe/draw/draw_context.h"
+#include "pipe/draw/draw_private.h"
+
+/**
+ * Run the vertex shader on all vertices in the vertex queue.
+ * Called by the draw module when the vertx cache needs to be flushed.
+ */
+void
+cell_vertex_shader_queue_flush(struct draw_context *draw)
+{
+   struct cell_context *const cell =
+       (struct cell_context *) draw->driver_private;
+   struct cell_command_vs *const vs = &cell_global.command[0].vs;
+   uint64_t *batch;
+   struct cell_array_info *array_info;
+   unsigned i, j;
+
+   assert(draw->vs.queue_nr != 0);
+
+   /* XXX: do this on statechange: 
+    */
+   draw_update_vertex_fetch(draw);
+
+   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
+      batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*array_info));
+
+      batch[0] = CELL_CMD_STATE_VS_ARRAY_INFO;
+
+      array_info = (struct cell_array_info *) &batch[1];
+      assert(draw->vertex_fetch.src_ptr[i] != NULL);
+      array_info->base = (uintptr_t) draw->vertex_fetch.src_ptr[i];
+      array_info->attr = i;
+      array_info->pitch = draw->vertex_fetch.pitch[i];
+      array_info->format = draw->vertex_element[i].src_format;
+   }
+
+   batch = cell_batch_alloc(cell, sizeof(batch[0])
+                            + sizeof(struct pipe_viewport_state));
+   batch[0] = CELL_CMD_STATE_VIEWPORT;
+   (void) memcpy(&batch[1], &draw->viewport,
+                 sizeof(struct pipe_viewport_state));
+
+   cell_batch_flush(cell);
+
+   vs->opcode = CELL_CMD_VS_EXECUTE;
+   vs->shader.num_outputs = draw->num_vs_outputs;
+   vs->shader.declarations = (uintptr_t) draw->machine.Declarations;
+   vs->shader.num_declarations = draw->machine.NumDeclarations;
+   vs->shader.instructions = (uintptr_t) draw->machine.Instructions;
+   vs->shader.num_instructions = draw->machine.NumInstructions;
+   vs->shader.uniforms = (uintptr_t) draw->user.constants;
+   vs->shader.immediates = (uintptr_t) draw->machine.Imms;
+   vs->shader.num_immediates = draw->machine.ImmLimit / 4;
+   vs->nr_attrs = draw->vertex_fetch.nr_attrs;
+
+   (void) memcpy(vs->plane, draw->plane, sizeof(draw->plane));
+   vs->nr_planes = draw->nr_planes;
+
+   for (i = 0; i < draw->vs.queue_nr; i += SPU_VERTS_PER_BATCH) {
+      const unsigned n = MIN2(SPU_VERTS_PER_BATCH, draw->vs.queue_nr - i);
+
+      for (j = 0; j < n; j++) {
+         vs->elts[j] = draw->vs.queue[i + j].elt;
+         vs->vOut[j] = (uintptr_t) draw->vs.queue[i + j].dest;
+      }
+
+      for (/* empty */; j < SPU_VERTS_PER_BATCH; j++) {
+         vs->elts[j] = vs->elts[0];
+         vs->vOut[j] = vs->vOut[0];
+      }
+
+      vs->num_elts = n;
+      send_mbox_message(cell_global.spe_contexts[0], CELL_CMD_VS_EXECUTE);
+
+      cell_flush_int(& cell->pipe, PIPE_FLUSH_WAIT);
+   }
+
+   draw->vs.queue_nr = 0;
+}
diff --git a/src/mesa/pipe/cell/spu/Makefile b/src/mesa/pipe/cell/spu/Makefile
index 417ae1b072..f202971d73 100644
--- a/src/mesa/pipe/cell/spu/Makefile
+++ b/src/mesa/pipe/cell/spu/Makefile
@@ -17,8 +17,15 @@ PROG_SPU_EMBED_O = $(PROG)_spu-embed.o
 
 SOURCES = \
 	spu_main.c \
+	spu_blend.c \
+	spu_render.c \
+	spu_texture.c \
 	spu_tile.c \
-	spu_tri.c
+	spu_tri.c \
+	spu_exec.c \
+	spu_util.c \
+	spu_vertex_fetch.c \
+	spu_vertex_shader.c
 
 SPU_OBJECTS = $(SOURCES:.c=.o) \
 
diff --git a/src/mesa/pipe/cell/spu/spu_blend.c b/src/mesa/pipe/cell/spu/spu_blend.c
new file mode 100644
index 0000000000..23ec0eeb45
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_blend.c
@@ -0,0 +1,62 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "spu_main.h"
+#include "spu_blend.h"
+#include "spu_colorpack.h"
+
+
+void
+blend_quad(uint itx, uint ity, vector float colors[4])
+{
+   /* simple SRC_ALPHA, ONE_MINUS_SRC_ALPHA blending */
+   vector float fbc00 = spu_unpack_color(spu.ctile.ui[ity][itx]);
+   vector float fbc01 = spu_unpack_color(spu.ctile.ui[ity][itx+1]);
+   vector float fbc10 = spu_unpack_color(spu.ctile.ui[ity+1][itx]);
+   vector float fbc11 = spu_unpack_color(spu.ctile.ui[ity+1][itx+1]);
+
+   vector float alpha00 = spu_splats(spu_extract(colors[0], 3));
+   vector float alpha01 = spu_splats(spu_extract(colors[1], 3));
+   vector float alpha10 = spu_splats(spu_extract(colors[2], 3));
+   vector float alpha11 = spu_splats(spu_extract(colors[3], 3));
+
+   vector float one_minus_alpha00 = spu_sub(spu_splats(1.0f), alpha00);
+   vector float one_minus_alpha01 = spu_sub(spu_splats(1.0f), alpha01);
+   vector float one_minus_alpha10 = spu_sub(spu_splats(1.0f), alpha10);
+   vector float one_minus_alpha11 = spu_sub(spu_splats(1.0f), alpha11);
+
+   colors[0] = spu_add(spu_mul(colors[0], alpha00),
+                       spu_mul(fbc00, one_minus_alpha00));
+   colors[1] = spu_add(spu_mul(colors[1], alpha01),
+                       spu_mul(fbc01, one_minus_alpha01));
+   colors[2] = spu_add(spu_mul(colors[2], alpha10),
+                       spu_mul(fbc10, one_minus_alpha10));
+   colors[3] = spu_add(spu_mul(colors[3], alpha11),
+                       spu_mul(fbc11, one_minus_alpha11));
+}
+
diff --git a/src/mesa/pipe/cell/spu/spu_blend.h b/src/mesa/pipe/cell/spu/spu_blend.h
new file mode 100644
index 0000000000..2b594b578b
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_blend.h
@@ -0,0 +1,37 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_BLEND_H
+#define SPU_BLEND_H
+
+
+extern void
+blend_quad(uint itx, uint ity, vector float colors[4]);
+
+
+#endif /* SPU_BLEND_H */
diff --git a/src/mesa/pipe/cell/spu/spu_colorpack.h b/src/mesa/pipe/cell/spu/spu_colorpack.h
new file mode 100644
index 0000000000..e9fee8a3a6
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_colorpack.h
@@ -0,0 +1,110 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+#ifndef SPU_COLORPACK_H
+#define SPU_COLORPACK_H
+
+
+#include <spu_intrinsics.h>
+
+
+static INLINE unsigned int
+spu_pack_R8G8B8A8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  0, 4, 8, 12, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0 }) );
+
+  return spu_extract(out, 0);
+}
+
+
+static INLINE unsigned int
+spu_pack_A8R8G8B8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  12, 0, 4, 8, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0}) );
+  return spu_extract(out, 0);
+}
+
+
+static INLINE unsigned int
+spu_pack_B8G8R8A8(vector float rgba)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, ((vector unsigned char) {
+                                  8, 4, 0, 12, 0, 0, 0, 0, 
+                                  0, 0, 0, 0, 0, 0, 0, 0}) );
+  return spu_extract(out, 0);
+}
+
+
+static INLINE unsigned int
+spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle)
+{
+  vector unsigned int out = spu_convtu(rgba, 32);
+  out = spu_shuffle(out, out, shuffle);
+  return spu_extract(out, 0);
+}
+
+
+static INLINE vector float
+spu_unpack_color(uint color)
+{
+   vector unsigned int color_u4 = spu_splats(color);
+   color_u4 = spu_shuffle(color_u4, color_u4,
+                          ((vector unsigned char) {
+                             0, 0, 0, 0,
+                             5, 5, 5, 5,
+                             10, 10, 10, 10,
+                             15, 15, 15, 15}) );
+   return spu_convtf(color_u4, 32);
+}
+
+
+static INLINE vector float
+spu_unpack_A8R8G8B8(uint color)
+{
+   vector unsigned int color_u4 = spu_splats(color);
+   color_u4 = spu_shuffle(color_u4, color_u4,
+                          ((vector unsigned char) {
+                             5, 5, 5, 5,
+                             10, 10, 10, 10,
+                             15, 15, 15, 15,
+                             0, 0, 0, 0}) );
+
+   return spu_convtf(color_u4, 32);
+}
+
+
+#endif /* SPU_COLORPACK_H */
diff --git a/src/mesa/pipe/cell/spu/spu_exec.c b/src/mesa/pipe/cell/spu/spu_exec.c
new file mode 100644
index 0000000000..e51008b9b3
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_exec.c
@@ -0,0 +1,1948 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI interpretor/executor.
+ *
+ * Flow control information:
+ *
+ * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
+ * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
+ * care since a condition may be true for some quad components but false
+ * for other components.
+ *
+ * We basically execute all statements (even if they're in the part of
+ * an IF/ELSE clause that's "not taken") and use a special mask to
+ * control writing to destination registers.  This is the ExecMask.
+ * See store_dest().
+ *
+ * The ExecMask is computed from three other masks (CondMask, LoopMask and
+ * ContMask) which are controlled by the flow control instructions (namely:
+ * (IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
+ *
+ *
+ * Authors:
+ *   Michal Krol
+ *   Brian Paul
+ */
+
+#include <libmisc.h>
+#include <spu_mfcio.h>
+#include <transpose_matrix4x4.h>
+#include <simdmath/ceilf4.h>
+#include <simdmath/cosf4.h>
+#include <simdmath/divf4.h>
+#include <simdmath/floorf4.h>
+#include <simdmath/log2f4.h>
+#include <simdmath/powf4.h>
+#include <simdmath/sinf4.h>
+#include <simdmath/sqrtf4.h>
+#include <simdmath/truncf4.h>
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/tgsi/util/tgsi_parse.h"
+#include "pipe/tgsi/util/tgsi_util.h"
+#include "spu_exec.h"
+#include "spu_main.h"
+#include "spu_vertex_shader.h"
+
+#define TILE_TOP_LEFT     0
+#define TILE_TOP_RIGHT    1
+#define TILE_BOTTOM_LEFT  2
+#define TILE_BOTTOM_RIGHT 3
+
+/*
+ * Shorthand locations of various utility registers (_I = Index, _C = Channel)
+ */
+#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
+#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
+#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
+#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
+#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
+#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
+#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
+#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
+#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
+#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
+#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
+#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
+#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
+#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
+#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
+#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
+#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
+#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
+#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
+#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
+#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
+#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
+#define TEMP_R0            TGSI_EXEC_TEMP_R0
+
+#define FOR_EACH_CHANNEL(CHAN)\
+   for (CHAN = 0; CHAN < 4; CHAN++)
+
+#define IS_CHANNEL_ENABLED(INST, CHAN)\
+   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define IS_CHANNEL_ENABLED2(INST, CHAN)\
+   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
+
+
+/** The execution mask depends on the conditional mask and the loop mask */
+#define UPDATE_EXEC_MASK(MACH) \
+      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
+
+
+#define CHAN_X  0
+#define CHAN_Y  1
+#define CHAN_Z  2
+#define CHAN_W  3
+
+
+
+/**
+ * Initialize machine state by expanding tokens to full instructions,
+ * allocating temporary storage, setting up constants, etc.
+ * After this, we can call spu_exec_machine_run() many times.
+ */
+void
+spu_exec_machine_init(struct spu_exec_machine *mach,
+                      uint numSamplers,
+                      struct spu_sampler *samplers,
+                      unsigned processor)
+{
+   qword zero;
+   qword not_zero;
+   uint i;
+
+   mach->Samplers = samplers;
+   mach->Processor = processor;
+   mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS];
+
+   zero = si_xor(zero, zero);
+   not_zero = si_xori(zero, 0xff);
+
+   /* Setup constants. */
+   mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero;
+   mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero;
+   mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].q = si_shli(not_zero, -1);
+   mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].q = si_shli(not_zero, 31);
+
+   mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q = (qword) spu_splats(1.0f);
+   mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q = (qword) spu_splats(2.0f);
+   mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q = (qword) spu_splats(128.0f);
+   mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q = (qword) spu_splats(-128.0f);
+}
+
+
+static INLINE qword
+micro_abs(qword src)
+{
+   return si_rotmi(si_shli(src, 1), -1);
+}
+
+static INLINE qword
+micro_ceil(qword src)
+{
+   return (qword) _ceilf4((vec_float4) src);
+}
+
+static INLINE qword
+micro_cos(qword src)
+{
+   return (qword) _cosf4((vec_float4) src);
+}
+
+static const qword br_shuf = {
+   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
+   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
+   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
+   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
+   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
+   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
+   TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
+   TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
+};
+
+static const qword bl_shuf = {
+   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
+   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
+   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
+   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
+   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
+   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
+   TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
+   TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
+};
+
+static const qword tl_shuf = {
+   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
+   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
+   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
+   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
+   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
+   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
+   TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
+   TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
+};
+
+static qword
+micro_ddx(qword src)
+{
+   qword bottom_right = si_shufb(src, src, br_shuf);
+   qword bottom_left = si_shufb(src, src, bl_shuf);
+
+   return si_fs(bottom_right, bottom_left);
+}
+
+static qword
+micro_ddy(qword src)
+{
+   qword top_left = si_shufb(src, src, tl_shuf);
+   qword bottom_left = si_shufb(src, src, bl_shuf);
+
+   return si_fs(top_left, bottom_left);
+}
+
+static INLINE qword
+micro_div(qword src0, qword src1)
+{
+   return (qword) _divf4((vec_float4) src0, (vec_float4) src1);
+}
+
+static qword
+micro_flr(qword src)
+{
+   return (qword) _floorf4((vec_float4) src);
+}
+
+static qword
+micro_frc(qword src)
+{
+   return si_fs(src, (qword) _floorf4((vec_float4) src));
+}
+
+static INLINE qword
+micro_ge(qword src0, qword src1)
+{
+   return si_or(si_fceq(src0, src1), si_fcgt(src0, src1));
+}
+
+static qword
+micro_lg2(qword src)
+{
+   return (qword) _log2f4((vec_float4) src);
+}
+
+static INLINE qword
+micro_lt(qword src0, qword src1)
+{
+   const qword tmp = si_or(si_fceq(src0, src1), si_fcgt(src0, src1));
+
+   return si_xori(tmp, 0xff);
+}
+
+static INLINE qword
+micro_max(qword src0, qword src1)
+{
+   return si_selb(src1, src0, si_fcgt(src0, src1));
+}
+
+static INLINE qword
+micro_min(qword src0, qword src1)
+{
+   return si_selb(src0, src1, si_fcgt(src0, src1));
+}
+
+static qword
+micro_neg(qword src)
+{
+   return si_xor(src, (qword) spu_splats(0x80000000));
+}
+
+static qword
+micro_set_sign(qword src)
+{
+   return si_or(src, (qword) spu_splats(0x80000000));
+}
+
+static qword
+micro_pow(qword src0, qword src1)
+{
+   return (qword) _powf4((vec_float4) src0, (vec_float4) src1);
+}
+
+static qword
+micro_rnd(qword src)
+{
+   const qword half = (qword) spu_splats(0.5f);
+
+   /* May be able to use _roundf4.  There may be some difference, though.
+    */
+   return (qword) _floorf4((vec_float4) si_fa(src, half));
+}
+
+static INLINE qword
+micro_ishr(qword src0, qword src1)
+{
+   return si_rotma(src0, si_sfi(src1, 0));
+}
+
+static qword
+micro_trunc(qword src)
+{
+   return (qword) _truncf4((vec_float4) src);
+}
+
+static qword
+micro_sin(qword src)
+{
+   return (qword) _sinf4((vec_float4) src);
+}
+
+static INLINE qword
+micro_sqrt(qword src)
+{
+   return (qword) _sqrtf4((vec_float4) src);
+}
+
+static void
+fetch_src_file_channel(
+   const struct spu_exec_machine *mach,
+   const uint file,
+   const uint swizzle,
+   const union spu_exec_channel *index,
+   union spu_exec_channel *chan )
+{
+   switch( swizzle ) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch( file ) {
+      case TGSI_FILE_CONSTANT: {
+         unsigned char buffer[32] ALIGN16_ATTRIB;
+         unsigned i;
+
+         for (i = 0; i < 4; i++) {
+            const float *ptr = mach->Consts[index->i[i]];
+            const uint64_t addr = (uint64_t)(uintptr_t) ptr;
+            const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
+
+            mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
+            wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+            (void) memcpy(& chan->f[i], &buffer[(addr & 0x0f) 
+                + (sizeof(float) * swizzle)], sizeof(float));
+         }
+         break;
+      }
+
+      case TGSI_FILE_INPUT:
+         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_TEMPORARY:
+         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_IMMEDIATE:
+         assert( index->i[0] < (int) mach->ImmLimit );
+         assert( index->i[1] < (int) mach->ImmLimit );
+         assert( index->i[2] < (int) mach->ImmLimit );
+         assert( index->i[3] < (int) mach->ImmLimit );
+
+         chan->f[0] = mach->Imms[index->i[0]][swizzle];
+         chan->f[1] = mach->Imms[index->i[1]][swizzle];
+         chan->f[2] = mach->Imms[index->i[2]][swizzle];
+         chan->f[3] = mach->Imms[index->i[3]][swizzle];
+         break;
+
+      case TGSI_FILE_ADDRESS:
+         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_OUTPUT:
+         /* vertex/fragment output vars can be read too */
+         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      default:
+         assert( 0 );
+      }
+      break;
+
+   case TGSI_EXTSWIZZLE_ZERO:
+      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
+      break;
+
+   case TGSI_EXTSWIZZLE_ONE:
+      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+static void
+fetch_source(
+   const struct spu_exec_machine *mach,
+   union spu_exec_channel *chan,
+   const struct tgsi_full_src_register *reg,
+   const uint chan_index )
+{
+   union spu_exec_channel index;
+   uint swizzle;
+
+   index.i[0] =
+   index.i[1] =
+   index.i[2] =
+   index.i[3] = reg->SrcRegister.Index;
+
+   if (reg->SrcRegister.Indirect) {
+      union spu_exec_channel index2;
+      union spu_exec_channel indir_index;
+
+      index2.i[0] =
+      index2.i[1] =
+      index2.i[2] =
+      index2.i[3] = reg->SrcRegisterInd.Index;
+
+      swizzle = tgsi_util_get_src_register_swizzle(&reg->SrcRegisterInd,
+                                                   CHAN_X);
+      fetch_src_file_channel(
+         mach,
+         reg->SrcRegisterInd.File,
+         swizzle,
+         &index2,
+         &indir_index );
+
+      index.q = si_a(index.q, indir_index.q);
+   }
+
+   if( reg->SrcRegister.Dimension ) {
+      switch( reg->SrcRegister.File ) {
+      case TGSI_FILE_INPUT:
+         index.q = si_mpyi(index.q, 17);
+         break;
+      case TGSI_FILE_CONSTANT:
+         index.q = si_shli(index.q, 12);
+         break;
+      default:
+         assert( 0 );
+      }
+
+      index.i[0] += reg->SrcRegisterDim.Index;
+      index.i[1] += reg->SrcRegisterDim.Index;
+      index.i[2] += reg->SrcRegisterDim.Index;
+      index.i[3] += reg->SrcRegisterDim.Index;
+
+      if (reg->SrcRegisterDim.Indirect) {
+         union spu_exec_channel index2;
+         union spu_exec_channel indir_index;
+
+         index2.i[0] =
+         index2.i[1] =
+         index2.i[2] =
+         index2.i[3] = reg->SrcRegisterDimInd.Index;
+
+         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
+         fetch_src_file_channel(
+            mach,
+            reg->SrcRegisterDimInd.File,
+            swizzle,
+            &index2,
+            &indir_index );
+
+         index.q = si_a(index.q, indir_index.q);
+      }
+   }
+
+   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+   fetch_src_file_channel(
+      mach,
+      reg->SrcRegister.File,
+      swizzle,
+      &index,
+      chan );
+
+   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
+   case TGSI_UTIL_SIGN_CLEAR:
+      chan->q = micro_abs(chan->q);
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      chan->q = micro_set_sign(chan->q);
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      chan->q = micro_neg(chan->q);
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+   }
+
+   if (reg->SrcRegisterExtMod.Complement) {
+      chan->q = si_fs(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, chan->q);
+   }
+}
+
+static void
+store_dest(
+   struct spu_exec_machine *mach,
+   const union spu_exec_channel *chan,
+   const struct tgsi_full_dst_register *reg,
+   const struct tgsi_full_instruction *inst,
+   uint chan_index )
+{
+   union spu_exec_channel *dst;
+
+   switch( reg->DstRegister.File ) {
+   case TGSI_FILE_NULL:
+      return;
+
+   case TGSI_FILE_OUTPUT:
+      dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
+                           + reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   default:
+      assert( 0 );
+      return;
+   }
+
+   switch (inst->Instruction.Saturate)
+   {
+   case TGSI_SAT_NONE:
+      if (mach->ExecMask & 0x1)
+         dst->i[0] = chan->i[0];
+      if (mach->ExecMask & 0x2)
+         dst->i[1] = chan->i[1];
+      if (mach->ExecMask & 0x4)
+         dst->i[2] = chan->i[2];
+      if (mach->ExecMask & 0x8)
+         dst->i[3] = chan->i[3];
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* XXX need to obey ExecMask here */
+      dst->q = micro_max(chan->q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+      dst->q = micro_min(dst->q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q);
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+#define FETCH(VAL,INDEX,CHAN)\
+    fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)
+
+#define STORE(VAL,INDEX,CHAN)\
+    store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
+
+
+/**
+ * Execute ARB-style KIL which is predicated by a src register.
+ * Kill fragment if any of the four values is less than zero.
+ */
+static void
+exec_kilp(struct spu_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   uint uniquemask;
+   uint chan_index;
+   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
+   union spu_exec_channel r[1];
+
+   /* This mask stores component bits that were already tested. Note that
+    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
+    * tested. */
+   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+
+   for (chan_index = 0; chan_index < 4; chan_index++)
+   {
+      uint swizzle;
+      uint i;
+
+      /* unswizzle channel */
+      swizzle = tgsi_util_get_full_src_register_extswizzle (
+                        &inst->FullSrcRegisters[0],
+                        chan_index);
+
+      /* check if the component has not been already tested */
+      if (uniquemask & (1 << swizzle))
+         continue;
+      uniquemask |= 1 << swizzle;
+
+      FETCH(&r[0], 0, chan_index);
+      for (i = 0; i < 4; i++)
+         if (r[0].f[i] < 0.0f)
+            kilmask |= 1 << i;
+   }
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
+}
+
+
+/*
+ * Fetch a texel using STR texture coordinates.
+ */
+static void
+fetch_texel( struct spu_sampler *sampler,
+             const union spu_exec_channel *s,
+             const union spu_exec_channel *t,
+             const union spu_exec_channel *p,
+             float lodbias,  /* XXX should be float[4] */
+             union spu_exec_channel *r,
+             union spu_exec_channel *g,
+             union spu_exec_channel *b,
+             union spu_exec_channel *a )
+{
+   qword rgba[4];
+   qword out[4];
+
+   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, (float *) rgba);
+
+   _transpose_matrix4x4(out, rgba);
+   r->q = out[0];
+   g->q = out[1];
+   b->q = out[2];
+   a->q = out[3];
+}
+
+
+static void
+exec_tex(struct spu_exec_machine *mach,
+         const struct tgsi_full_instruction *inst,
+         boolean biasLod)
+{
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   union spu_exec_channel r[8];
+   uint chan_index;
+   float lodBias;
+
+   /*   printf("Sampler %u unit %u\n", sampler, unit); */
+
+   switch (inst->InstructionExtTexture.Texture) {
+   case TGSI_TEXTURE_1D:
+
+      FETCH(&r[0], 0, CHAN_X);
+
+      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+      case TGSI_EXTSWIZZLE_W:
+         FETCH(&r[1], 0, CHAN_W);
+         r[0].q = micro_div(r[0].q, r[1].q);
+         break;
+
+      case TGSI_EXTSWIZZLE_ONE:
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if (biasLod) {
+         FETCH(&r[1], 0, CHAN_W);
+         lodBias = r[2].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
+                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
+      break;
+
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+      case TGSI_EXTSWIZZLE_W:
+         FETCH(&r[3], 0, CHAN_W);
+         r[0].q = micro_div(r[0].q, r[3].q);
+         r[1].q = micro_div(r[1].q, r[3].q);
+         r[2].q = micro_div(r[2].q, r[3].q);
+         break;
+
+      case TGSI_EXTSWIZZLE_ONE:
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
+                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
+      break;
+
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+      case TGSI_EXTSWIZZLE_W:
+         FETCH(&r[3], 0, CHAN_W);
+         r[0].q = micro_div(r[0].q, r[3].q);
+         r[1].q = micro_div(r[1].q, r[3].q);
+         r[2].q = micro_div(r[2].q, r[3].q);
+         break;
+
+      case TGSI_EXTSWIZZLE_ONE:
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,
+                  &r[0], &r[1], &r[2], &r[3]);
+      break;
+
+   default:
+      assert (0);
+   }
+
+   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE( &r[chan_index], 0, chan_index );
+   }
+}
+
+
+
+static void
+constant_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   unsigned i;
+
+   for( i = 0; i < QUAD_SIZE; i++ ) {
+      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
+   }
+}
+
+static void
+linear_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const float y = mach->QuadPos.xyzw[1].f[0];
+   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
+   const float dady = mach->InterpCoefs[attrib].dady[chan];
+   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
+   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
+   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
+   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
+   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
+}
+
+static void
+perspective_interpolation(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const float y = mach->QuadPos.xyzw[1].f[0];
+   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
+   const float dady = mach->InterpCoefs[attrib].dady[chan];
+   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
+   const float *w = mach->QuadPos.xyzw[3].f;
+   /* divide by W here */
+   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
+   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
+   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
+   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
+}
+
+
+typedef void (* interpolation_func)(
+   struct spu_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan );
+
+static void
+exec_declaration(struct spu_exec_machine *mach,
+                 const struct tgsi_full_declaration *decl)
+{
+   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
+      if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+         unsigned first, last, mask;
+         interpolation_func interp;
+
+         assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );
+
+         first = decl->u.DeclarationRange.First;
+         last = decl->u.DeclarationRange.Last;
+         mask = decl->Declaration.UsageMask;
+
+         switch( decl->Interpolation.Interpolate ) {
+         case TGSI_INTERPOLATE_CONSTANT:
+            interp = constant_interpolation;
+            break;
+
+         case TGSI_INTERPOLATE_LINEAR:
+            interp = linear_interpolation;
+            break;
+
+         case TGSI_INTERPOLATE_PERSPECTIVE:
+            interp = perspective_interpolation;
+            break;
+
+         default:
+            assert( 0 );
+         }
+
+         if( mask == TGSI_WRITEMASK_XYZW ) {
+            unsigned i, j;
+
+            for( i = first; i <= last; i++ ) {
+               for( j = 0; j < NUM_CHANNELS; j++ ) {
+                  interp( mach, i, j );
+               }
+            }
+         }
+         else {
+            unsigned i, j;
+
+            for( j = 0; j < NUM_CHANNELS; j++ ) {
+               if( mask & (1 << j) ) {
+                  for( i = first; i <= last; i++ ) {
+                     interp( mach, i, j );
+                  }
+               }
+            }
+         }
+      }
+   }
+}
+
+static void
+exec_instruction(
+   struct spu_exec_machine *mach,
+   const struct tgsi_full_instruction *inst,
+   int *pc )
+{
+   uint chan_index;
+   union spu_exec_channel r[8];
+
+   (*pc)++;
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 FETCH( &r[0], 0, chan_index );
+         r[0].q = si_cflts(r[0].q, 0);
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOV:
+   /* TGSI_OPCODE_SWZ */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LIT:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 FETCH( &r[0], 0, CHAN_X );
+         if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+            r[0].q = micro_max(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+	    STORE( &r[0], 0, CHAN_Y );
+	 }
+
+         if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+            FETCH( &r[1], 0, CHAN_Y );
+            r[1].q = micro_max(r[1].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+
+            FETCH( &r[2], 0, CHAN_W );
+            r[2].q = micro_min(r[2].q, mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q);
+            r[2].q = micro_max(r[2].q, mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q);
+            r[1].q = micro_pow(r[1].q, r[2].q);
+
+            /* r0 = (r0 > 0.0) ? r1 : 0.0
+             */
+            r[0].q = si_fcgt(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
+            r[0].q = si_selb(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q, r[1].q,
+                             r[0].q);
+            STORE( &r[0], 0, CHAN_Z );
+         }
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_RCP:
+   /* TGSI_OPCODE_RECIP */
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RSQ:
+   /* TGSI_OPCODE_RECIPSQRT */
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_sqrt(r[0].q);
+      r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_LOG:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
+      {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         r[0].q = si_fm(r[0].q, r[1].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_ADD:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_fa(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP3:
+   /* TGSI_OPCODE_DOT3 */
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+
+      FETCH( &r[1], 0, CHAN_Z );
+      FETCH( &r[2], 1, CHAN_Z );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+    case TGSI_OPCODE_DP4:
+    /* TGSI_OPCODE_DOT4 */
+       FETCH(&r[0], 0, CHAN_X);
+       FETCH(&r[1], 1, CHAN_X);
+
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+       FETCH(&r[1], 0, CHAN_Y);
+       FETCH(&r[2], 1, CHAN_Y);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+       FETCH(&r[1], 0, CHAN_Z);
+       FETCH(&r[2], 1, CHAN_Z);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+       FETCH(&r[1], 0, CHAN_W);
+       FETCH(&r[2], 1, CHAN_W);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DST:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+	 FETCH( &r[0], 0, CHAN_Y );
+	 FETCH( &r[1], 1, CHAN_Y);
+      r[0].q = si_fm(r[0].q, r[1].q);
+	 STORE( &r[0], 0, CHAN_Y );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 FETCH( &r[0], 0, CHAN_Z );
+	 STORE( &r[0], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 FETCH( &r[0], 1, CHAN_W );
+	 STORE( &r[0], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MIN:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         r[0].q = micro_min(r[0].q, r[1].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_MAX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         r[0].q = micro_max(r[0].q, r[1].q);
+
+         STORE(&r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = micro_ge(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, 0xff);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = micro_ge(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MAD:
+   /* TGSI_OPCODE_MADD */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         FETCH( &r[2], 2, chan_index );
+         r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         r[0].q = si_fs(r[0].q, r[1].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_LERP:
+   /* TGSI_OPCODE_LRP */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         r[1].q = si_fs(r[1].q, r[2].q);
+         r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_CND:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CND0:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DOT2ADD:
+      /* TGSI_OPCODE_DP2A */
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_INDEX:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_NEGATE:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_FRAC:
+   /* TGSI_OPCODE_FRC */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_frc(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_FLOOR:
+   /* TGSI_OPCODE_FLR */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_flr(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_rnd(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXPBASE2:
+    /* TGSI_OPCODE_EX2 */
+      FETCH(&r[0], 0, CHAN_X);
+
+      r[0].q = micro_pow(mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LOGBASE2:
+   /* TGSI_OPCODE_LG2 */
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_lg2(r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_POWER:
+      /* TGSI_OPCODE_POW */
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      r[0].q = micro_pow(r[0].q, r[1].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CROSSPRODUCT:
+      /* TGSI_OPCODE_XPD */
+      FETCH(&r[0], 0, CHAN_Y);
+      FETCH(&r[1], 1, CHAN_Z);
+      FETCH(&r[3], 0, CHAN_Z);
+      FETCH(&r[4], 1, CHAN_Y);
+
+      /* r2 = (r0 * r1) - (r3 * r5)
+       */
+      r[2].q = si_fm(r[3].q, r[5].q);
+      r[2].q = si_fms(r[0].q, r[1].q, r[2].q);
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+         STORE( &r[2], 0, CHAN_X );
+      }
+
+      FETCH(&r[2], 1, CHAN_X);
+      FETCH(&r[5], 0, CHAN_X);
+
+      /* r3 = (r3 * r2) - (r1 * r5)
+       */
+      r[1].q = si_fm(r[1].q, r[5].q);
+      r[3].q = si_fms(r[3].q, r[2].q, r[1].q);
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+         STORE( &r[3], 0, CHAN_Y );
+      }
+
+      /* r5 = (r5 * r4) - (r0 * r2)
+       */
+      r[0].q = si_fm(r[0].q, r[2].q);
+      r[5].q = si_fms(r[5].q, r[4].q, r[0].q);
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         STORE( &r[5], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+    case TGSI_OPCODE_MULTIPLYMATRIX:
+       assert (0);
+       break;
+
+    case TGSI_OPCODE_ABS:
+       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+          FETCH(&r[0], 0, chan_index);
+
+          r[0].q = micro_abs(r[0].q);
+
+          STORE(&r[0], 0, chan_index);
+       }
+       break;
+
+   case TGSI_OPCODE_RCC:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DPH:
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 1, CHAN_Y);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FETCH(&r[1], 0, CHAN_Z);
+      FETCH(&r[2], 1, CHAN_Z);
+
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FETCH(&r[1], 1, CHAN_W);
+
+      r[0].q = si_fa(r[0].q, r[1].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      FETCH(&r[0], 0, CHAN_X);
+
+      r[0].q = micro_cos(r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_ddx(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDY:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_ddy(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_KILP:
+      exec_kilp (mach, inst);
+      break;
+
+   case TGSI_OPCODE_KIL:
+      /* for enabled ExecMask bits, set the killed bit */
+      mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= mach->ExecMask;
+      break;
+
+   case TGSI_OPCODE_PK2H:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK2US:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK4B:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK4UB:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_RFL:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = si_fceq(r[0].q, r[1].q);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SFL:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_SGT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_fcgt(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SIN:
+      FETCH( &r[0], 0, CHAN_X );
+      r[0].q = micro_sin(r[0].q);
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = si_fcgt(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, 0xff);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SNE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = si_fceq(r[0].q, r[1].q);
+         r[0].q = si_xori(r[0].q, 0xff);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_STR:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TEX:
+      /* simple texture lookup */
+      /* src[0] = texcoord */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXB:
+      /* Texture lookup with lod bias */
+      /* src[0] = texcoord (src[0].w = load bias) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE);
+      break;
+
+   case TGSI_OPCODE_TXD:
+      /* Texture lookup with explict partial derivatives */
+      /* src[0] = texcoord */
+      /* src[1] = d[strq]/dx */
+      /* src[2] = d[strq]/dy */
+      /* src[3] = sampler unit */
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXL:
+      /* Texture lookup with explit LOD */
+      /* src[0] = texcoord (src[0].w = load bias) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE);
+      break;
+
+   case TGSI_OPCODE_UP2H:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP2US:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP4B:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP4UB:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_X2D:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ARA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ARR:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_BRA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CAL:
+      /* skip the call if no execution channels are enabled */
+      if (mach->ExecMask) {
+         /* do the call */
+
+         /* push the Cond, Loop, Cont stacks */
+         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+
+         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
+         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
+
+         /* note that PC was already incremented above */
+         mach->CallStack[mach->CallStackTop++] = *pc;
+         *pc = inst->InstructionExtLabel.Label;
+      }
+      break;
+
+   case TGSI_OPCODE_RET:
+      mach->FuncMask &= ~mach->ExecMask;
+      UPDATE_EXEC_MASK(mach);
+
+      if (mach->ExecMask == 0x0) {
+         /* really return now (otherwise, keep executing */
+
+         if (mach->CallStackTop == 0) {
+            /* returning from main() */
+            *pc = -1;
+            return;
+         }
+         *pc = mach->CallStack[--mach->CallStackTop];
+
+         /* pop the Cond, Loop, Cont stacks */
+         assert(mach->CondStackTop > 0);
+         mach->CondMask = mach->CondStack[--mach->CondStackTop];
+         assert(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         assert(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+         assert(mach->FuncStackTop > 0);
+         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
+
+         UPDATE_EXEC_MASK(mach);
+      }
+      break;
+
+   case TGSI_OPCODE_SSG:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CMP:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         /* r0 = (r0 < 0.0) ? r1 : r2
+          */
+         r[3].q = si_xor(r[3].q, r[3].q);
+         r[0].q = micro_lt(r[0].q, r[3].q);
+         r[0].q = si_selb(r[1].q, r[2].q, r[0].q);
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_SCS:
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         FETCH( &r[0], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
+         r[1].q = micro_cos(r[0].q);
+         STORE( &r[1], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         r[1].q = micro_sin(r[0].q);
+         STORE( &r[1], 0, CHAN_Y );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_NRM:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DIV:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_DP2:
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      r[0].q = si_fm(r[0].q, r[1].q);
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_IF:
+      /* push CondMask */
+      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+      FETCH( &r[0], 0, CHAN_X );
+      /* update CondMask */
+      if( ! r[0].u[0] ) {
+         mach->CondMask &= ~0x1;
+      }
+      if( ! r[0].u[1] ) {
+         mach->CondMask &= ~0x2;
+      }
+      if( ! r[0].u[2] ) {
+         mach->CondMask &= ~0x4;
+      }
+      if( ! r[0].u[3] ) {
+         mach->CondMask &= ~0x8;
+      }
+      UPDATE_EXEC_MASK(mach);
+      /* Todo: If CondMask==0, jump to ELSE */
+      break;
+
+   case TGSI_OPCODE_ELSE:
+      /* invert CondMask wrt previous mask */
+      {
+         uint prevMask;
+         assert(mach->CondStackTop > 0);
+         prevMask = mach->CondStack[mach->CondStackTop - 1];
+         mach->CondMask = ~mach->CondMask & prevMask;
+         UPDATE_EXEC_MASK(mach);
+         /* Todo: If CondMask==0, jump to ENDIF */
+      }
+      break;
+
+   case TGSI_OPCODE_ENDIF:
+      /* pop CondMask */
+      assert(mach->CondStackTop > 0);
+      mach->CondMask = mach->CondStack[--mach->CondStackTop];
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_END:
+      /* halt execution */
+      *pc = -1;
+      break;
+
+   case TGSI_OPCODE_REP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ENDREP:
+       assert (0);
+       break;
+
+   case TGSI_OPCODE_PUSHA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_POPA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_ceil(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_I2F:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = si_csflt(r[0].q, 0);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_NOT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = si_xorbi(r[0].q, 0xff);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         r[0].q = micro_trunc(r[0].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+
+         r[0].q = si_shl(r[0].q, r[1].q);
+
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = micro_ishr(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_AND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_and(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_OR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_or(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOD:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_XOR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         r[0].q = si_xor(r[0].q, r[1].q);
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SAD:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXF:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXQ:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_EMIT:
+      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
+      break;
+
+   case TGSI_OPCODE_ENDPRIM:
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
+      break;
+
+   case TGSI_OPCODE_LOOP:
+      /* fall-through (for now) */
+   case TGSI_OPCODE_BGNLOOP2:
+      /* push LoopMask and ContMasks */
+      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+      break;
+
+   case TGSI_OPCODE_ENDLOOP:
+      /* fall-through (for now at least) */
+   case TGSI_OPCODE_ENDLOOP2:
+      /* Restore ContMask, but don't pop */
+      assert(mach->ContStackTop > 0);
+      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
+      if (mach->LoopMask) {
+         /* repeat loop: jump to instruction just past BGNLOOP */
+         *pc = inst->InstructionExtLabel.Label + 1;
+      }
+      else {
+         /* exit loop: pop LoopMask */
+         assert(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         /* pop ContMask */
+         assert(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+      }
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BRK:
+      /* turn off loop channels for each enabled exec channel */
+      mach->LoopMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_CONT:
+      /* turn off cont channels for each enabled exec channel */
+      mach->ContMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BGNSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_ENDSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_NOISE1:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE2:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE3:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE4:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOP:
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+
+/**
+ * Run TGSI interpreter.
+ * \return bitmask of "alive" quad components
+ */
+uint
+spu_exec_machine_run( struct spu_exec_machine *mach )
+{
+   uint i;
+   int pc = 0;
+
+   mach->CondMask = 0xf;
+   mach->LoopMask = 0xf;
+   mach->ContMask = 0xf;
+   mach->FuncMask = 0xf;
+   mach->ExecMask = 0xf;
+
+   mach->CondStackTop = 0; /* temporarily subvert this assertion */
+   assert(mach->CondStackTop == 0);
+   assert(mach->LoopStackTop == 0);
+   assert(mach->ContStackTop == 0);
+   assert(mach->CallStackTop == 0);
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
+   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
+
+   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
+      mach->Primitives[0] = 0;
+   }
+
+
+   /* execute declarations (interpolants) */
+   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
+      for (i = 0; i < mach->NumDeclarations; i++) {
+	 uint8_t buffer[sizeof(struct tgsi_full_declaration) + 32] ALIGN16_ATTRIB;
+	 struct tgsi_full_declaration decl;
+	 unsigned long decl_addr = (unsigned long) (mach->Declarations+i);
+	 unsigned size = ((sizeof(decl) + (decl_addr & 0x0f) + 0x0f) & ~0x0f);
+
+	 mfc_get(buffer, decl_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
+	 wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
+
+	 memcpy(& decl, buffer + (decl_addr & 0x0f), sizeof(decl));
+	 exec_declaration( mach, &decl );
+      }
+   }
+
+   /* execute instructions, until pc is set to -1 */
+   while (pc != -1) {
+      uint8_t buffer[sizeof(struct tgsi_full_instruction) + 32] ALIGN16_ATTRIB;
+      struct tgsi_full_instruction inst;
+      unsigned long inst_addr = (unsigned long) (mach->Instructions + pc);
+      unsigned size = ((sizeof(inst) + (inst_addr & 0x0f) + 0x0f) & ~0x0f);
+
+      assert(pc < mach->NumInstructions);
+      mfc_get(buffer, inst_addr & ~0x0f, size, TAG_INSTRUCTION_FETCH, 0, 0);
+      wait_on_mask(1 << TAG_INSTRUCTION_FETCH);
+
+      memcpy(& inst, buffer + (inst_addr & 0x0f), sizeof(inst));
+      exec_instruction( mach, & inst, &pc );
+   }
+
+#if 0
+   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
+   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
+      /*
+       * Scale back depth component.
+       */
+      for (i = 0; i < 4; i++)
+         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
+   }
+#endif
+
+   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+}
+
+
diff --git a/src/mesa/pipe/cell/spu/spu_exec.h b/src/mesa/pipe/cell/spu/spu_exec.h
new file mode 100644
index 0000000000..b4c7661ef6
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_exec.h
@@ -0,0 +1,172 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#if !defined SPU_EXEC_H
+#define SPU_EXEC_H
+
+#include "pipe/p_compiler.h"
+#include "pipe/tgsi/exec/tgsi_exec.h"
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+/**
+  * Registers may be treated as float, signed int or unsigned int.
+  */
+union spu_exec_channel
+{
+   float    f[QUAD_SIZE];
+   int      i[QUAD_SIZE];
+   unsigned u[QUAD_SIZE];
+   qword    q;
+};
+
+/**
+  * A vector[RGBA] of channels[4 pixels]
+  */
+struct spu_exec_vector
+{
+   union spu_exec_channel xyzw[NUM_CHANNELS];
+};
+
+/**
+ * For fragment programs, information for computing fragment input
+ * values from plane equation of the triangle/line.
+ */
+struct spu_interp_coef
+{
+   float a0[NUM_CHANNELS];	/* in an xyzw layout */
+   float dadx[NUM_CHANNELS];
+   float dady[NUM_CHANNELS];
+};
+
+
+struct softpipe_tile_cache;  /**< Opaque to TGSI */
+
+/**
+ * Information for sampling textures, which must be implemented
+ * by code outside the TGSI executor.
+ */
+struct spu_sampler
+{
+   const struct pipe_sampler_state *state;
+   struct pipe_texture *texture;
+   /** Get samples for four fragments in a quad */
+   void (*get_samples)(struct spu_sampler *sampler,
+                       const float s[QUAD_SIZE],
+                       const float t[QUAD_SIZE],
+                       const float p[QUAD_SIZE],
+                       float lodbias,
+                       float rgba[NUM_CHANNELS][QUAD_SIZE]);
+   void *pipe; /*XXX temporary*/
+   struct softpipe_tile_cache *cache;
+};
+
+
+/**
+ * Run-time virtual machine state for executing TGSI shader.
+ */
+struct spu_exec_machine
+{
+   /*
+    * 32 program temporaries
+    * 4  internal temporaries
+    * 1  address
+    */
+   struct spu_exec_vector       Temps[TGSI_EXEC_NUM_TEMPS 
+				      + TGSI_EXEC_NUM_ADDRS + 1]
+       ALIGN16_ATTRIB;
+
+   struct spu_exec_vector       *Addrs;
+
+   struct spu_sampler           *Samplers;
+
+   float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
+   unsigned                      ImmLimit;
+   float                         (*Consts)[4];
+   struct spu_exec_vector       *Inputs;
+   struct spu_exec_vector       *Outputs;
+   unsigned                      Processor;
+
+   /* GEOMETRY processor only. */
+   unsigned                      *Primitives;
+
+   /* FRAGMENT processor only. */
+   const struct spu_interp_coef *InterpCoefs;
+   struct spu_exec_vector       QuadPos;
+
+   /* Conditional execution masks */
+   uint CondMask;  /**< For IF/ELSE/ENDIF */
+   uint LoopMask;  /**< For BGNLOOP/ENDLOOP */
+   uint ContMask;  /**< For loop CONT statements */
+   uint FuncMask;  /**< For function calls */
+   uint ExecMask;  /**< = CondMask & LoopMask */
+
+   /** Condition mask stack (for nested conditionals) */
+   uint CondStack[TGSI_EXEC_MAX_COND_NESTING];
+   int CondStackTop;
+
+   /** Loop mask stack (for nested loops) */
+   uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int LoopStackTop;
+
+   /** Loop continue mask stack (see comments in tgsi_exec.c) */
+   uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int ContStackTop;
+
+   /** Function execution mask stack (for executing subroutine code) */
+   uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int FuncStackTop;
+
+   /** Function call stack for saving/restoring the program counter */
+   uint CallStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int CallStackTop;
+
+   struct tgsi_full_instruction *Instructions;
+   uint NumInstructions;
+
+   struct tgsi_full_declaration *Declarations;
+   uint NumDeclarations;
+};
+
+
+extern void
+spu_exec_machine_init(struct spu_exec_machine *mach,
+                      uint numSamplers,
+                      struct spu_sampler *samplers,
+                      unsigned processor);
+
+extern uint
+spu_exec_machine_run( struct spu_exec_machine *mach );
+
+
+#if defined __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* SPU_EXEC_H */
diff --git a/src/mesa/pipe/cell/spu/spu_main.c b/src/mesa/pipe/cell/spu/spu_main.c
index 0c83900a18..e375197fe6 100644
--- a/src/mesa/pipe/cell/spu/spu_main.c
+++ b/src/mesa/pipe/cell/spu/spu_main.c
@@ -31,11 +31,13 @@
 
 #include <stdio.h>
 #include <libmisc.h>
-#include <spu_mfcio.h>
 
 #include "spu_main.h"
-#include "spu_tri.h"
+#include "spu_render.h"
+#include "spu_texture.h"
 #include "spu_tile.h"
+//#include "spu_test.h"
+#include "spu_vertex_shader.h"
 #include "pipe/cell/common.h"
 #include "pipe/p_defines.h"
 
@@ -46,28 +48,37 @@ helpful headers:
 /opt/ibm/cell-sdk/prototype/sysroot/usr/include/libmisc.h
 */
 
-static boolean Debug = FALSE;
+boolean Debug = FALSE;
 
 struct spu_global spu;
 
+struct spu_vs_context draw;
 
-void
-wait_on_mask(unsigned tagMask)
+/**
+ * Tell the PPU that this SPU has finished copying a buffer to
+ * local store and that it may be reused by the PPU.
+ * This is done by writting a 16-byte batch-buffer-status block back into
+ * main memory (in cell_context->buffer_status[]).
+ */
+static void
+release_buffer(uint buffer)
 {
-   mfc_write_tag_mask( tagMask );
-   /* wait for completion of _any_ DMAs specified by tagMask */
-   mfc_read_tag_status_any();
-}
+   /* Evidently, using less than a 16-byte status doesn't work reliably */
+   static const uint status[4] ALIGN16_ATTRIB
+      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
 
+   const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer);
+   uint *dst = spu.init.buffer_status + index;
 
-static void
-wait_on_mask_all(unsigned tagMask)
-{
-   mfc_write_tag_mask( tagMask );
-   /* wait for completion of _any_ DMAs specified by tagMask */
-   mfc_read_tag_status_all();
-}
+   ASSERT(buffer < CELL_NUM_BUFFERS);
 
+   mfc_put((void *) &status,    /* src in local memory */
+           (unsigned int) dst,  /* dst in main memory */
+           sizeof(status),      /* size */
+           TAG_MISC,            /* tag is unimportant */
+           0, /* tid */
+           0  /* rid */);
+}
 
 
 /**
@@ -81,24 +92,24 @@ really_clear_tiles(uint surfaceIndex)
    uint i;
 
    if (surfaceIndex == 0) {
-      clear_c_tile(&ctile);
+      clear_c_tile(&spu.ctile);
 
       for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
          uint tx = i % spu.fb.width_tiles;
          uint ty = i / spu.fb.width_tiles;
-         if (tile_status[ty][tx] == TILE_STATUS_CLEAR) {
-            put_tile(tx, ty, &ctile, TAG_SURFACE_CLEAR, 0);
+         if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) {
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
          }
       }
    }
    else {
-      clear_z_tile(&ztile);
+      clear_z_tile(&spu.ztile);
 
       for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
          uint tx = i % spu.fb.width_tiles;
          uint ty = i / spu.fb.width_tiles;
-         if (tile_status_z[ty][tx] == TILE_STATUS_CLEAR)
-            put_tile(tx, ty, &ctile, TAG_SURFACE_CLEAR, 1);
+         if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 1);
       }
    }
 
@@ -122,11 +133,11 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 #if CLEAR_OPT
    /* set all tile's status to CLEAR */
    if (clear->surface == 0) {
-      memset(tile_status, TILE_STATUS_CLEAR, sizeof(tile_status));
+      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
       spu.fb.color_clear_value = clear->value;
    }
    else {
-      memset(tile_status_z, TILE_STATUS_CLEAR, sizeof(tile_status_z));
+      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
       spu.fb.depth_clear_value = clear->value;
    }
    return;
@@ -134,11 +145,11 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 
    if (clear->surface == 0) {
       spu.fb.color_clear_value = clear->value;
-      clear_c_tile(&ctile);
+      clear_c_tile(&spu.ctile);
    }
    else {
       spu.fb.depth_clear_value = clear->value;
-      clear_z_tile(&ztile);
+      clear_z_tile(&spu.ztile);
    }
 
    /*
@@ -150,9 +161,9 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
       uint tx = i % spu.fb.width_tiles;
       uint ty = i / spu.fb.width_tiles;
       if (clear->surface == 0)
-         put_tile(tx, ty, &ctile, TAG_SURFACE_CLEAR, 0);
+         put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
       else
-         put_tile(tx, ty, &ztile, TAG_SURFACE_CLEAR, 1);
+         put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
       /* XXX we don't want this here, but it fixes bad tile results */
    }
 
@@ -165,229 +176,14 @@ cmd_clear_surface(const struct cell_command_clear_surface *clear)
 }
 
 
-/**
- * Given a rendering command's bounding box (in pixels) compute the
- * location of the corresponding screen tile bounding box.
- */
-static INLINE void
-tile_bounding_box(const struct cell_command_render *render,
-                  uint *txmin, uint *tymin,
-                  uint *box_num_tiles, uint *box_width_tiles)
-{
-#if 1
-   /* Debug: full-window bounding box */
-   uint txmax = spu.fb.width_tiles - 1;
-   uint tymax = spu.fb.height_tiles - 1;
-   *txmin = 0;
-   *tymin = 0;
-   *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-   *box_width_tiles = spu.fb.width_tiles;
-   (void) render;
-   (void) txmax;
-   (void) tymax;
-#else
-   uint txmax, tymax, box_height_tiles;
-
-   *txmin = (uint) render->xmin / TILE_SIZE;
-   *tymin = (uint) render->ymin / TILE_SIZE;
-   txmax = (uint) render->xmax / TILE_SIZE;
-   tymax = (uint) render->ymax / TILE_SIZE;
-   *box_width_tiles = txmax - *txmin + 1;
-   box_height_tiles = tymax - *tymin + 1;
-   *box_num_tiles = *box_width_tiles * box_height_tiles;
-#endif
-#if 0
-   printf("Render bounds: %g, %g  ...  %g, %g\n",
-          render->xmin, render->ymin, render->xmax, render->ymax);
-   printf("Render tiles:  %u, %u .. %u, %u\n", *txmin, *tymin, txmax, tymax);
-#endif
-}
-
-
-/**
- * Render primitives
- * \param pos_incr  returns value indicating how may words to skip after
- *                  this command in the batch buffer
- */
 static void
-cmd_render(const struct cell_command_render *render, uint *pos_incr)
+cmd_release_verts(const struct cell_command_release_verts *release)
 {
-   /* we'll DMA into these buffers */
-   ubyte vertex_data[CELL_MAX_VBUF_SIZE] ALIGN16_ATTRIB;
-   ushort index_data[CELL_MAX_VBUF_INDEXES] ALIGN16_ATTRIB;
-   const uint vertex_size = render->vertex_size; /* in bytes */
-   const uint total_vertex_bytes = render->num_verts * vertex_size;
-   const ubyte *vertices;
-   const ushort *indexes;
-   uint mask;
-   uint i, j;
-
-
-   if (Debug) {
-      printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
-             "inline_vert=%u  inline_ind=%u\n",
-             spu.init.id,
-             render->prim_type,
-             render->num_verts,
-             render->num_indexes,
-             render->inline_verts,
-             render->inline_indexes);
-
-      /*
-      printf("       bound: %g, %g .. %g, %g\n",
-             render->xmin, render->ymin, render->xmax, render->ymax);
-      */
-      printf("SPU %u: indices at %p  vertices at %p\n",
-             spu.init.id,
-             render->index_data, render->vertex_data);
-   }
-
-   ASSERT(sizeof(*render) % 4 == 0);
-   ASSERT_ALIGN16(render->vertex_data);
-   ASSERT_ALIGN16(render->index_data);
-
-
-   /**
-    ** Get vertex, index buffers if not inlined
-    **/
-   if (!render->inline_verts) {
-      ASSERT(total_vertex_bytes % 16 == 0);
-
-      mfc_get(vertex_data,  /* dest */
-              (unsigned int) render->vertex_data,  /* src */
-              total_vertex_bytes,  /* size */
-              TAG_VERTEX_BUFFER,
-              0, /* tid */
-              0  /* rid */);
-
-      vertices = vertex_data;
-   }
-
-   if (!render->inline_indexes) {
-      uint total_index_bytes;
-
-      *pos_incr = 0;
-
-      total_index_bytes = render->num_indexes * sizeof(ushort);
-      if (total_index_bytes < 16)
-         total_index_bytes = 16;
-      else
-         total_index_bytes = ROUNDUP16(total_index_bytes);
-
-      indexes = index_data;
-
-      /* get index data from main memory */
-      mfc_get(index_data,  /* dest */
-              (unsigned int) render->index_data,  /* src */
-              total_index_bytes,
-              TAG_INDEX_BUFFER,
-              0, /* tid */
-              0  /* rid */);
-   }
-
-
-   /**
-    ** Get pointers to inlined indexes, verts, if present
-    **/
-   if (render->inline_indexes) {
-      /* indexes are right after the render command in the batch buffer */
-      indexes = (ushort *) (render + 1);
-      *pos_incr = (render->num_indexes * 2 + 3) / 4;
-
-      if (render->inline_verts) {
-         /* vertices are after indexes, if inlined */
-         vertices = (const ubyte *) (render + 1) + *pos_incr * 4;
-         *pos_incr = *pos_incr + total_vertex_bytes / 4;
-      }
-   }
-
-
-   /* wait for vertex and/or index buffers if not inlined */
-   mask = 0x0;
-   if (!render->inline_verts)
-      mask |= (1 << TAG_VERTEX_BUFFER);
-   if (!render->inline_indexes)
-      mask |= (1 << TAG_INDEX_BUFFER);
-   wait_on_mask_all(mask);
-
-
-   /**
-    ** find tiles which intersect the prim bounding box
-    **/
-   uint txmin, tymin, box_width_tiles, box_num_tiles;
-#if 0
-   tile_bounding_box(render, &txmin, &tymin,
-                     &box_num_tiles, &box_width_tiles);
-#else
-   txmin = 0;
-   tymin = 0;
-   box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-   box_width_tiles = spu.fb.width_tiles;
-#endif
-
-   /* make sure any pending clears have completed */
-   wait_on_mask(1 << TAG_SURFACE_CLEAR);
-
-
-   /**
-    ** loop over tiles, rendering tris
-    **/
-   for (i = spu.init.id; i < box_num_tiles; i += spu.init.num_spus) {
-      const uint tx = txmin + i % box_width_tiles;
-      const uint ty = tymin + i / box_width_tiles;
-
-      ASSERT(tx < spu.fb.width_tiles);
-      ASSERT(ty < spu.fb.height_tiles);
-
-      /* Start fetching color/z tiles.  We'll wait for completion when
-       * we need read/write to them later in triangle rasterization.
-       */
-      if (spu.depth_stencil.depth.enabled) {
-         if (tile_status_z[ty][tx] != TILE_STATUS_CLEAR) {
-            get_tile(tx, ty, &ztile, TAG_READ_TILE_Z, 1);
-         }
-      }
-
-      if (tile_status[ty][tx] != TILE_STATUS_CLEAR) {
-         get_tile(tx, ty, &ctile, TAG_READ_TILE_COLOR, 0);
-      }
-
-      ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
-      ASSERT(render->num_indexes % 3 == 0);
-
-      /* loop over tris */
-      for (j = 0; j < render->num_indexes; j += 3) {
-         const float *v0, *v1, *v2;
-
-         v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
-         v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
-         v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
-
-         tri_draw(v0, v1, v2, tx, ty);
-      }
-
-      /* write color/z tiles back to main framebuffer, if dirtied */
-      if (tile_status[ty][tx] == TILE_STATUS_DIRTY) {
-         put_tile(tx, ty, &ctile, TAG_WRITE_TILE_COLOR, 0);
-         tile_status[ty][tx] = TILE_STATUS_DEFINED;
-      }
-      if (spu.depth_stencil.depth.enabled) {
-         if (tile_status_z[ty][tx] == TILE_STATUS_DIRTY) {
-            put_tile(tx, ty, &ztile, TAG_WRITE_TILE_Z, 1);
-            tile_status_z[ty][tx] = TILE_STATUS_DEFINED;
-         }
-      }
-
-      /* XXX move these... */
-      wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
-      if (spu.depth_stencil.depth.enabled) {
-         wait_on_mask(1 << TAG_WRITE_TILE_Z);
-      }
-   }
-
    if (Debug)
-      printf("SPU %u: RENDER done\n",
-             spu.init.id);
+      printf("SPU %u: RELEASE VERTS %u\n",
+             spu.init.id, release->vertex_buf);
+   ASSERT(release->vertex_buf != ~0U);
+   release_buffer(release->vertex_buf);
 }
 
 
@@ -421,6 +217,29 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
       spu.fb.zsize = 2;
    else
       spu.fb.zsize = 0;
+
+   if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM)
+      spu.color_shuffle = ((vector unsigned char) {
+                              12, 0, 4, 8, 0, 0, 0, 0, 
+                              0, 0, 0, 0, 0, 0, 0, 0});
+   else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM)
+      spu.color_shuffle = ((vector unsigned char) {
+                              8, 4, 0, 12, 0, 0, 0, 0, 
+                              0, 0, 0, 0, 0, 0, 0, 0});
+   else
+      ASSERT(0);
+}
+
+
+static void
+cmd_state_blend(const struct pipe_blend_state *state)
+{
+   if (Debug)
+      printf("SPU %u: BLEND: enabled %d\n",
+             spu.init.id,
+             state->blend_enable);
+
+   memcpy(&spu.blend, state, sizeof(*state));
 }
 
 
@@ -444,19 +263,53 @@ cmd_state_sampler(const struct pipe_sampler_state *state)
              spu.init.id);
 
    memcpy(&spu.sampler[0], state, sizeof(*state));
+   if (spu.sampler[0].min_img_filter == PIPE_TEX_FILTER_LINEAR)
+      spu.sample_texture = sample_texture_bilinear;
+   else
+      spu.sample_texture = sample_texture_nearest;
 }
 
 
 static void
-cmd_state_vertex_info(const struct vertex_info *vinfo)
+cmd_state_texture(const struct cell_command_texture *texture)
 {
    if (Debug)
+      printf("SPU %u: TEXTURE at %p  size %u x %u\n",
+             spu.init.id, texture->start, texture->width, texture->height);
+
+   memcpy(&spu.texture, texture, sizeof(*texture));
+   spu.tex_size = (vector float)
+      { spu.texture.width, spu.texture.height, 0.0, 0.0};
+   spu.tex_size_mask = (vector unsigned int)
+      { spu.texture.width - 1, spu.texture.height - 1, 0, 0 };
+}
+
+
+static void
+cmd_state_vertex_info(const struct vertex_info *vinfo)
+{
+   if (Debug) {
       printf("SPU %u: VERTEX_INFO num_attribs=%u\n", spu.init.id,
              vinfo->num_attribs);
+   }
+   ASSERT(vinfo->num_attribs >= 1);
+   ASSERT(vinfo->num_attribs <= 8);
    memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo));
 }
 
 
+static void
+cmd_state_vs_array_info(const struct cell_array_info *vs_info)
+{
+   const unsigned attr = vs_info->attr;
+
+   ASSERT(attr < PIPE_ATTRIB_MAX);
+   draw.vertex_fetch.src_ptr[attr] = vs_info->base;
+   draw.vertex_fetch.pitch[attr] = vs_info->pitch;
+   draw.vertex_fetch.format[attr] = vs_info->format;
+   draw.vertex_fetch.dirty = 1;
+}
+
 
 static void
 cmd_finish(void)
@@ -473,38 +326,6 @@ cmd_finish(void)
 
 
 /**
- * Tell the PPU that this SPU has finished copying a batch buffer to
- * local store and that it may be reused by the PPU.
- * This is done by writting a 16-byte batch-buffer-status block back into
- * main memory (in cell_contex->buffer_status[]).
- */
-static void
-release_batch_buffer(uint buffer)
-{
-   /* Evidently, using less than a 16-byte status doesn't work reliably */
-   static const uint status[4] ALIGN16_ATTRIB
-      = {CELL_BUFFER_STATUS_FREE, 0, 0, 0};
-
-   const uint index = 4 * (spu.init.id * CELL_NUM_BATCH_BUFFERS + buffer);
-   uint *dst = spu.init.buffer_status + index;
-
-   ASSERT(buffer < CELL_NUM_BATCH_BUFFERS);
-
-   /*
-   printf("SPU %u: Set batch status buf=%u, index %u, at %p to FREE\n",
-          spu.init.id, buffer, index, dst);
-   */
-
-   mfc_put((void *) &status,    /* src in local memory */
-           (unsigned int) dst,  /* dst in main memory */
-           sizeof(status),      /* size */
-           TAG_MISC,            /* tag is unimportant */
-           0, /* tid */
-           0  /* rid */);
-}
-
-
-/**
  * Execute a batch of commands
  * The opcode param encodes the location of the buffer and its size.
  */
@@ -513,24 +334,24 @@ cmd_batch(uint opcode)
 {
    const uint buf = (opcode >> 8) & 0xff;
    uint size = (opcode >> 16);
-   uint buffer[CELL_BATCH_BUFFER_SIZE / 4] ALIGN16_ATTRIB;
-   const uint usize = size / sizeof(uint);
+   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
+   const unsigned usize = size / sizeof(buffer[0]);
    uint pos;
 
    if (Debug)
       printf("SPU %u: BATCH buffer %u, len %u, from %p\n",
-             spu.init.id, buf, size, spu.init.batch_buffers[buf]);
+             spu.init.id, buf, size, spu.init.buffers[buf]);
 
    ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH);
 
-   ASSERT_ALIGN16(spu.init.batch_buffers[buf]);
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
 
    size = ROUNDUP16(size);
 
-   ASSERT_ALIGN16(spu.init.batch_buffers[buf]);
+   ASSERT_ALIGN16(spu.init.buffers[buf]);
 
    mfc_get(buffer,  /* dest */
-           (unsigned int) spu.init.batch_buffers[buf],  /* src */
+           (unsigned int) spu.init.buffers[buf],  /* src */
            size,
            TAG_BATCH_BUFFER,
            0, /* tid */
@@ -538,7 +359,9 @@ cmd_batch(uint opcode)
    wait_on_mask(1 << TAG_BATCH_BUFFER);
 
    /* Tell PPU we're done copying the buffer to local store */
-   release_batch_buffer(buf);
+   if (Debug)
+      printf("SPU %u: release batch buf %u\n", spu.init.id, buf);
+   release_buffer(buf);
 
    for (pos = 0; pos < usize; /* no incr */) {
       switch (buffer[pos]) {
@@ -547,7 +370,7 @@ cmd_batch(uint opcode)
             struct cell_command_framebuffer *fb
                = (struct cell_command_framebuffer *) &buffer[pos];
             cmd_state_framebuffer(fb);
-            pos += sizeof(*fb) / 4;
+            pos += sizeof(*fb) / 8;
          }
          break;
       case CELL_CMD_CLEAR_SURFACE:
@@ -555,7 +378,7 @@ cmd_batch(uint opcode)
             struct cell_command_clear_surface *clr
                = (struct cell_command_clear_surface *) &buffer[pos];
             cmd_clear_surface(clr);
-            pos += sizeof(*clr) / 4;
+            pos += sizeof(*clr) / 8;
          }
          break;
       case CELL_CMD_RENDER:
@@ -564,28 +387,54 @@ cmd_batch(uint opcode)
                = (struct cell_command_render *) &buffer[pos];
             uint pos_incr;
             cmd_render(render, &pos_incr);
-            pos += sizeof(*render) / 4 + pos_incr;
+            pos += pos_incr;
+         }
+         break;
+      case CELL_CMD_RELEASE_VERTS:
+         {
+            struct cell_command_release_verts *release
+               = (struct cell_command_release_verts *) &buffer[pos];
+            cmd_release_verts(release);
+            pos += sizeof(*release) / 8;
          }
          break;
       case CELL_CMD_FINISH:
          cmd_finish();
          pos += 1;
          break;
+      case CELL_CMD_STATE_BLEND:
+         cmd_state_blend((struct pipe_blend_state *)
+                                 &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_blend_state)) / 8);
+         break;
       case CELL_CMD_STATE_DEPTH_STENCIL:
          cmd_state_depth_stencil((struct pipe_depth_stencil_alpha_state *)
                                  &buffer[pos+1]);
-         pos += (1 + sizeof(struct pipe_depth_stencil_alpha_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_depth_stencil_alpha_state)) / 8);
          break;
       case CELL_CMD_STATE_SAMPLER:
          cmd_state_sampler((struct pipe_sampler_state *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct pipe_sampler_state) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_sampler_state)) / 8);
+         break;
+      case CELL_CMD_STATE_TEXTURE:
+         cmd_state_texture((struct cell_command_texture *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_command_texture)) / 8);
          break;
       case CELL_CMD_STATE_VERTEX_INFO:
          cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
-         pos += (1 + sizeof(struct vertex_info) / 4);
+         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
+         break;
+      case CELL_CMD_STATE_VIEWPORT:
+         (void) memcpy(& draw.viewport, &buffer[pos+1],
+                       sizeof(struct pipe_viewport_state));
+         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
+         break;
+      case CELL_CMD_STATE_VS_ARRAY_INFO:
+         cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
+         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
       default:
-         printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, buffer[pos]);
+         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
          ASSERT(0);
          break;
       }
@@ -633,31 +482,22 @@ main_loop(void)
               0  /* rid */);
       wait_on_mask( 1 << tag );
 
+      /*
+       * NOTE: most commands should be contained in a batch buffer
+       */
+
       switch (opcode & CELL_CMD_OPCODE_MASK) {
       case CELL_CMD_EXIT:
          if (Debug)
             printf("SPU %u: EXIT\n", spu.init.id);
          exitFlag = 1;
          break;
-      case CELL_CMD_STATE_FRAMEBUFFER:
-         cmd_state_framebuffer(&cmd.fb);
-         break;
-      case CELL_CMD_CLEAR_SURFACE:
-         cmd_clear_surface(&cmd.clear);
-         break;
-      case CELL_CMD_RENDER:
-         {
-            uint pos_incr;
-            cmd_render(&cmd.render, &pos_incr);
-            assert(pos_incr == 0);
-         }
+      case CELL_CMD_VS_EXECUTE:
+         spu_execute_vertex_shader(&draw, &cmd.vs);
          break;
       case CELL_CMD_BATCH:
          cmd_batch(opcode);
          break;
-      case CELL_CMD_FINISH:
-         cmd_finish();
-         break;
       default:
          printf("Bad opcode!\n");
       }
@@ -673,11 +513,13 @@ main_loop(void)
 static void
 one_time_init(void)
 {
-   memset(tile_status, TILE_STATUS_DEFINED, sizeof(tile_status));
-   memset(tile_status_z, TILE_STATUS_DEFINED, sizeof(tile_status_z));
+   memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
+   memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
+   invalidate_tex_cache();
 }
 
 
+
 /* In some versions of the SDK the SPE main takes 'unsigned long' as a
  * parameter.  In others it takes 'unsigned long long'.  Use a define to
  * select between the two.
@@ -698,6 +540,9 @@ main(main_param_t speid, main_param_t argp)
 
    (void) speid;
 
+   ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4);
+   ASSERT(sizeof(struct cell_command_render) % 8 == 0);
+
    one_time_init();
 
    if (Debug)
@@ -711,6 +556,10 @@ main(main_param_t speid, main_param_t argp)
            0  /* rid */);
    wait_on_mask( 1 << tag );
 
+#if 0
+   if (spu.init.id==0)
+      spu_test_misc();
+#endif
 
    main_loop();
 
diff --git a/src/mesa/pipe/cell/spu/spu_main.h b/src/mesa/pipe/cell/spu/spu_main.h
index 5bc5d9fa99..1710a17512 100644
--- a/src/mesa/pipe/cell/spu/spu_main.h
+++ b/src/mesa/pipe/cell/spu/spu_main.h
@@ -29,11 +29,33 @@
 #define SPU_MAIN_H
 
 
+#include <spu_mfcio.h>
+
 #include "pipe/cell/common.h"
 #include "pipe/draw/draw_vertex.h"
 #include "pipe/p_state.h"
 
 
+
+#define MAX_WIDTH 1024
+#define MAX_HEIGHT 1024
+
+
+typedef union {
+   ushort us[TILE_SIZE][TILE_SIZE];
+   uint   ui[TILE_SIZE][TILE_SIZE];
+   vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4];
+   vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2];
+} tile_t;
+
+
+#define TILE_STATUS_CLEAR   1
+#define TILE_STATUS_DEFINED 2  /**< defined in FB, but not in local store */
+#define TILE_STATUS_CLEAN   3  /**< in local store, but not changed */
+#define TILE_STATUS_DIRTY   4  /**< modified locally, but not put back yet */
+#define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
+
+
 struct spu_framebuffer {
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
@@ -57,18 +79,42 @@ struct spu_global
    struct cell_init_info init;
 
    struct spu_framebuffer fb;
+   struct pipe_blend_state blend_stencil;
    struct pipe_depth_stencil_alpha_state depth_stencil;
    struct pipe_blend_state blend;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
+   struct cell_command_texture texture;
 
    struct vertex_info vertex_info;
 
    /* XXX more state to come */
 
+
+   /** current color and Z tiles */
+   tile_t ctile ALIGN16_ATTRIB;
+   tile_t ztile ALIGN16_ATTRIB;
+
+   /** Current tiles' status */
+   ubyte cur_ctile_status, cur_ztile_status;
+
+   /** Status of all tiles in framebuffer */
+   ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+   ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
+
+
+   /** for converting RGBA to PIPE_FORMAT_x colors */
+   vector unsigned char color_shuffle;
+
+   vector float tex_size;
+   vector unsigned int tex_size_mask; /**< == int(size - 1) */
+
+   vector float (*sample_texture)(vector float texcoord);
+
 } ALIGN16_ATTRIB;
 
 
 extern struct spu_global spu;
+extern boolean Debug;
 
 
 
@@ -84,10 +130,30 @@ extern struct spu_global spu;
 #define TAG_INDEX_BUFFER      16
 #define TAG_BATCH_BUFFER      17
 #define TAG_MISC              18
+#define TAG_TEXTURE_TILE      19
+#define TAG_INSTRUCTION_FETCH 20
+
+
+
+static INLINE void
+wait_on_mask(unsigned tagMask)
+{
+   mfc_write_tag_mask( tagMask );
+   /* wait for completion of _any_ DMAs specified by tagMask */
+   mfc_read_tag_status_any();
+}
+
+
+static INLINE void
+wait_on_mask_all(unsigned tagMask)
+{
+   mfc_write_tag_mask( tagMask );
+   /* wait for completion of _any_ DMAs specified by tagMask */
+   mfc_read_tag_status_all();
+}
+
 
 
-extern void
-wait_on_mask(unsigned tag);
 
 
 static INLINE void
diff --git a/src/mesa/pipe/cell/spu/spu_render.c b/src/mesa/pipe/cell/spu/spu_render.c
new file mode 100644
index 0000000000..932fb500b3
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_render.c
@@ -0,0 +1,301 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include <stdio.h>
+#include <libmisc.h>
+#include <spu_mfcio.h>
+
+#include "spu_main.h"
+#include "spu_render.h"
+#include "spu_tri.h"
+#include "spu_tile.h"
+#include "pipe/cell/common.h"
+
+
+
+/**
+ * Given a rendering command's bounding box (in pixels) compute the
+ * location of the corresponding screen tile bounding box.
+ */
+static INLINE void
+tile_bounding_box(const struct cell_command_render *render,
+                  uint *txmin, uint *tymin,
+                  uint *box_num_tiles, uint *box_width_tiles)
+{
+#if 0
+   /* Debug: full-window bounding box */
+   uint txmax = spu.fb.width_tiles - 1;
+   uint tymax = spu.fb.height_tiles - 1;
+   *txmin = 0;
+   *tymin = 0;
+   *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+   *box_width_tiles = spu.fb.width_tiles;
+   (void) render;
+   (void) txmax;
+   (void) tymax;
+#else
+   uint txmax, tymax, box_height_tiles;
+
+   *txmin = (uint) render->xmin / TILE_SIZE;
+   *tymin = (uint) render->ymin / TILE_SIZE;
+   txmax = (uint) render->xmax / TILE_SIZE;
+   tymax = (uint) render->ymax / TILE_SIZE;
+   if (txmax >= spu.fb.width_tiles)
+      txmax = spu.fb.width_tiles-1;
+   if (tymax >= spu.fb.height_tiles)
+      tymax = spu.fb.height_tiles-1;
+   *box_width_tiles = txmax - *txmin + 1;
+   box_height_tiles = tymax - *tymin + 1;
+   *box_num_tiles = *box_width_tiles * box_height_tiles;
+#endif
+#if 0
+   printf("SPU %u: bounds: %g, %g  ...  %g, %g\n", spu.init.id,
+          render->xmin, render->ymin, render->xmax, render->ymax);
+   printf("SPU %u: tiles:  %u, %u .. %u, %u\n",
+           spu.init.id, *txmin, *tymin, txmax, tymax);
+   ASSERT(render->xmin <= render->xmax);
+   ASSERT(render->ymin <= render->ymax);
+#endif
+}
+
+
+/** Check if the tile at (tx,ty) belongs to this SPU */
+static INLINE boolean
+my_tile(uint tx, uint ty)
+{
+   return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id;
+}
+
+
+/**
+ * Start fetching non-clear color/Z tiles from main memory
+ */
+static INLINE void
+get_cz_tiles(uint tx, uint ty)
+{
+   if (spu.depth_stencil.depth.enabled) {
+      if (spu.cur_ztile_status != TILE_STATUS_CLEAR) {
+         //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty);
+         get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1);
+         spu.cur_ztile_status = TILE_STATUS_GETTING;
+      }
+   }
+
+   if (spu.cur_ctile_status != TILE_STATUS_CLEAR) {
+      //printf("SPU %u: getting C tile %u, %u\n", spu.init.id, tx, ty);
+      get_tile(tx, ty, &spu.ctile, TAG_READ_TILE_COLOR, 0);
+      spu.cur_ctile_status = TILE_STATUS_GETTING;
+   }
+}
+
+
+/**
+ * Start putting dirty color/Z tiles back to main memory
+ */
+static INLINE void
+put_cz_tiles(uint tx, uint ty)
+{
+   if (spu.cur_ztile_status == TILE_STATUS_DIRTY) {
+      /* tile was modified and needs to be written back */
+      //printf("SPU %u: put dirty Z tile %u, %u\n", spu.init.id, tx, ty);
+      put_tile(tx, ty, &spu.ztile, TAG_WRITE_TILE_Z, 1);
+      spu.cur_ztile_status = TILE_STATUS_DEFINED;
+   }
+   else if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
+      /* tile was never used */
+      spu.cur_ztile_status = TILE_STATUS_DEFINED;
+      //printf("SPU %u: put getting Z tile %u, %u\n", spu.init.id, tx, ty);
+   }
+
+   if (spu.cur_ctile_status == TILE_STATUS_DIRTY) {
+      /* tile was modified and needs to be written back */
+      //printf("SPU %u: put dirty C tile %u, %u\n", spu.init.id, tx, ty);
+      put_tile(tx, ty, &spu.ctile, TAG_WRITE_TILE_COLOR, 0);
+      spu.cur_ctile_status = TILE_STATUS_DEFINED;
+   }
+   else if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
+      /* tile was never used */
+      spu.cur_ctile_status = TILE_STATUS_DEFINED;
+      //printf("SPU %u: put getting C tile %u, %u\n", spu.init.id, tx, ty);
+   }
+}
+
+
+/**
+ * Wait for 'put' of color/z tiles to complete.
+ */
+static INLINE void
+wait_put_cz_tiles(void)
+{
+   wait_on_mask(1 << TAG_WRITE_TILE_COLOR);
+   if (spu.depth_stencil.depth.enabled) {
+      wait_on_mask(1 << TAG_WRITE_TILE_Z);
+   }
+}
+
+
+/**
+ * Render primitives
+ * \param pos_incr  returns value indicating how may words to skip after
+ *                  this command in the batch buffer
+ */
+void
+cmd_render(const struct cell_command_render *render, uint *pos_incr)
+{
+   /* we'll DMA into these buffers */
+   ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB;
+   const uint vertex_size = render->vertex_size; /* in bytes */
+   /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size;
+   uint index_bytes;
+   const ubyte *vertices;
+   const ushort *indexes;
+   uint i, j;
+
+
+   if (Debug) {
+      printf("SPU %u: RENDER prim %u, num_vert=%u  num_ind=%u  "
+             "inline_vert=%u\n",
+             spu.init.id,
+             render->prim_type,
+             render->num_verts,
+             render->num_indexes,
+             render->inline_verts);
+
+      /*
+      printf("       bound: %g, %g .. %g, %g\n",
+             render->xmin, render->ymin, render->xmax, render->ymax);
+      */
+   }
+
+   ASSERT(sizeof(*render) % 4 == 0);
+   ASSERT(total_vertex_bytes % 16 == 0);
+   ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES);
+   ASSERT(render->num_indexes % 3 == 0);
+
+
+   /* indexes are right after the render command in the batch buffer */
+   indexes = (const ushort *) (render + 1);
+   index_bytes = ROUNDUP8(render->num_indexes * 2);
+   *pos_incr = index_bytes / 8 + sizeof(*render) / 8;
+
+
+   if (render->inline_verts) {
+      /* Vertices are after indexes in batch buffer at next 16-byte addr */
+      vertices = (const ubyte *) render + (*pos_incr * 8);
+      vertices = (const ubyte *) align_pointer((void *) vertices, 16);
+      ASSERT_ALIGN16(vertices);
+      *pos_incr = ((vertices + total_vertex_bytes) - (ubyte *) render) / 8;
+   }
+   else {
+      /* Begin DMA fetch of vertex buffer */
+      ubyte *src = spu.init.buffers[render->vertex_buf];
+      ubyte *dest = vertex_data;
+
+      /* skip vertex data we won't use */
+#if 01
+      src += render->min_index * vertex_size;
+      dest += render->min_index * vertex_size;
+      total_vertex_bytes -= render->min_index * vertex_size;
+#endif
+      ASSERT(total_vertex_bytes % 16 == 0);
+      ASSERT_ALIGN16(dest);
+      ASSERT_ALIGN16(src);
+
+      mfc_get(dest,   /* in vertex_data[] array */
+              (unsigned int) src,  /* src in main memory */
+              total_vertex_bytes,  /* size */
+              TAG_VERTEX_BUFFER,
+              0, /* tid */
+              0  /* rid */);
+
+      vertices = vertex_data;
+
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+   }
+
+
+   /**
+    ** find tiles which intersect the prim bounding box
+    **/
+   uint txmin, tymin, box_width_tiles, box_num_tiles;
+   tile_bounding_box(render, &txmin, &tymin,
+                     &box_num_tiles, &box_width_tiles);
+
+
+   /* make sure any pending clears have completed */
+   wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */
+
+
+   /**
+    ** loop over tiles, rendering tris
+    **/
+   for (i = 0; i < box_num_tiles; i++) {
+      const uint tx = txmin + i % box_width_tiles;
+      const uint ty = tymin + i / box_width_tiles;
+
+      ASSERT(tx < spu.fb.width_tiles);
+      ASSERT(ty < spu.fb.height_tiles);
+
+      if (!my_tile(tx, ty))
+         continue;
+
+      spu.cur_ctile_status = spu.ctile_status[ty][tx];
+      spu.cur_ztile_status = spu.ztile_status[ty][tx];
+
+      get_cz_tiles(tx, ty);
+
+      uint drawn = 0;
+
+      /* loop over tris */
+      for (j = 0; j < render->num_indexes; j += 3) {
+         const float *v0, *v1, *v2;
+
+         v0 = (const float *) (vertices + indexes[j+0] * vertex_size);
+         v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
+         v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
+
+         drawn += tri_draw(v0, v1, v2, tx, ty);
+      }
+
+      //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
+
+      /* write color/z tiles back to main framebuffer, if dirtied */
+      put_cz_tiles(tx, ty);
+
+      wait_put_cz_tiles(); /* XXX seems unnecessary... */
+
+      spu.ctile_status[ty][tx] = spu.cur_ctile_status;
+      spu.ztile_status[ty][tx] = spu.cur_ztile_status;
+   }
+
+   if (Debug)
+      printf("SPU %u: RENDER done\n",
+             spu.init.id);
+}
+
+
diff --git a/src/mesa/pipe/cell/spu/spu_render.h b/src/mesa/pipe/cell/spu/spu_render.h
new file mode 100644
index 0000000000..fbcdc5ec31
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_render.h
@@ -0,0 +1,38 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef SPU_RENDER_H
+#define SPU_RENDER_H
+
+#include "pipe/cell/common.h"
+
+extern void
+cmd_render(const struct cell_command_render *render, uint *pos_incr);
+
+#endif /* SPU_RENDER_H */
+
diff --git a/src/mesa/pipe/cell/spu/spu_texture.c b/src/mesa/pipe/cell/spu/spu_texture.c
new file mode 100644
index 0000000000..3962aaa4a9
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_texture.c
@@ -0,0 +1,217 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "pipe/p_compiler.h"
+#include "spu_main.h"
+#include "spu_texture.h"
+#include "spu_tile.h"
+#include "spu_colorpack.h"
+
+
+/**
+ * Number of texture tiles to cache.
+ * Note that this will probably be the largest consumer of SPU local store/
+ * memory for this driver!
+ */
+#define CACHE_SIZE 16
+
+static tile_t tex_tiles[CACHE_SIZE]  ALIGN16_ATTRIB;
+
+static vector unsigned int tex_tile_xy[CACHE_SIZE];
+
+
+
+/**
+ * Mark all tex cache entries as invalid.
+ */
+void
+invalidate_tex_cache(void)
+{
+   /* XXX memset? */
+   uint i;
+   for (i = 0; i < CACHE_SIZE; i++) {
+      tex_tile_xy[i] = ((vector unsigned int) { ~0U, ~0U, ~0U, ~0U });
+   }
+}
+
+
+/**
+ * Return the cache pos/index which corresponds to tile (tx,ty)
+ */
+static INLINE uint
+cache_pos(vector unsigned int txty)
+{
+   uint pos = (spu_extract(txty,0) + spu_extract(txty,1) * 4) % CACHE_SIZE;
+   return pos;
+}
+
+
+/**
+ * Make sure the tile for texel (i,j) is present, return its position/index
+ * in the cache.
+ */
+static uint
+get_tex_tile(vector unsigned int ij)
+{
+   /* tile address: tx,ty */
+   const vector unsigned int txty = spu_rlmask(ij, -5);  /* divide by 32 */
+   const uint pos = cache_pos(txty);
+
+   if ((spu_extract(tex_tile_xy[pos], 0) != spu_extract(txty, 0)) ||
+       (spu_extract(tex_tile_xy[pos], 1) != spu_extract(txty, 1))) {
+
+      /* texture cache miss, fetch tile from main memory */
+      const uint tiles_per_row = spu.texture.width / TILE_SIZE;
+      const uint bytes_per_tile = sizeof(tile_t);
+      const void *src = (const ubyte *) spu.texture.start
+         + (spu_extract(txty,1) * tiles_per_row + spu_extract(txty,0)) * bytes_per_tile;
+
+      printf("SPU %u: tex cache miss at %d, %d  pos=%u  old=%d,%d\n",
+             spu.init.id,
+             spu_extract(txty,0),
+             spu_extract(txty,1),
+             pos,
+             spu_extract(tex_tile_xy[pos],0),
+             spu_extract(tex_tile_xy[pos],1));
+
+      ASSERT_ALIGN16(tex_tiles[pos].ui);
+      ASSERT_ALIGN16(src);
+
+      mfc_get(tex_tiles[pos].ui,  /* dest */
+              (unsigned int) src,
+              bytes_per_tile,      /* size */
+              TAG_TEXTURE_TILE,
+              0, /* tid */
+              0  /* rid */);
+
+      wait_on_mask(1 << TAG_TEXTURE_TILE);
+
+      tex_tile_xy[pos] = txty;
+   }
+   else {
+#if 0
+      printf("SPU %u: tex cache HIT at %d, %d\n",
+             spu.init.id, tx, ty);
+#endif
+   }
+
+   return pos;
+}
+
+
+/**
+ * Get texture sample at texcoord.
+ * XXX this is extremely primitive for now.
+ */
+vector float
+sample_texture_nearest(vector float texcoord)
+{
+   vector float tc = spu_mul(texcoord, spu.tex_size);
+   vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
+   itc = spu_and(itc, spu.tex_size_mask);        /* mask (GL_REPEAT) */
+   vector unsigned int ij = spu_and(itc, TILE_SIZE-1); /* intra tile addr */
+   uint pos = get_tex_tile(itc);
+   uint texel = tex_tiles[pos].ui[spu_extract(ij, 1)][spu_extract(ij, 0)];
+   return spu_unpack_A8R8G8B8(texel);
+}
+
+
+vector float
+sample_texture_bilinear(vector float texcoord)
+{
+   static const vector unsigned int offset10 = {1, 0, 0, 0};
+   static const vector unsigned int offset01 = {0, 1, 0, 0};
+
+   vector float tc = spu_mul(texcoord, spu.tex_size);
+   tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
+
+   /* integer texcoords S,T: */
+   vector unsigned int itc00 = spu_convtu(tc, 0);  /* convert to int */
+   vector unsigned int itc01 = spu_add(itc00, offset01);
+   vector unsigned int itc10 = spu_add(itc00, offset10);
+   vector unsigned int itc11 = spu_add(itc10, offset01);
+
+   /* mask (GL_REPEAT) */
+   itc00 = spu_and(itc00, spu.tex_size_mask);
+   itc01 = spu_and(itc01, spu.tex_size_mask);
+   itc10 = spu_and(itc10, spu.tex_size_mask);
+   itc11 = spu_and(itc11, spu.tex_size_mask);
+
+   /* intra tile addr */
+   vector unsigned int ij00 = spu_and(itc00, TILE_SIZE-1);
+   vector unsigned int ij01 = spu_and(itc01, TILE_SIZE-1);
+   vector unsigned int ij10 = spu_and(itc10, TILE_SIZE-1);
+   vector unsigned int ij11 = spu_and(itc11, TILE_SIZE-1);
+
+   /* get tile cache positions */
+   uint pos00 = get_tex_tile(itc00);
+   uint pos01, pos10, pos11;
+   if ((spu_extract(ij00, 0) < TILE_SIZE-1) &&
+       (spu_extract(ij00, 1) < TILE_SIZE-1)) {
+      /* all texels are in the same tile */
+      pos01 = pos10 = pos11 = pos00;
+   }
+   else {
+      pos01 = get_tex_tile(itc01);
+      pos10 = get_tex_tile(itc10);
+      pos11 = get_tex_tile(itc11);
+   }
+
+   /* get texels from tiles and convert to float[4] */
+   vector float texel00 = spu_unpack_A8R8G8B8(tex_tiles[pos00].ui[spu_extract(ij00, 1)][spu_extract(ij00, 0)]);
+   vector float texel01 = spu_unpack_A8R8G8B8(tex_tiles[pos01].ui[spu_extract(ij01, 1)][spu_extract(ij01, 0)]);
+   vector float texel10 = spu_unpack_A8R8G8B8(tex_tiles[pos10].ui[spu_extract(ij10, 1)][spu_extract(ij10, 0)]);
+   vector float texel11 = spu_unpack_A8R8G8B8(tex_tiles[pos11].ui[spu_extract(ij11, 1)][spu_extract(ij11, 0)]);
+
+   /* Compute weighting factors in [0,1]
+    * Multiply texcoord by 1024, AND with 1023, convert back to float.
+    */
+   vector float tc1024 = spu_mul(tc, spu_splats(1024.0f));
+   vector signed int itc1024 = spu_convts(tc1024, 0);
+   itc1024 = spu_and(itc1024, spu_splats((1 << 10) - 1));
+   vector float weight = spu_convtf(itc1024, 10);
+
+   /* smeared frac and 1-frac */
+   vector float sfrac = spu_splats(spu_extract(weight, 0));
+   vector float tfrac = spu_splats(spu_extract(weight, 1));
+   vector float sfrac1 = spu_sub(spu_splats(1.0f), sfrac);
+   vector float tfrac1 = spu_sub(spu_splats(1.0f), tfrac);
+
+   /* multiply the samples (colors) by the S/T weights */
+   texel00 = spu_mul(spu_mul(texel00, sfrac1), tfrac1);
+   texel10 = spu_mul(spu_mul(texel10, sfrac ), tfrac1);
+   texel01 = spu_mul(spu_mul(texel01, sfrac1), tfrac );
+   texel11 = spu_mul(spu_mul(texel11, sfrac ), tfrac );
+
+   /* compute sum of weighted samples */
+   vector float texel_sum = spu_add(texel00, texel01);
+   texel_sum = spu_add(texel_sum, texel10);
+   texel_sum = spu_add(texel_sum, texel11);
+
+   return texel_sum;
+}
diff --git a/src/mesa/pipe/cell/spu/spu_texture.h b/src/mesa/pipe/cell/spu/spu_texture.h
new file mode 100644
index 0000000000..95eb87080f
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_texture.h
@@ -0,0 +1,47 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SPU_TEXTURE_H
+#define SPU_TEXTURE_H
+
+
+#include "pipe/p_compiler.h"
+
+
+extern void
+invalidate_tex_cache(void);
+
+
+extern vector float
+sample_texture_nearest(vector float texcoord);
+
+
+extern vector float
+sample_texture_bilinear(vector float texcoord);
+
+
+#endif /* SPU_TEXTURE_H */
diff --git a/src/mesa/pipe/cell/spu/spu_tile.c b/src/mesa/pipe/cell/spu/spu_tile.c
index ca1352f9f8..12dc246328 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.c
+++ b/src/mesa/pipe/cell/spu/spu_tile.c
@@ -28,15 +28,7 @@
 
 
 #include "spu_tile.h"
-
-
-
-tile_t ctile ALIGN16_ATTRIB;
-tile_t ztile ALIGN16_ATTRIB;
-
-ubyte tile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-ubyte tile_status_z[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-
+#include "spu_main.h"
 
 
 void
@@ -55,7 +47,7 @@ get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf)
    printf("get_tile:  dest: %p  src: 0x%x  size: %d\n",
           tile, (unsigned int) src, bytesPerTile);
    */
-   mfc_get(tile->t32,  /* dest in local memory */
+   mfc_get(tile->ui,  /* dest in local memory */
            (unsigned int) src, /* src in main memory */
            bytesPerTile,
            tag,
@@ -81,7 +73,7 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf)
           spu.init.id,
           tile, (unsigned int) dst, bytesPerTile);
    */
-   mfc_put((void *) tile->t32,  /* src in local memory */
+   mfc_put((void *) tile->ui,  /* src in local memory */
            (unsigned int) dst,  /* dst in main memory */
            bytesPerTile,
            tag,
diff --git a/src/mesa/pipe/cell/spu/spu_tile.h b/src/mesa/pipe/cell/spu/spu_tile.h
index f83dc009c2..e53340a55a 100644
--- a/src/mesa/pipe/cell/spu/spu_tile.h
+++ b/src/mesa/pipe/cell/spu/spu_tile.h
@@ -35,27 +35,6 @@
 #include "pipe/cell/common.h"
 
 
-#define MAX_WIDTH 1024
-#define MAX_HEIGHT 1024
-
-
-typedef union {
-   ushort t16[TILE_SIZE][TILE_SIZE];
-   uint   t32[TILE_SIZE][TILE_SIZE];
-} tile_t;
-
-
-extern tile_t ctile ALIGN16_ATTRIB;
-extern tile_t ztile ALIGN16_ATTRIB;
-
-
-#define TILE_STATUS_CLEAR   1
-#define TILE_STATUS_DEFINED 2  /**< defined pixel data */
-#define TILE_STATUS_DIRTY   3  /**< modified, but not put back yet */
-
-extern ubyte tile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-extern ubyte tile_status_z[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
-
 
 void
 get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf);
@@ -68,7 +47,7 @@ put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf);
 static INLINE void
 clear_c_tile(tile_t *ctile)
 {
-   memset32((uint*) ctile->t32,
+   memset32((uint*) ctile->ui,
             spu.fb.color_clear_value,
             TILE_SIZE * TILE_SIZE);
 }
@@ -78,12 +57,13 @@ static INLINE void
 clear_z_tile(tile_t *ztile)
 {
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
-      memset16((ushort*) ztile->t16,
+      memset16((ushort*) ztile->us,
                spu.fb.depth_clear_value,
                TILE_SIZE * TILE_SIZE);
    }
    else {
-      memset32((uint*) ztile->t32,
+      ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
+      memset32((uint*) ztile->ui,
                spu.fb.depth_clear_value,
                TILE_SIZE * TILE_SIZE);
    }
diff --git a/src/mesa/pipe/cell/spu/spu_tri.c b/src/mesa/pipe/cell/spu/spu_tri.c
index 3d0d106c10..be9624cf7d 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.c
+++ b/src/mesa/pipe/cell/spu/spu_tri.c
@@ -32,22 +32,33 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "pipe/p_util.h"
+#include "spu_blend.h"
+#include "spu_colorpack.h"
 #include "spu_main.h"
+#include "spu_texture.h"
 #include "spu_tile.h"
 #include "spu_tri.h"
 
+#include "spu_ztest.h"
+
+
+/** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
+typedef vector unsigned int mask_t;
+
+typedef union
+{
+   vector float v;
+   float f[4];
+} float4;
 
 
 /**
  * Simplified types taken from other parts of Gallium
  */
 struct vertex_header {
-   float data[0][4];
+   vector float data[1];
 };
 
-struct prim_header {
-   struct vertex_header *v[3];
-};
 
 
 /* XXX fix this */
@@ -82,9 +93,9 @@ struct edge {
 
 struct interp_coef
 {
-   float a0[4];
-   float dadx[4];
-   float dady[4];
+   float4 a0;
+   float4 dadx;
+   float4 dady;
 };
 
 
@@ -133,6 +144,12 @@ struct setup_stage {
 };
 
 
+
+static struct setup_stage setup;
+
+
+
+
 #if 0
 /**
  * Basically a cast wrapper.
@@ -145,33 +162,33 @@ static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
 
 #if 0
 /**
- * Clip setup->quad against the scissor/surface bounds.
+ * Clip setup.quad against the scissor/surface bounds.
  */
 static INLINE void
 quad_clip(struct setup_stage *setup)
 {
-   const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect;
+   const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect;
    const int minx = (int) cliprect->minx;
    const int maxx = (int) cliprect->maxx;
    const int miny = (int) cliprect->miny;
    const int maxy = (int) cliprect->maxy;
 
-   if (setup->quad.x0 >= maxx ||
-       setup->quad.y0 >= maxy ||
-       setup->quad.x0 + 1 < minx ||
-       setup->quad.y0 + 1 < miny) {
+   if (setup.quad.x0 >= maxx ||
+       setup.quad.y0 >= maxy ||
+       setup.quad.x0 + 1 < minx ||
+       setup.quad.y0 + 1 < miny) {
       /* totally clipped */
-      setup->quad.mask = 0x0;
+      setup.quad.mask = 0x0;
       return;
    }
-   if (setup->quad.x0 < minx)
-      setup->quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
-   if (setup->quad.y0 < miny)
-      setup->quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
-   if (setup->quad.x0 == maxx - 1)
-      setup->quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
-   if (setup->quad.y0 == maxy - 1)
-      setup->quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
+   if (setup.quad.x0 < minx)
+      setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
+   if (setup.quad.y0 < miny)
+      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
+   if (setup.quad.x0 == maxx - 1)
+      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
+   if (setup.quad.y0 == maxy - 1)
+      setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
 }
 #endif
 
@@ -183,9 +200,9 @@ static INLINE void
 clip_emit_quad(struct setup_stage *setup)
 {
    quad_clip(setup);
-   if (setup->quad.mask) {
-      struct softpipe_context *sp = setup->softpipe;
-      sp->quad.first->run(sp->quad.first, &setup->quad);
+   if (setup.quad.mask) {
+      struct softpipe_context *sp = setup.softpipe;
+      sp->quad.first->run(sp->quad.first, &setup.quad);
    }
 }
 #endif
@@ -196,200 +213,145 @@ clip_emit_quad(struct setup_stage *setup)
  * Eg: four colors will be compute.
  */
 static INLINE void
-eval_coeff( struct setup_stage *setup, uint slot,
-            float x, float y, float result[4][4])
+eval_coeff(uint slot, float x, float y, vector float result[4])
 {
-   uint i;
-   const float *dadx = setup->coef[slot].dadx;
-   const float *dady = setup->coef[slot].dady;
+   switch (spu.vertex_info.interp_mode[slot]) {
+   case INTERP_CONSTANT:
+      result[QUAD_TOP_LEFT] =
+      result[QUAD_TOP_RIGHT] =
+      result[QUAD_BOTTOM_LEFT] =
+      result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
+      break;
+
+   case INTERP_LINEAR:
+      /* fall-through, for now */
+   default:
+      {
+         register vector float dadx = setup.coef[slot].dadx.v;
+         register vector float dady = setup.coef[slot].dady.v;
+         register vector float topLeft
+            = spu_add(setup.coef[slot].a0.v,
+                      spu_add(spu_mul(spu_splats(x), dadx),
+                              spu_mul(spu_splats(y), dady)));
 
-   /* loop over XYZW comps */
-   for (i = 0; i < 4; i++) {
-      result[QUAD_TOP_LEFT][i] = setup->coef[slot].a0[i] + x * dadx[i] + y * dady[i];
-      result[QUAD_TOP_RIGHT][i] = result[0][i] + dadx[i];
-      result[QUAD_BOTTOM_LEFT][i] = result[0][i] + dady[i];
-      result[QUAD_BOTTOM_RIGHT][i] = result[0][i] + dadx[i] + dady[i];
+         result[QUAD_TOP_LEFT] = topLeft;
+         result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
+         result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
+         result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
+      }
    }
 }
 
 
-static INLINE void
-eval_z( struct setup_stage *setup,
-        float x, float y, float result[4])
+static INLINE vector float
+eval_z(float x, float y)
 {
    const uint slot = 0;
-   const uint i = 2;
-   const float *dadx = setup->coef[slot].dadx;
-   const float *dady = setup->coef[slot].dady;
-
-   result[QUAD_TOP_LEFT] = setup->coef[slot].a0[i] + x * dadx[i] + y * dady[i];
-   result[QUAD_TOP_RIGHT] = result[0] + dadx[i];
-   result[QUAD_BOTTOM_LEFT] = result[0] + dady[i];
-   result[QUAD_BOTTOM_RIGHT] = result[0] + dadx[i] + dady[i];
+   const float dzdx = setup.coef[slot].dadx.f[2];
+   const float dzdy = setup.coef[slot].dady.f[2];
+   const float topLeft = setup.coef[slot].a0.f[2] + x * dzdx + y * dzdy;
+   const vector float topLeftv = spu_splats(topLeft);
+   const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy };
+   return spu_add(topLeftv, derivs);
 }
 
 
-static INLINE uint
-pack_color(const float color[4])
+static INLINE mask_t
+do_depth_test(int x, int y, mask_t quadmask)
 {
-   uint r = (uint) (color[0] * 255.0);
-   uint g = (uint) (color[1] * 255.0);
-   uint b = (uint) (color[2] * 255.0);
-   uint a = (uint) (color[3] * 255.0);
-   r = MIN2(r, 255);
-   g = MIN2(g, 255);
-   b = MIN2(b, 255);
-   a = MIN2(a, 255);
-   switch (spu.fb.color_format) {
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return (a << 24) | (r << 16) | (g << 8) | b;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      return (b << 24) | (g << 16) | (r << 8) | a;
-   default:
-      ASSERT(0);
-      return 0;
-   }
-}
-
-
-static uint
-do_depth_test(struct setup_stage *setup, int x, int y, unsigned mask)
-{
-   int ix = x - setup->cliprect_minx;
-   int iy = y - setup->cliprect_miny;
-   float zvals[4];
-
-   eval_z(setup, (float) x, (float) y, zvals);
-
-   if (tile_status_z[setup->ty][setup->tx] == TILE_STATUS_CLEAR) {
-      /* now, _really_ clear the tile */
-      clear_z_tile(&ztile);
-   }
-   else {
-      /* make sure we've got the tile from main mem */
-      wait_on_mask(1 << TAG_READ_TILE_Z);
-   }
-   tile_status_z[setup->ty][setup->tx] = TILE_STATUS_DIRTY;
+   float4 zvals;
+   mask_t mask;
 
+   zvals.v = eval_z((float) x, (float) y);
 
    if (spu.fb.depth_format == PIPE_FORMAT_Z16_UNORM) {
-      const float zscale = 65535.0;
-      if (mask & MASK_TOP_LEFT) {
-         uint z = (uint) (zvals[0] * zscale);
-         if (z < ztile.t16[iy][ix])
-            ztile.t16[iy][ix] = z;
-         else
-            mask &= ~MASK_TOP_LEFT;
-      }
-
-      if (mask & MASK_TOP_RIGHT) {
-         uint z = (uint) (zvals[1] * zscale);
-         if (z < ztile.t16[iy][ix+1])
-            ztile.t16[iy][ix+1] = z;
-         else
-            mask &= ~MASK_TOP_RIGHT;
-      }
-
-      if (mask & MASK_BOTTOM_LEFT) {
-         uint z = (uint) (zvals[2] * zscale);
-         if (z < ztile.t16[iy+1][ix])
-            ztile.t16[iy+1][ix] = z;
-         else
-            mask &= ~MASK_BOTTOM_LEFT;
-      }
-
-      if (mask & MASK_BOTTOM_RIGHT) {
-         uint z = (uint) (zvals[3] * zscale);
-         if (z < ztile.t16[iy+1][ix+1])
-            ztile.t16[iy+1][ix+1] = z;
-         else
-            mask &= ~MASK_BOTTOM_RIGHT;
-      }
+      int ix = (x - setup.cliprect_minx) / 4;
+      int iy = (y - setup.cliprect_miny) / 2;
+      mask = spu_z16_test_less(zvals.v, &spu.ztile.us8[iy][ix], x>>1, quadmask);
    }
    else {
-      const float zscale = (float) 0xffffffff;
-      ASSERT(spu.fb.depth_format == PIPE_FORMAT_Z32_UNORM);
-      if (mask & MASK_TOP_LEFT) {
-         uint z = (uint) (zvals[0] * zscale);
-         if (z < ztile.t32[iy][ix])
-            ztile.t32[iy][ix] = z;
-         else
-            mask &= ~MASK_TOP_LEFT;
-      }
-
-      if (mask & MASK_TOP_RIGHT) {
-         uint z = (uint) (zvals[1] * zscale);
-         if (z < ztile.t32[iy][ix+1])
-            ztile.t32[iy][ix+1] = z;
-         else
-            mask &= ~MASK_TOP_RIGHT;
-      }
-
-      if (mask & MASK_BOTTOM_LEFT) {
-         uint z = (uint) (zvals[2] * zscale);
-         if (z < ztile.t32[iy+1][ix])
-            ztile.t32[iy+1][ix] = z;
-         else
-            mask &= ~MASK_BOTTOM_LEFT;
-      }
-
-      if (mask & MASK_BOTTOM_RIGHT) {
-         uint z = (uint) (zvals[3] * zscale);
-         if (z < ztile.t32[iy+1][ix+1])
-            ztile.t32[iy+1][ix+1] = z;
-         else
-            mask &= ~MASK_BOTTOM_RIGHT;
-      }
+      int ix = (x - setup.cliprect_minx) / 2;
+      int iy = (y - setup.cliprect_miny) / 2;
+      mask = spu_z32_test_less(zvals.v, &spu.ztile.ui4[iy][ix], quadmask);
    }
 
+   if (spu_extract(spu_orx(mask), 0))
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
+
    return mask;
 }
 
 
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
+ * Note: about 1/5 to 1/7 of the time, mask is zero and this function
+ * should be skipped.  But adding the test for that slows things down
+ * overall.
  */
 static INLINE void
-emit_quad( struct setup_stage *setup, int x, int y, unsigned mask )
+emit_quad( int x, int y, mask_t mask )
 {
 #if 0
-   struct softpipe_context *sp = setup->softpipe;
-   setup->quad.x0 = x;
-   setup->quad.y0 = y;
-   setup->quad.mask = mask;
-   sp->quad.first->run(sp->quad.first, &setup->quad);
+   struct softpipe_context *sp = setup.softpipe;
+   setup.quad.x0 = x;
+   setup.quad.y0 = y;
+   setup.quad.mask = mask;
+   sp->quad.first->run(sp->quad.first, &setup.quad);
 #else
-   /* Cell: "write" quad fragments to the tile by setting prim color */
-   const int ix = x - setup->cliprect_minx;
-   const int iy = y - setup->cliprect_miny;
-   float colors[4][4];
-
-   eval_coeff(setup, 1, (float) x, (float) y, colors);
 
    if (spu.depth_stencil.depth.enabled) {
-      mask &= do_depth_test(setup, x, y, mask);
+      mask = do_depth_test(x, y, mask);
    }
 
-   if (mask) {
-      if (tile_status[setup->ty][setup->tx] == TILE_STATUS_CLEAR) {
-         /* now, _really_ clear the tile */
-         clear_c_tile(&ctile);
+   /* If any bits in mask are set... */
+   if (spu_extract(spu_orx(mask), 0)) {
+      const int ix = x - setup.cliprect_minx;
+      const int iy = y - setup.cliprect_miny;
+      const vector unsigned char shuffle = spu.color_shuffle;
+      vector float colors[4];
+
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+
+      if (spu.texture.start) {
+         /* texture mapping */
+         vector float texcoords[4];
+         eval_coeff(2, (float) x, (float) y, texcoords);
+
+         if (spu_extract(mask, 0))
+            colors[0] = spu.sample_texture(texcoords[0]);
+         if (spu_extract(mask, 1))
+            colors[1] = spu.sample_texture(texcoords[1]);
+         if (spu_extract(mask, 2))
+            colors[2] = spu.sample_texture(texcoords[2]);
+         if (spu_extract(mask, 3))
+            colors[3] = spu.sample_texture(texcoords[3]);
       }
       else {
-         /* make sure we've got the tile from main mem */
-         wait_on_mask(1 << TAG_READ_TILE_COLOR);
+         /* simple shading */
+         eval_coeff(1, (float) x, (float) y, colors);
       }
-      tile_status[setup->ty][setup->tx] = TILE_STATUS_DIRTY;
 
-      if (mask & MASK_TOP_LEFT)
-         ctile.t32[iy][ix] = pack_color(colors[QUAD_TOP_LEFT]);
-      if (mask & MASK_TOP_RIGHT)
-         ctile.t32[iy][ix+1] = pack_color(colors[QUAD_TOP_RIGHT]);
-      if (mask & MASK_BOTTOM_LEFT)
-         ctile.t32[iy+1][ix] = pack_color(colors[QUAD_BOTTOM_LEFT]);
-      if (mask & MASK_BOTTOM_RIGHT)
-         ctile.t32[iy+1][ix+1] = pack_color(colors[QUAD_BOTTOM_RIGHT]);
+#if 1
+      if (spu.blend.blend_enable)
+         blend_quad(ix % TILE_SIZE, iy % TILE_SIZE, colors);
+#endif
+
+      if (spu_extract(mask, 0))
+         spu.ctile.ui[iy][ix] = spu_pack_color_shuffle(colors[0], shuffle);
+      if (spu_extract(mask, 1))
+         spu.ctile.ui[iy][ix+1] = spu_pack_color_shuffle(colors[1], shuffle);
+      if (spu_extract(mask, 2))
+         spu.ctile.ui[iy+1][ix] = spu_pack_color_shuffle(colors[2], shuffle);
+      if (spu_extract(mask, 3))
+         spu.ctile.ui[iy+1][ix+1] = spu_pack_color_shuffle(colors[3], shuffle);
+
+#if 0
+      /* SIMD_Z with swizzled color buffer (someday) */
+      vector unsigned int uicolors = *((vector unsigned int *) &colors);
+      spu.ctile.ui4[iy/2][ix/2] = spu_sel(spu.ctile.ui4[iy/2][ix/2], uicolors, mask);
+#endif
    }
+
 #endif
 }
 
@@ -407,26 +369,19 @@ static INLINE int block( int x )
 /**
  * Compute mask which indicates which pixels in the 2x2 quad are actually inside
  * the triangle's bounds.
- *
- * this is pretty nasty...  may need to rework flush_spans again to
- * fix it, if possible.
+ * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
  */
-static unsigned calculate_mask( struct setup_stage *setup, int x )
+static INLINE mask_t calculate_mask( int x )
 {
-   unsigned mask = 0x0;
-
-   if (x >= setup->span.left[0] && x < setup->span.right[0]) 
-      mask |= MASK_TOP_LEFT;
-
-   if (x >= setup->span.left[1] && x < setup->span.right[1]) 
-      mask |= MASK_BOTTOM_LEFT;
-      
-   if (x+1 >= setup->span.left[0] && x+1 < setup->span.right[0]) 
-      mask |= MASK_TOP_RIGHT;
-
-   if (x+1 >= setup->span.left[1] && x+1 < setup->span.right[1]) 
-      mask |= MASK_BOTTOM_RIGHT;
-
+   /* This is a little tricky.
+    * Use & instead of && to avoid branches.
+    * Use negation to convert true/false to ~0/0 values.
+    */
+   mask_t mask;
+   mask = spu_insert(-((x   >= setup.span.left[0]) & (x   < setup.span.right[0])), mask, 0);
+   mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1);
+   mask = spu_insert(-((x   >= setup.span.left[1]) & (x   < setup.span.right[1])), mask, 2);
+   mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3);
    return mask;
 }
 
@@ -434,144 +389,175 @@ static unsigned calculate_mask( struct setup_stage *setup, int x )
 /**
  * Render a horizontal span of quads
  */
-static void flush_spans( struct setup_stage *setup )
+static void flush_spans( void )
 {
    int minleft, maxright;
    int x;
 
-   switch (setup->span.y_flags) {
+   switch (setup.span.y_flags) {
    case 0x3:
       /* both odd and even lines written (both quad rows) */
-      minleft = MIN2(setup->span.left[0], setup->span.left[1]);
-      maxright = MAX2(setup->span.right[0], setup->span.right[1]);
+      minleft = MIN2(setup.span.left[0], setup.span.left[1]);
+      maxright = MAX2(setup.span.right[0], setup.span.right[1]);
       break;
 
    case 0x1:
       /* only even line written (quad top row) */
-      minleft = setup->span.left[0];
-      maxright = setup->span.right[0];
+      minleft = setup.span.left[0];
+      maxright = setup.span.right[0];
       break;
 
    case 0x2:
       /* only odd line written (quad bottom row) */
-      minleft = setup->span.left[1];
-      maxright = setup->span.right[1];
+      minleft = setup.span.left[1];
+      maxright = setup.span.right[1];
       break;
 
    default:
       return;
    }
 
+
+   /* OK, we're very likely to need the tile data now.
+    * clear or finish waiting if needed.
+    */
+   if (spu.cur_ctile_status == TILE_STATUS_GETTING) {
+      /* wait for mfc_get() to complete */
+      //printf("SPU: %u: waiting for ctile\n", spu.init.id);
+      wait_on_mask(1 << TAG_READ_TILE_COLOR);
+      spu.cur_ctile_status = TILE_STATUS_CLEAN;
+   }
+   else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) {
+      //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
+      clear_c_tile(&spu.ctile);
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
+   }
+   ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED);
+
+   if (spu.depth_stencil.depth.enabled) {
+      if (spu.cur_ztile_status == TILE_STATUS_GETTING) {
+         /* wait for mfc_get() to complete */
+         //printf("SPU: %u: waiting for ztile\n", spu.init.id);
+         wait_on_mask(1 << TAG_READ_TILE_Z);
+         spu.cur_ztile_status = TILE_STATUS_CLEAN;
+      }
+      else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) {
+         //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty);
+         clear_z_tile(&spu.ztile);
+         spu.cur_ztile_status = TILE_STATUS_DIRTY;
+      }
+      ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED);
+   }
+
    /* XXX this loop could be moved into the above switch cases and
     * calculate_mask() could be simplified a bit...
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
-      emit_quad( setup, x, setup->span.y, 
-                 calculate_mask( setup, x ) );
+#if 1
+      emit_quad( x, setup.span.y, calculate_mask( x ) );
+#endif
    }
 
-   setup->span.y = 0;
-   setup->span.y_flags = 0;
-   setup->span.right[0] = 0;
-   setup->span.right[1] = 0;
+   setup.span.y = 0;
+   setup.span.y_flags = 0;
+   setup.span.right[0] = 0;
+   setup.span.right[1] = 0;
 }
 
 #if DEBUG_VERTS
-static void print_vertex(const struct setup_stage *setup,
-                         const struct vertex_header *v)
+static void print_vertex(const struct vertex_header *v)
 {
    int i;
    fprintf(stderr, "Vertex: (%p)\n", v);
-   for (i = 0; i < setup->quad.nr_attrs; i++) {
+   for (i = 0; i < setup.quad.nr_attrs; i++) {
       fprintf(stderr, "  %d: %f %f %f %f\n",  i, 
               v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]);
    }
 }
 #endif
 
-static boolean setup_sort_vertices( struct setup_stage *setup,
-				      const struct prim_header *prim )
+
+static boolean setup_sort_vertices(const struct vertex_header *v0,
+                                   const struct vertex_header *v1,
+                                   const struct vertex_header *v2)
 {
-   const struct vertex_header *v0 = prim->v[0];
-   const struct vertex_header *v1 = prim->v[1];
-   const struct vertex_header *v2 = prim->v[2];
 
 #if DEBUG_VERTS
    fprintf(stderr, "Triangle:\n");
-   print_vertex(setup, v0);
-   print_vertex(setup, v1);
-   print_vertex(setup, v2);
+   print_vertex(v0);
+   print_vertex(v1);
+   print_vertex(v2);
 #endif
 
-   setup->vprovoke = v2;
+   setup.vprovoke = v2;
 
    /* determine bottom to top order of vertices */
    {
-      float y0 = v0->data[0][1];
-      float y1 = v1->data[0][1];
-      float y2 = v2->data[0][1];
+      float y0 = spu_extract(v0->data[0], 1);
+      float y1 = spu_extract(v1->data[0], 1);
+      float y2 = spu_extract(v2->data[0], 1);
       if (y0 <= y1) {
 	 if (y1 <= y2) {
 	    /* y0<=y1<=y2 */
-	    setup->vmin = v0;   
-	    setup->vmid = v1;   
-	    setup->vmax = v2;
+	    setup.vmin = v0;   
+	    setup.vmid = v1;   
+	    setup.vmax = v2;
 	 }
 	 else if (y2 <= y0) {
 	    /* y2<=y0<=y1 */
-	    setup->vmin = v2;   
-	    setup->vmid = v0;   
-	    setup->vmax = v1;   
+	    setup.vmin = v2;   
+	    setup.vmid = v0;   
+	    setup.vmax = v1;   
 	 }
 	 else {
 	    /* y0<=y2<=y1 */
-	    setup->vmin = v0;   
-	    setup->vmid = v2;   
-	    setup->vmax = v1;  
+	    setup.vmin = v0;   
+	    setup.vmid = v2;   
+	    setup.vmax = v1;  
 	 }
       }
       else {
 	 if (y0 <= y2) {
 	    /* y1<=y0<=y2 */
-	    setup->vmin = v1;   
-	    setup->vmid = v0;   
-	    setup->vmax = v2;  
+	    setup.vmin = v1;   
+	    setup.vmid = v0;   
+	    setup.vmax = v2;  
 	 }
 	 else if (y2 <= y1) {
 	    /* y2<=y1<=y0 */
-	    setup->vmin = v2;   
-	    setup->vmid = v1;   
-	    setup->vmax = v0;  
+	    setup.vmin = v2;   
+	    setup.vmid = v1;   
+	    setup.vmax = v0;  
 	 }
 	 else {
 	    /* y1<=y2<=y0 */
-	    setup->vmin = v1;   
-	    setup->vmid = v2;   
-	    setup->vmax = v0;
+	    setup.vmin = v1;   
+	    setup.vmid = v2;   
+	    setup.vmax = v0;
 	 }
       }
    }
 
    /* Check if triangle is completely outside the tile bounds */
-   if (setup->vmin->data[0][1] > setup->cliprect_maxy)
+   if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy)
       return FALSE;
-   if (setup->vmax->data[0][1] < setup->cliprect_miny)
+   if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny)
       return FALSE;
-   if (setup->vmin->data[0][0] < setup->cliprect_minx &&
-       setup->vmid->data[0][0] < setup->cliprect_minx &&
-       setup->vmax->data[0][0] < setup->cliprect_minx)
+   if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx &&
+       spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx &&
+       spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx)
       return FALSE;
-   if (setup->vmin->data[0][0] > setup->cliprect_maxx &&
-       setup->vmid->data[0][0] > setup->cliprect_maxx &&
-       setup->vmax->data[0][0] > setup->cliprect_maxx)
+   if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx &&
+       spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx &&
+       spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx)
       return FALSE;
 
-   setup->ebot.dx = setup->vmid->data[0][0] - setup->vmin->data[0][0];
-   setup->ebot.dy = setup->vmid->data[0][1] - setup->vmin->data[0][1];
-   setup->emaj.dx = setup->vmax->data[0][0] - setup->vmin->data[0][0];
-   setup->emaj.dy = setup->vmax->data[0][1] - setup->vmin->data[0][1];
-   setup->etop.dx = setup->vmax->data[0][0] - setup->vmid->data[0][0];
-   setup->etop.dy = setup->vmax->data[0][1] - setup->vmid->data[0][1];
+   setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
+   setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
+   setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0);
+   setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1);
+   setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0);
+   setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1);
 
    /*
     * Compute triangle's area.  Use 1/area to compute partial
@@ -584,13 +570,13 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
     * use the prim->det value because its sign is correct.
     */
    {
-      const float area = (setup->emaj.dx * setup->ebot.dy - 
-			    setup->ebot.dx * setup->emaj.dy);
+      const float area = (setup.emaj.dx * setup.ebot.dy - 
+			    setup.ebot.dx * setup.emaj.dy);
 
-      setup->oneoverarea = 1.0f / area;
+      setup.oneoverarea = 1.0f / area;
       /*
       _mesa_printf("%s one-over-area %f  area %f  det %f\n",
-                   __FUNCTION__, setup->oneoverarea, area, prim->det );
+                   __FUNCTION__, setup.oneoverarea, area, prim->det );
       */
    }
 
@@ -599,56 +585,52 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
     *  - the GLSL gl_FrontFacing fragment attribute (bool)
     *  - two-sided stencil test
     */
-   setup->quad.facing = (prim->det > 0.0) ^ (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
+   setup.quad.facing = (prim->det > 0.0) ^ (setup.softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
 #endif
 
    return TRUE;
 }
 
 
-#if 0
 /**
  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
- * The value value comes from vertex->data[slot][i].
- * The result will be put into setup->coef[slot].a0[i].
+ * The value value comes from vertex->data[slot].
+ * The result will be put into setup.coef[slot].a0.
  * \param slot  which attribute slot 
- * \param i  which component of the slot (0..3)
  */
-static void const_coeff( struct setup_stage *setup,
-			 unsigned slot,
-			 unsigned i )
+static INLINE void
+const_coeff(uint slot)
 {
-   assert(slot < PIPE_MAX_SHADER_INPUTS);
-   assert(i <= 3);
-
-   setup->coef[slot].dadx[i] = 0;
-   setup->coef[slot].dady[i] = 0;
-
-   /* need provoking vertex info!
-    */
-   setup->coef[slot].a0[i] = setup->vprovoke->data[slot][i];
+   setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
+   setup.coef[slot].a0.v = setup.vprovoke->data[slot];
 }
-#endif
 
 
 /**
  * Compute a0, dadx and dady for a linearly interpolated coefficient,
  * for a triangle.
  */
-static void tri_linear_coeff( struct setup_stage *setup,
-                              uint slot, uint firstComp, uint lastComp )
+static INLINE void
+tri_linear_coeff(uint slot, uint firstComp, uint lastComp)
 {
    uint i;
+   const float *vmin_d = (float *) &setup.vmin->data[slot];
+   const float *vmid_d = (float *) &setup.vmid->data[slot];
+   const float *vmax_d = (float *) &setup.vmax->data[slot];
+   const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f;
+   const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
+
    for (i = firstComp; i < lastComp; i++) {
-      float botda = setup->vmid->data[slot][i] - setup->vmin->data[slot][i];
-      float majda = setup->vmax->data[slot][i] - setup->vmin->data[slot][i];
-      float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-      float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
+      float botda = vmid_d[i] - vmin_d[i];
+      float majda = vmax_d[i] - vmin_d[i];
+      float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
+      float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
    
       ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
 
-      setup->coef[slot].dadx[i] = a * setup->oneoverarea;
-      setup->coef[slot].dady[i] = b * setup->oneoverarea;
+      setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
+      setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
 
       /* calculate a0 as the value which would be sampled for the
        * fragment at (0,0), taking into account that we want to sample at
@@ -662,21 +644,52 @@ static void tri_linear_coeff( struct setup_stage *setup,
        * to define a0 as the sample at a pixel center somewhere near vmin
        * instead - i'll switch to this later.
        */
-      setup->coef[slot].a0[i] = (setup->vmin->data[slot][i] - 
-                                 (setup->coef[slot].dadx[i] * (setup->vmin->data[0][0] - 0.5f) + 
-                                  setup->coef[slot].dady[i] * (setup->vmin->data[0][1] - 0.5f)));
+      setup.coef[slot].a0.f[i] = (vmin_d[i] - 
+                                 (setup.coef[slot].dadx.f[i] * x + 
+                                  setup.coef[slot].dady.f[i] * y));
    }
 
    /*
    _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
 		slot, "xyzw"[i], 
-		setup->coef[slot].a0[i],
-		setup->coef[slot].dadx[i],
-		setup->coef[slot].dady[i]);
+		setup.coef[slot].a0[i],
+		setup.coef[slot].dadx.f[i],
+		setup.coef[slot].dady.f[i]);
    */
 }
 
 
+/**
+ * As above, but interp setup all four vector components.
+ */
+static INLINE void
+tri_linear_coeff4(uint slot)
+{
+   const vector float vmin_d = setup.vmin->data[slot];
+   const vector float vmid_d = setup.vmid->data[slot];
+   const vector float vmax_d = setup.vmax->data[slot];
+   const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
+   const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
+
+   vector float botda = vmid_d - vmin_d;
+   vector float majda = vmax_d - vmin_d;
+
+   vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
+                            spu_mul(botda, spu_splats(setup.emaj.dy)));
+   vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
+                            spu_mul(majda, spu_splats(setup.ebot.dx)));
+
+   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
+   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
+
+   vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+                         
+   setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
+}
+
+
+
 #if 0
 /**
  * Compute a0, dadx and dady for a perspective-corrected interpolant,
@@ -686,46 +699,45 @@ static void tri_linear_coeff( struct setup_stage *setup,
  * Later, when we compute the value at a particular fragment position we'll
  * divide the interpolated value by the interpolated W at that fragment.
  */
-static void tri_persp_coeff( struct setup_stage *setup,
-                             unsigned slot,
+static void tri_persp_coeff( unsigned slot,
                              unsigned i )
 {
    /* premultiply by 1/w:
     */
-   float mina = setup->vmin->data[slot][i] * setup->vmin->data[0][3];
-   float mida = setup->vmid->data[slot][i] * setup->vmid->data[0][3];
-   float maxa = setup->vmax->data[slot][i] * setup->vmax->data[0][3];
+   float mina = setup.vmin->data[slot][i] * setup.vmin->data[0][3];
+   float mida = setup.vmid->data[slot][i] * setup.vmid->data[0][3];
+   float maxa = setup.vmax->data[slot][i] * setup.vmax->data[0][3];
 
    float botda = mida - mina;
    float majda = maxa - mina;
-   float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-   float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
+   float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
+   float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
       
    /*
    printf("tri persp %d,%d: %f %f %f\n", slot, i,
-          setup->vmin->data[slot][i],
-          setup->vmid->data[slot][i],
-          setup->vmax->data[slot][i]
+          setup.vmin->data[slot][i],
+          setup.vmid->data[slot][i],
+          setup.vmax->data[slot][i]
           );
    */
 
    assert(slot < PIPE_MAX_SHADER_INPUTS);
    assert(i <= 3);
 
-   setup->coef[slot].dadx[i] = a * setup->oneoverarea;
-   setup->coef[slot].dady[i] = b * setup->oneoverarea;
-   setup->coef[slot].a0[i] = (mina - 
-			    (setup->coef[slot].dadx[i] * (setup->vmin->data[0][0] - 0.5f) + 
-			     setup->coef[slot].dady[i] * (setup->vmin->data[0][1] - 0.5f)));
+   setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
+   setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
+   setup.coef[slot].a0.f[i] = (mina - 
+			    (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
+			     setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
 }
 #endif
 
 
 /**
- * Compute the setup->coef[] array dadx, dady, a0 values.
- * Must be called after setup->vmin,vmid,vmax,vprovoke are initialized.
+ * Compute the setup.coef[] array dadx, dady, a0 values.
+ * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
  */
-static void setup_tri_coefficients( struct setup_stage *setup )
+static void setup_tri_coefficients(void)
 {
 #if 1
    uint i;
@@ -735,15 +747,18 @@ static void setup_tri_coefficients( struct setup_stage *setup )
       case INTERP_NONE:
          break;
       case INTERP_POS:
-         tri_linear_coeff(setup, i, 2, 3);  /* slot 0, z */
+         /*tri_linear_coeff(i, 2, 3);*/
          /* XXX interp W if PERSPECTIVE... */
+         tri_linear_coeff4(i);
          break;
       case INTERP_CONSTANT:
-         /* fall-through */
+         const_coeff(i);
+         break;
       case INTERP_LINEAR:
-         tri_linear_coeff(setup, i, 0, 4);  /* slot 1, color */
+         tri_linear_coeff4(i);
          break;
       case INTERP_PERSPECTIVE:
+         tri_linear_coeff4(i);  /* temporary */
          break;
       default:
          ASSERT(0);
@@ -753,35 +768,35 @@ static void setup_tri_coefficients( struct setup_stage *setup )
    ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS);
    ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR ||
           spu.vertex_info.interp_mode[1] == INTERP_CONSTANT);
-   tri_linear_coeff(setup, 0, 2, 3);  /* slot 0, z */
-   tri_linear_coeff(setup, 1, 0, 4);  /* slot 1, color */
+   tri_linear_coeff(0, 2, 3);  /* slot 0, z */
+   tri_linear_coeff(1, 0, 4);  /* slot 1, color */
 #endif
 }
 
 
-static void setup_tri_edges( struct setup_stage *setup )
+static void setup_tri_edges(void)
 {
-   float vmin_x = setup->vmin->data[0][0] + 0.5f;
-   float vmid_x = setup->vmid->data[0][0] + 0.5f;
+   float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
+   float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
 
-   float vmin_y = setup->vmin->data[0][1] - 0.5f;
-   float vmid_y = setup->vmid->data[0][1] - 0.5f;
-   float vmax_y = setup->vmax->data[0][1] - 0.5f;
+   float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
+   float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f;
+   float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f;
 
-   setup->emaj.sy = CEILF(vmin_y);
-   setup->emaj.lines = (int) CEILF(vmax_y - setup->emaj.sy);
-   setup->emaj.dxdy = setup->emaj.dx / setup->emaj.dy;
-   setup->emaj.sx = vmin_x + (setup->emaj.sy - vmin_y) * setup->emaj.dxdy;
+   setup.emaj.sy = CEILF(vmin_y);
+   setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy);
+   setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy;
+   setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy;
 
-   setup->etop.sy = CEILF(vmid_y);
-   setup->etop.lines = (int) CEILF(vmax_y - setup->etop.sy);
-   setup->etop.dxdy = setup->etop.dx / setup->etop.dy;
-   setup->etop.sx = vmid_x + (setup->etop.sy - vmid_y) * setup->etop.dxdy;
+   setup.etop.sy = CEILF(vmid_y);
+   setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy);
+   setup.etop.dxdy = setup.etop.dx / setup.etop.dy;
+   setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy;
 
-   setup->ebot.sy = CEILF(vmin_y);
-   setup->ebot.lines = (int) CEILF(vmid_y - setup->ebot.sy);
-   setup->ebot.dxdy = setup->ebot.dx / setup->ebot.dy;
-   setup->ebot.sx = vmin_x + (setup->ebot.sy - vmin_y) * setup->ebot.dxdy;
+   setup.ebot.sy = CEILF(vmin_y);
+   setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy);
+   setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy;
+   setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy;
 }
 
 
@@ -789,15 +804,14 @@ static void setup_tri_edges( struct setup_stage *setup )
  * Render the upper or lower half of a triangle.
  * Scissoring/cliprect is applied here too.
  */
-static void subtriangle( struct setup_stage *setup,
-			 struct edge *eleft,
+static void subtriangle( struct edge *eleft,
 			 struct edge *eright,
 			 unsigned lines )
 {
-   const int minx = setup->cliprect_minx;
-   const int maxx = setup->cliprect_maxx;
-   const int miny = setup->cliprect_miny;
-   const int maxy = setup->cliprect_maxy;
+   const int minx = setup.cliprect_minx;
+   const int maxx = setup.cliprect_maxx;
+   const int miny = setup.cliprect_miny;
+   const int maxy = setup.cliprect_maxy;
    int y, start_y, finish_y;
    int sy = (int)eleft->sy;
 
@@ -839,14 +853,14 @@ static void subtriangle( struct setup_stage *setup,
 
       if (left < right) {
          int _y = sy + y;
-         if (block(_y) != setup->span.y) {
-            flush_spans(setup);
-            setup->span.y = block(_y);
+         if (block(_y) != setup.span.y) {
+            flush_spans();
+            setup.span.y = block(_y);
          }
 
-         setup->span.left[_y&1] = left;
-         setup->span.right[_y&1] = right;
-         setup->span.y_flags |= 1<<(_y&1);
+         setup.span.left[_y&1] = left;
+         setup.span.right[_y&1] = right;
+         setup.span.y_flags |= 1<<(_y&1);
       }
    }
 
@@ -861,70 +875,52 @@ static void subtriangle( struct setup_stage *setup,
 
 
 /**
- * Do setup for triangle rasterization, then render the triangle.
+ * Draw triangle into tile at (tx, ty) (tile coords)
+ * The tile data should have already been fetched.
  */
-static void
-setup_tri(struct setup_stage *setup, struct prim_header *prim)
+boolean
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
 {
-   if (!setup_sort_vertices( setup, prim )) {
-      return; /* totally clipped */
-   }
+   setup.tx = tx;
+   setup.ty = ty;
 
-   setup_tri_coefficients( setup );
-   setup_tri_edges( setup );
+   /* set clipping bounds to tile bounds */
+   setup.cliprect_minx = tx * TILE_SIZE;
+   setup.cliprect_miny = ty * TILE_SIZE;
+   setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
+   setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 
-#if 0
-   setup->quad.prim = PRIM_TRI;
-#endif
+   if (!setup_sort_vertices((struct vertex_header *) v0,
+                            (struct vertex_header *) v1,
+                            (struct vertex_header *) v2)) {
+      return FALSE; /* totally clipped */
+   }
 
-   setup->span.y = 0;
-   setup->span.y_flags = 0;
-   setup->span.right[0] = 0;
-   setup->span.right[1] = 0;
-   /*   setup->span.z_mode = tri_z_mode( setup->ctx ); */
+   setup_tri_coefficients();
+   setup_tri_edges();
+
+   setup.span.y = 0;
+   setup.span.y_flags = 0;
+   setup.span.right[0] = 0;
+   setup.span.right[1] = 0;
+   /*   setup.span.z_mode = tri_z_mode( setup.ctx ); */
 
    /*   init_constant_attribs( setup ); */
       
-   if (setup->oneoverarea < 0.0) {
+   if (setup.oneoverarea < 0.0) {
       /* emaj on left:
        */
-      subtriangle( setup, &setup->emaj, &setup->ebot, setup->ebot.lines );
-      subtriangle( setup, &setup->emaj, &setup->etop, setup->etop.lines );
+      subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
+      subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
    }
    else {
       /* emaj on right:
        */
-      subtriangle( setup, &setup->ebot, &setup->emaj, setup->ebot.lines );
-      subtriangle( setup, &setup->etop, &setup->emaj, setup->etop.lines );
+      subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
+      subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
    }
 
-   flush_spans( setup );
-}
-
-
+   flush_spans();
 
-/**
- * Draw triangle into tile at (tx, ty) (tile coords)
- * The tile data should have already been fetched.
- */
-void
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
-{
-   struct prim_header tri;
-   struct setup_stage setup;
-
-   tri.v[0] = (struct vertex_header *) v0;
-   tri.v[1] = (struct vertex_header *) v1;
-   tri.v[2] = (struct vertex_header *) v2;
-
-   setup.tx = tx;
-   setup.ty = ty;
-
-   /* set clipping bounds to tile bounds */
-   setup.cliprect_minx = tx * TILE_SIZE;
-   setup.cliprect_miny = ty * TILE_SIZE;
-   setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
-   setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
-
-   setup_tri(&setup, &tri);
+   return TRUE;
 }
diff --git a/src/mesa/pipe/cell/spu/spu_tri.h b/src/mesa/pipe/cell/spu/spu_tri.h
index 86c42b6339..aa694dd7c9 100644
--- a/src/mesa/pipe/cell/spu/spu_tri.h
+++ b/src/mesa/pipe/cell/spu/spu_tri.h
@@ -30,7 +30,7 @@
 #define SPU_TRI_H
 
 
-extern void
+extern boolean
 tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
 
 
diff --git a/src/mesa/pipe/cell/spu/spu_util.c b/src/mesa/pipe/cell/spu/spu_util.c
new file mode 100644
index 0000000000..ac373240c1
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_util.c
@@ -0,0 +1,165 @@
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/tgsi/util/tgsi_parse.h"
+//#include "tgsi_build.h"
+#include "pipe/tgsi/util/tgsi_util.h"
+
+unsigned
+tgsi_util_get_src_register_swizzle(
+   const struct tgsi_src_register *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->SwizzleX;
+   case 1:
+      return reg->SwizzleY;
+   case 2:
+      return reg->SwizzleZ;
+   case 3:
+      return reg->SwizzleW;
+   default:
+      assert( 0 );
+   }
+   return 0;
+}
+
+unsigned
+tgsi_util_get_src_register_extswizzle(
+   const struct tgsi_src_register_ext_swz *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->ExtSwizzleX;
+   case 1:
+      return reg->ExtSwizzleY;
+   case 2:
+      return reg->ExtSwizzleZ;
+   case 3:
+      return reg->ExtSwizzleW;
+   default:
+      assert( 0 );
+   }
+   return 0;
+}
+
+unsigned
+tgsi_util_get_full_src_register_extswizzle(
+   const struct tgsi_full_src_register  *reg,
+   unsigned component )
+{
+   unsigned swizzle;
+
+   /*
+    * First, calculate  the   extended swizzle for a given channel. This will give
+    * us either a channel index into the simple swizzle or  a constant 1 or   0.
+    */
+   swizzle = tgsi_util_get_src_register_extswizzle(
+      &reg->SrcRegisterExtSwz,
+      component );
+
+   assert (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
+   assert (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
+   assert (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
+   assert (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
+   assert (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
+   assert (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
+
+   /*
+    * Second, calculate the simple  swizzle  for   the   unswizzled channel index.
+    * Leave the constants intact, they are   not   affected by the   simple swizzle.
+    */
+   if( swizzle <= TGSI_SWIZZLE_W ) {
+      swizzle = tgsi_util_get_src_register_swizzle(
+         &reg->SrcRegister,
+         component );
+   }
+
+   return swizzle;
+}
+
+unsigned
+tgsi_util_get_src_register_extnegate(
+   const  struct tgsi_src_register_ext_swz *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->NegateX;
+   case 1:
+      return reg->NegateY;
+   case 2:
+      return reg->NegateZ;
+   case 3:
+      return reg->NegateW;
+   default:
+      assert( 0 );
+   }
+   return 0;
+}
+
+void
+tgsi_util_set_src_register_extnegate(
+   struct tgsi_src_register_ext_swz *reg,
+   unsigned negate,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      reg->NegateX = negate;
+      break;
+   case 1:
+      reg->NegateY = negate;
+      break;
+   case 2:
+      reg->NegateZ = negate;
+      break;
+   case 3:
+      reg->NegateW = negate;
+      break;
+   default:
+      assert( 0 );
+   }
+}
+
+unsigned
+tgsi_util_get_full_src_register_sign_mode(
+   const struct  tgsi_full_src_register *reg,
+   unsigned component )
+{
+   unsigned sign_mode;
+
+   if( reg->SrcRegisterExtMod.Absolute ) {
+      /* Consider only the post-abs negation. */
+
+      if( reg->SrcRegisterExtMod.Negate ) {
+         sign_mode = TGSI_UTIL_SIGN_SET;
+      }
+      else {
+         sign_mode = TGSI_UTIL_SIGN_CLEAR;
+      }
+   }
+   else {
+      /* Accumulate the three negations. */
+
+      unsigned negate;
+
+      negate = reg->SrcRegister.Negate;
+      if( tgsi_util_get_src_register_extnegate( &reg->SrcRegisterExtSwz, component ) ) {
+         negate = !negate;
+      }
+      if( reg->SrcRegisterExtMod.Negate ) {
+         negate = !negate;
+      }
+
+      if( negate ) {
+         sign_mode = TGSI_UTIL_SIGN_TOGGLE;
+      }
+      else {
+         sign_mode = TGSI_UTIL_SIGN_KEEP;
+      }
+   }
+
+   return sign_mode;
+}
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_fetch.c b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
new file mode 100644
index 0000000000..6e86a919ce
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_vertex_fetch.c
@@ -0,0 +1,393 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include <spu_mfcio.h>
+#include <transpose_matrix4x4.h>
+
+#include "pipe/p_util.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "spu_exec.h"
+#include "spu_vertex_shader.h"
+#include "spu_main.h"
+
+
+#define DRAW_DBG 0
+
+
+static const vec_float4 defaults = { 0.0, 0.0, 0.0, 1.0 };
+
+/**
+ * Fetch a float[4] vertex attribute from memory, doing format/type
+ * conversion as needed.
+ *
+ * This is probably needed/dupliocated elsewhere, eg format
+ * conversion, texture sampling etc.
+ */
+#define FETCH_ATTRIB( NAME, SZ, CVT )			\
+static qword						\
+fetch_##NAME(const void *ptr)				\
+{							\
+   vec_float4 attrib = defaults;			\
+   int i;						\
+							\
+   for (i = 0; i < SZ; i++) {				\
+      attrib = spu_insert(CVT, attrib, i);		\
+   }							\
+   return (qword) attrib;				\
+}
+
+#define CVT_64_FLOAT   (float) ((double *) ptr)[i]
+#define CVT_32_FLOAT   ((float *) ptr)[i]
+
+#define CVT_8_USCALED  (float) ((unsigned char *) ptr)[i]
+#define CVT_16_USCALED (float) ((unsigned short *) ptr)[i]
+#define CVT_32_USCALED (float) ((unsigned int *) ptr)[i]
+
+#define CVT_8_SSCALED  (float) ((char *) ptr)[i]
+#define CVT_16_SSCALED (float) ((short *) ptr)[i]
+#define CVT_32_SSCALED (float) ((int *) ptr)[i]
+
+#define CVT_8_UNORM    (float) ((unsigned char *) ptr)[i] / 255.0f
+#define CVT_16_UNORM   (float) ((unsigned short *) ptr)[i] / 65535.0f
+#define CVT_32_UNORM   (float) ((unsigned int *) ptr)[i] / 4294967295.0f
+
+#define CVT_8_SNORM    (float) ((char *) ptr)[i] / 127.0f
+#define CVT_16_SNORM   (float) ((short *) ptr)[i] / 32767.0f
+#define CVT_32_SNORM   (float) ((int *) ptr)[i] / 2147483647.0f
+
+FETCH_ATTRIB( R64G64B64A64_FLOAT,   4, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64B64_FLOAT,      3, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64_FLOAT,         2, CVT_64_FLOAT )
+FETCH_ATTRIB( R64_FLOAT,            1, CVT_64_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_FLOAT,   4, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32B32_FLOAT,      3, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32_FLOAT,         2, CVT_32_FLOAT )
+FETCH_ATTRIB( R32_FLOAT,            1, CVT_32_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_USCALED, 4, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32B32_USCALED,    3, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32_USCALED,       2, CVT_32_USCALED )
+FETCH_ATTRIB( R32_USCALED,          1, CVT_32_USCALED )
+
+FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32B32_SSCALED,    3, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32_SSCALED,       2, CVT_32_SSCALED )
+FETCH_ATTRIB( R32_SSCALED,          1, CVT_32_SSCALED )
+
+FETCH_ATTRIB( R32G32B32A32_UNORM, 4, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32B32_UNORM,    3, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32_UNORM,       2, CVT_32_UNORM )
+FETCH_ATTRIB( R32_UNORM,          1, CVT_32_UNORM )
+
+FETCH_ATTRIB( R32G32B32A32_SNORM, 4, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32B32_SNORM,    3, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32_SNORM,       2, CVT_32_SNORM )
+FETCH_ATTRIB( R32_SNORM,          1, CVT_32_SNORM )
+
+FETCH_ATTRIB( R16G16B16A16_USCALED, 4, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16B16_USCALED,    3, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16_USCALED,       2, CVT_16_USCALED )
+FETCH_ATTRIB( R16_USCALED,          1, CVT_16_USCALED )
+
+FETCH_ATTRIB( R16G16B16A16_SSCALED, 4, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16B16_SSCALED,    3, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16_SSCALED,       2, CVT_16_SSCALED )
+FETCH_ATTRIB( R16_SSCALED,          1, CVT_16_SSCALED )
+
+FETCH_ATTRIB( R16G16B16A16_UNORM, 4, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16B16_UNORM,    3, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16_UNORM,       2, CVT_16_UNORM )
+FETCH_ATTRIB( R16_UNORM,          1, CVT_16_UNORM )
+
+FETCH_ATTRIB( R16G16B16A16_SNORM, 4, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16B16_SNORM,    3, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16_SNORM,       2, CVT_16_SNORM )
+FETCH_ATTRIB( R16_SNORM,          1, CVT_16_SNORM )
+
+FETCH_ATTRIB( R8G8B8A8_USCALED,   4, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8B8_USCALED,     3, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8_USCALED,       2, CVT_8_USCALED )
+FETCH_ATTRIB( R8_USCALED,         1, CVT_8_USCALED )
+
+FETCH_ATTRIB( R8G8B8A8_SSCALED,  4, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8B8_SSCALED,    3, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8_SSCALED,      2, CVT_8_SSCALED )
+FETCH_ATTRIB( R8_SSCALED,        1, CVT_8_SSCALED )
+
+FETCH_ATTRIB( R8G8B8A8_UNORM,  4, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8B8_UNORM,    3, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8_UNORM,      2, CVT_8_UNORM )
+FETCH_ATTRIB( R8_UNORM,        1, CVT_8_UNORM )
+
+FETCH_ATTRIB( R8G8B8A8_SNORM,  4, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8B8_SNORM,    3, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8_SNORM,      2, CVT_8_SNORM )
+FETCH_ATTRIB( R8_SNORM,        1, CVT_8_SNORM )
+
+FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
+//FETCH_ATTRIB( R8G8B8A8_UNORM,       4, CVT_8_UNORM )
+
+
+
+static spu_fetch_func get_fetch_func( enum pipe_format format )
+{
+#if 0
+   {
+      char tmp[80];
+      pf_sprint_name(tmp, format);
+      _mesa_printf("%s: %s\n", __FUNCTION__, tmp);
+   }
+#endif
+
+   switch (format) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return fetch_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return fetch_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return fetch_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return fetch_R64G64B64A64_FLOAT;
+
+   case PIPE_FORMAT_R32_FLOAT:
+      return fetch_R32_FLOAT;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return fetch_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return fetch_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return fetch_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return fetch_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return fetch_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return fetch_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return fetch_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return fetch_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return fetch_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return fetch_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return fetch_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return fetch_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return fetch_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return fetch_R32G32B32A32_SNORM;
+
+   case PIPE_FORMAT_R32_SSCALED:
+      return fetch_R32_SSCALED;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return fetch_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return fetch_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return fetch_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return fetch_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return fetch_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return fetch_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return fetch_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return fetch_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return fetch_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return fetch_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return fetch_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return fetch_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return fetch_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return fetch_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return fetch_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return fetch_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return fetch_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return fetch_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return fetch_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return fetch_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return fetch_R8G8B8_UNORM;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return fetch_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return fetch_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return fetch_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return fetch_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return fetch_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return fetch_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return fetch_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return fetch_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return fetch_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return fetch_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return fetch_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return fetch_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return fetch_R8G8B8A8_SSCALED;
+
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return fetch_A8R8G8B8_UNORM;
+
+   case 0:
+      return NULL;		/* not sure why this is needed */
+
+   default:
+      assert(0);
+      return NULL;
+   }
+}
+
+
+/**
+ * Fetch vertex attributes for 'count' vertices.
+ */
+static void generic_vertex_fetch(struct spu_vs_context *draw,
+                                 struct spu_exec_machine *machine,
+                                 const unsigned *elts,
+                                 unsigned count)
+{
+   unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
+   unsigned attr;
+
+   assert(count <= 4);
+
+   wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+#if DRAW_DBG
+   printf("SPU: %s count = %u, nr_attrs = %u\n", 
+          __FUNCTION__, count, nr_attrs);
+#endif
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+   for (attr = 0; attr < nr_attrs; attr++) {
+      const unsigned pitch = draw->vertex_fetch.pitch[attr];
+      const uint64_t src = draw->vertex_fetch.src_ptr[attr];
+      const spu_fetch_func fetch = draw->vertex_fetch.fetch[attr];
+      unsigned i;
+      qword p[4];
+
+
+      /* Fetch four attributes for four vertices.  
+       * 
+       * Could fetch directly into AOS format, but this is meant to be
+       * a prototype for an sse implementation, which would have
+       * difficulties doing that.
+       */
+      for (i = 0; i < count; i++) {
+         uint8_t buffer[32] ALIGN16_ATTRIB;
+         const uint64_t addr = src + (elts[i] * pitch);
+         const unsigned size = ((addr & 0x0f) == 0) ? 16 : 32;
+
+#if DRAW_DBG
+         printf("SPU: fetching = 0x%llx\n", addr);
+#endif
+         mfc_get(buffer, addr & ~0x0f, size, TAG_VERTEX_BUFFER, 0, 0);
+         wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+         p[i] = (*fetch)(buffer + (addr & 0x0f));
+      }
+
+      /* Be nice and zero out any missing vertices: 
+       */
+      for (/* empty */; i < 4; i++) 
+          p[i] = si_xor(p[i], p[i]);
+
+      /* Transpose/swizzle into vector-friendly format.  Currently
+       * assuming that all vertex shader inputs are float[4], but this
+       * isn't true -- if the vertex shader only wants tex0.xy, we
+       * could optimize for that.
+       *
+       * To do so fully without codegen would probably require an
+       * excessive number of fetch functions, but we could at least
+       * minimize the transpose step:
+       */
+      _transpose_matrix4x4(&machine->Inputs[attr].xyzw[0].q, p);
+   }
+}
+
+
+void spu_update_vertex_fetch( struct spu_vs_context *draw )
+{
+   unsigned i;
+
+   
+   for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
+      draw->vertex_fetch.fetch[i] =
+          get_fetch_func(draw->vertex_fetch.format[i]);
+   }
+
+   draw->vertex_fetch.fetch_func = generic_vertex_fetch;
+}
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.c b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
new file mode 100644
index 0000000000..c1cbbb6d1e
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.c
@@ -0,0 +1,231 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  *   Brian Paul
+  *   Ian Romanick <idr@us.ibm.com>
+  */
+
+#include <spu_mfcio.h>
+
+#include "pipe/p_util.h"
+#include "pipe/p_state.h"
+#include "pipe/p_shader_tokens.h"
+#include "spu_vertex_shader.h"
+#include "spu_exec.h"
+#include "pipe/draw/draw_private.h"
+#include "pipe/draw/draw_context.h"
+#include "pipe/cell/common.h"
+#include "spu_main.h"
+
+static INLINE unsigned
+compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
+{
+   unsigned mask = 0;
+   unsigned i;
+
+   /* Do the hardwired planes first:
+    */
+   if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT;
+   if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT;
+   if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT;
+   if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT;
+   if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT;
+   if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT;
+
+   /* Followed by any remaining ones:
+    */
+   for (i = 6; i < nr; i++) {
+      if (dot4(clip, plane[i]) < 0) 
+         mask |= (1<<i);
+   }
+
+   return mask;
+}
+
+
+/**
+ * Transform vertices with the current vertex program/shader
+ * Up to four vertices can be shaded at a time.
+ * \param vbuffer  the input vertex data
+ * \param elts  indexes of four input vertices
+ * \param count  number of vertices to shade [1..4]
+ * \param vOut  array of pointers to four output vertices
+ */
+static void
+run_vertex_program(struct spu_vs_context *draw,
+                   unsigned elts[4], unsigned count,
+                   const uint64_t *vOut)
+{
+   struct spu_exec_machine *machine = &draw->machine;
+   unsigned int j;
+
+   ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_ATTRIB_MAX);
+   ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_ATTRIB_MAX);
+   const float *scale = draw->viewport.scale;
+   const float *trans = draw->viewport.translate;
+
+   assert(count <= 4);
+
+   machine->Processor = TGSI_PROCESSOR_VERTEX;
+
+   ASSERT_ALIGN16(draw->constants);
+   machine->Consts = (float (*)[4]) draw->constants;
+
+   machine->Inputs = ALIGN16_ASSIGN(inputs);
+   machine->Outputs = ALIGN16_ASSIGN(outputs);
+
+   spu_vertex_fetch( draw, machine, elts, count );
+
+   /* run shader */
+   spu_exec_machine_run( machine );
+
+
+   /* store machine results */
+   for (j = 0; j < count; j++) {
+      unsigned slot;
+      float x, y, z, w;
+      unsigned char buffer[sizeof(struct vertex_header)
+          + MAX_VERTEX_SIZE] ALIGN16_ATTRIB;
+      struct vertex_header *const tmpOut =
+          (struct vertex_header *) buffer;
+      const unsigned vert_size = ROUNDUP16(sizeof(struct vertex_header)
+                                           + (sizeof(float) * 4 
+                                              * draw->num_vs_outputs));
+
+      mfc_get(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
+      wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+
+      /* Handle attr[0] (position) specially:
+       *
+       * XXX: Computing the clipmask should be done in the vertex
+       * program as a set of DP4 instructions appended to the
+       * user-provided code.
+       */
+      x = tmpOut->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+      y = tmpOut->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+      z = tmpOut->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+      w = tmpOut->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+
+      tmpOut->clipmask = compute_clipmask(tmpOut->clip, draw->plane,
+					   draw->nr_planes);
+      tmpOut->edgeflag = 1;
+
+      /* divide by w */
+      w = 1.0f / w;
+      x *= w;
+      y *= w;
+      z *= w;
+
+      /* Viewport mapping */
+      tmpOut->data[0][0] = x * scale[0] + trans[0];
+      tmpOut->data[0][1] = y * scale[1] + trans[1];
+      tmpOut->data[0][2] = z * scale[2] + trans[2];
+      tmpOut->data[0][3] = w;
+
+      /* Remaining attributes are packed into sequential post-transform
+       * vertex attrib slots.
+       */
+      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
+         tmpOut->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+         tmpOut->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+         tmpOut->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+         tmpOut->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+      }
+
+      mfc_put(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0);
+   } /* loop over vertices */
+}
+
+
+static void
+spu_bind_vertex_shader(struct spu_vs_context *draw,
+		       void *uniforms,
+		       void *planes,
+		       unsigned nr_planes,
+		       unsigned num_outputs
+		       )
+{
+   draw->constants = (float (*)[4]) uniforms;
+
+   (void) memcpy(draw->plane, planes, sizeof(float) * 4 * nr_planes);
+   draw->nr_planes = nr_planes;
+   draw->num_vs_outputs = num_outputs;
+
+   /* specify the shader to interpret/execute */
+   spu_exec_machine_init(&draw->machine,
+			 PIPE_MAX_SAMPLERS,
+			 NULL /*samplers*/,
+			 PIPE_SHADER_VERTEX);
+}
+
+
+unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32]
+    ALIGN16_ATTRIB;
+
+void
+spu_execute_vertex_shader(struct spu_vs_context *draw,
+                          const struct cell_command_vs *vs)
+{
+   unsigned i;
+
+   const uint64_t immediate_addr = vs->shader.immediates;
+   const unsigned immediate_size = 
+       ROUNDUP16((sizeof(float) * 4 * vs->shader.num_immediates)
+                 + (immediate_addr & 0x0f));
+
+   mfc_get(immediates, immediate_addr & ~0x0f, immediate_size,
+           TAG_VERTEX_BUFFER, 0, 0);
+
+   draw->machine.Instructions = (struct tgsi_full_instruction *)
+       vs->shader.instructions;
+   draw->machine.NumInstructions = vs->shader.num_instructions;
+
+   draw->machine.Declarations = (struct tgsi_full_declaration *)
+       vs->shader.declarations;
+   draw->machine.NumDeclarations = vs->shader.num_declarations;
+
+   draw->vertex_fetch.nr_attrs = vs->nr_attrs;
+
+   wait_on_mask(1 << TAG_VERTEX_BUFFER);
+
+   (void) memcpy(& draw->machine.Imms, &immediates[immediate_addr & 0x0f],
+                 sizeof(float) * 4 * vs->shader.num_immediates);
+
+   spu_bind_vertex_shader(draw, vs->shader.uniforms,
+                          vs->plane, vs->nr_planes,
+                          vs->shader.num_outputs);
+
+   for (i = 0; i < vs->num_elts; i += 4) {
+      const unsigned batch_size = MIN2(vs->num_elts - i, 4);
+
+      run_vertex_program(draw, & vs->elts[i], batch_size, &vs->vOut[i]);
+   }
+}
diff --git a/src/mesa/pipe/cell/spu/spu_vertex_shader.h b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
new file mode 100644
index 0000000000..c96b93ff0a
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_vertex_shader.h
@@ -0,0 +1,61 @@
+#ifndef SPU_VERTEX_SHADER_H
+#define SPU_VERTEX_SHADER_H
+
+#include "pipe/p_format.h"
+#include "spu_exec.h"
+
+struct spu_vs_context;
+
+typedef qword (*spu_fetch_func)(const void *ptr);
+typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw,
+				     struct spu_exec_machine *machine,
+				     const unsigned *elts,
+				     unsigned count );
+
+struct spu_vs_context {
+   struct pipe_viewport_state viewport;
+
+   struct {
+      uint64_t src_ptr[PIPE_ATTRIB_MAX];
+      unsigned pitch[PIPE_ATTRIB_MAX];
+      enum pipe_format format[PIPE_ATTRIB_MAX];
+      unsigned nr_attrs;
+      boolean dirty;
+
+      spu_fetch_func fetch[PIPE_ATTRIB_MAX];
+      spu_full_fetch_func fetch_func;
+   } vertex_fetch;
+   
+   /* Clip derived state:
+    */
+   float plane[12][4];
+   unsigned nr_planes;
+
+   struct spu_exec_machine machine;
+   const float (*constants)[4];
+
+   unsigned num_vs_outputs;
+};
+
+extern void spu_update_vertex_fetch(struct spu_vs_context *draw);
+
+static INLINE void spu_vertex_fetch(struct spu_vs_context *draw,
+				    struct spu_exec_machine *machine,
+				    const unsigned *elts,
+				    unsigned count)
+{
+   if (draw->vertex_fetch.dirty) {
+      spu_update_vertex_fetch(draw);
+      draw->vertex_fetch.dirty = 0;
+   }
+   
+   (*draw->vertex_fetch.fetch_func)(draw, machine, elts, count);
+}
+
+struct cell_command_vs;
+
+extern void
+spu_execute_vertex_shader(struct spu_vs_context *draw,
+			  const struct cell_command_vs *vs);
+
+#endif /* SPU_VERTEX_SHADER_H */
diff --git a/src/mesa/pipe/cell/spu/spu_ztest.h b/src/mesa/pipe/cell/spu/spu_ztest.h
new file mode 100644
index 0000000000..ce8ad00339
--- /dev/null
+++ b/src/mesa/pipe/cell/spu/spu_ztest.h
@@ -0,0 +1,135 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+/**
+ * Zbuffer/depth test code.
+ */
+
+
+#ifndef SPU_ZTEST_H
+#define SPU_ZTEST_H
+
+
+#ifdef __SPU__
+#include <spu_intrinsics.h>
+#endif
+
+
+
+/**
+ * Perform Z testing for a 16-bit/value Z buffer.
+ *
+ * \param zvals  vector of four fragment zvalues as floats
+ * \param zbuf   ptr to vector of ushort[8] zbuffer values.  Note that this
+ *               contains the Z values for 2 quads, 8 pixels.
+ * \param x      x coordinate of quad (only lsbit is significant)
+ * \param inMask indicates which fragments in the quad are alive
+ * \return new mask indicating which fragments are alive after ztest
+ */
+static INLINE vector unsigned int
+spu_z16_test_less(vector float zvals, vector unsigned short *zbuf,
+                  uint x, vector unsigned int inMask)
+{
+#define ZERO 0x80
+   vector unsigned int zvals_ui4, zbuf_ui4, mask;
+
+   /* convert floats to uints in [0, 65535] */
+   zvals_ui4 = spu_convtu(zvals, 32); /* convert to [0, 2^32] */
+   zvals_ui4 = spu_rlmask(zvals_ui4, -16);  /* right shift 16 */
+
+   /* XXX this conditional could be removed with a bit of work */
+   if (x & 1) {
+      /* convert zbuffer values from ushorts to uints */
+      /* gather lower four ushorts */
+      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
+                             (vector unsigned int) *zbuf,
+                             ((vector unsigned char) {
+                                ZERO, ZERO,  8,  9, ZERO, ZERO, 10, 11,
+                                ZERO, ZERO, 12, 13, ZERO, ZERO, 14, 15}));
+      /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
+      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
+      /* mask &= inMask */
+      mask = spu_and(mask, inMask);
+      /* zbuf = mask ? zval : zbuf */
+      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
+      /* convert zbuffer values from uints back to ushorts, preserve lower 4 */
+      *zbuf = (vector unsigned short)
+         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
+                     ((vector unsigned char) {
+                        16, 17, 18, 19, 20, 21, 22, 23,
+                        2, 3, 6, 7, 10, 11, 14, 15}));
+   }
+   else {
+      /* convert zbuffer values from ushorts to uints */
+      /* gather upper four ushorts */
+      zbuf_ui4 = spu_shuffle((vector unsigned int) *zbuf,
+                             (vector unsigned int) *zbuf,
+                             ((vector unsigned char) {
+                                ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
+                                ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7}));
+      /* mask = (zbuf_ui4 < zvals_ui4) ? ~0 : 0 */
+      mask = spu_cmpgt(zbuf_ui4, zvals_ui4);
+      /* mask &= inMask */
+      mask = spu_and(mask, inMask);
+      /* zbuf = mask ? zval : zbuf */
+      zbuf_ui4 = spu_sel(zbuf_ui4, zvals_ui4, mask);
+      /* convert zbuffer values from uints back to ushorts, preserve upper 4 */
+      *zbuf = (vector unsigned short)
+         spu_shuffle(zbuf_ui4, (vector unsigned int) *zbuf,
+                     ((vector unsigned char) {
+                        2, 3, 6, 7, 10, 11, 14, 15,
+                        24, 25, 26, 27, 28, 29, 30, 31}));
+   }
+   return mask;
+#undef ZERO
+}
+
+
+/**
+ * As above, but Zbuffer values as 32-bit uints
+ */
+static INLINE vector unsigned int
+spu_z32_test_less(vector float zvals, vector unsigned int *zbuf_ptr,
+                  vector unsigned int inMask)
+{
+   vector unsigned int zvals_ui4, mask, zbuf = *zbuf_ptr;
+
+   /* convert floats to uints in [0, 0xffffffff] */
+   zvals_ui4 = spu_convtu(zvals, 32);
+   /* mask = (zbuf < zvals_ui4) ? ~0 : 0 */
+   mask = spu_cmpgt(zbuf, zvals_ui4);
+   /* mask &= inMask */
+   mask = spu_and(mask, inMask);
+   /* zbuf = mask ? zval : zbuf */
+   *zbuf_ptr = spu_sel(zbuf, zvals_ui4, mask);
+
+   return mask;
+}
+
+
+#endif /* SPU_ZTEST_H */
diff --git a/src/mesa/pipe/draw/Makefile b/src/mesa/pipe/draw/Makefile
new file mode 100644
index 0000000000..451911a354
--- /dev/null
+++ b/src/mesa/pipe/draw/Makefile
@@ -0,0 +1,2 @@
+default:
+	cd .. ; make
diff --git a/src/mesa/pipe/draw/draw_clip.c b/src/mesa/pipe/draw/draw_clip.c
index 2d410e3244..61130c5600 100644
--- a/src/mesa/pipe/draw/draw_clip.c
+++ b/src/mesa/pipe/draw/draw_clip.c
@@ -33,6 +33,8 @@
 
 
 #include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+
 #include "draw_context.h"
 #include "draw_private.h"
 
@@ -54,6 +56,12 @@
 struct clipper {
    struct draw_stage stage;      /**< base class */
 
+   /* Basically duplicate some of the flatshading logic here:
+    */
+   boolean flat;
+   uint num_color_attribs;
+   uint color_attribs[4];  /* front/back primary/secondary colors */
+
    float (*plane)[4];
 };
 
@@ -82,6 +90,17 @@ static void interp_attr( float *fdst,
    fdst[3] = LINTERP( t, fout[3], fin[3] );
 }
 
+static void copy_colors( struct draw_stage *stage,
+			 struct vertex_header *dst,
+			 const struct vertex_header *src )
+{
+   const struct clipper *clipper = clipper_stage(stage);
+   uint i;
+   for (i = 0; i < clipper->num_color_attribs; i++) {
+      const uint attr = clipper->color_attribs[i];
+      COPY_4FV(dst->data[attr], src->data[attr]);
+   }
+}
 
 
 
@@ -134,27 +153,11 @@ static void interp( const struct clipper *clip,
    }
 }
 
-#if 0   
-static INLINE void do_tri( struct draw_stage *next,
-			   struct prim_header *header )
-{
-   unsigned i;
-   for (i = 0; i < 3; i++) {
-      float *ndc = header->v[i]->data[0];
-      _mesa_printf("ndc %f %f %f\n", ndc[0], ndc[1], ndc[2]);
-      assert(ndc[0] >= -1 && ndc[0] <= 641);
-      assert(ndc[1] >= 30 && ndc[1] <= 481);
-   }
-   _mesa_printf("\n");
-   next->tri(next, header);
-}
-#endif
-
 
 static void emit_poly( struct draw_stage *stage,
 		       struct vertex_header **inlist,
 		       unsigned n,
-                       const struct prim_header *origPrim)
+		       const struct prim_header *origPrim)
 {
    struct prim_header header;
    unsigned i;
@@ -163,16 +166,16 @@ static void emit_poly( struct draw_stage *stage,
    header.det = origPrim->det;
 
    for (i = 2; i < n; i++) {
-      header.v[0] = inlist[0];
-      header.v[1] = inlist[i-1];
-      header.v[2] = inlist[i];
+      header.v[0] = inlist[i-1];
+      header.v[1] = inlist[i];
+      header.v[2] = inlist[0];	/* keep in v[2] for flatshading */
 	
       {
-	 unsigned tmp0 = header.v[0]->edgeflag;
+	 unsigned tmp1 = header.v[1]->edgeflag;
 	 unsigned tmp2 = header.v[2]->edgeflag;
 
-	 if (i != 2)   header.v[0]->edgeflag = 0;
-	 if (i != n-1) header.v[2]->edgeflag = 0;
+	 if (i != n-1) header.v[1]->edgeflag = 0;
+	 if (i != 2)   header.v[2]->edgeflag = 0;
 
          header.edgeflags = ((header.v[0]->edgeflag << 0) | 
                              (header.v[1]->edgeflag << 1) | 
@@ -180,27 +183,13 @@ static void emit_poly( struct draw_stage *stage,
 
 	 stage->next->tri( stage->next, &header );
 
-	 header.v[0]->edgeflag = tmp0;
+	 header.v[1]->edgeflag = tmp1;
 	 header.v[2]->edgeflag = tmp2;
       }
    }
 }
 
 
-#if 0
-static void emit_poly( struct draw_stage *stage )
-{
-   unsigned i;
-
-   for (i = 2; i < n; i++) {
-      header->v[0] = inlist[0];
-      header->v[1] = inlist[i-1];
-      header->v[2] = inlist[i];
-	 
-      stage->next->tri( stage->next, header );
-   }
-}
-#endif
 
 
 /* Clip a triangle against the viewport and user clip planes.
@@ -281,6 +270,18 @@ do_clip_tri( struct draw_stage *stage,
       }
    }
 
+   /* If flat-shading, copy color to new provoking vertex.
+    */
+   if (clipper->flat && inlist[0] != header->v[2]) {
+      if (1) {
+	 inlist[0] = dup_vert(stage, inlist[0], tmpnr++);
+      }
+
+      copy_colors(stage, inlist[0], header->v[2]);
+   }
+
+
+
    /* Emit the polygon as triangles to the setup stage:
     */
    if (n >= 3)
@@ -328,6 +329,10 @@ do_clip_line( struct draw_stage *stage,
 
    if (v0->clipmask) {
       interp( clipper, stage->tmp[0], t0, v0, v1 );
+
+      if (clipper->flat)
+	 copy_colors(stage, stage->tmp[0], v0);
+
       newprim.v[0] = stage->tmp[0];
    }
    else {
@@ -393,8 +398,55 @@ clip_tri( struct draw_stage *stage,
    }
 }
 
-static void clip_flush( struct draw_stage *stage, unsigned flags )
+/* Update state.  Could further delay this until we hit the first
+ * primitive that really requires clipping.
+ */
+static void 
+clip_init_state( struct draw_stage *stage )
+{
+   struct clipper *clipper = clipper_stage( stage );
+
+   clipper->flat = stage->draw->rasterizer->flatshade ? TRUE : FALSE;
+
+   if (clipper->flat) {
+      const struct pipe_shader_state *vs = stage->draw->vertex_shader->state;
+      uint i;
+
+      clipper->num_color_attribs = 0;
+      for (i = 0; i < vs->num_outputs; i++) {
+	 if (vs->output_semantic_name[i] == TGSI_SEMANTIC_COLOR ||
+	     vs->output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) {
+	    clipper->color_attribs[clipper->num_color_attribs++] = i;
+	 }
+      }
+   }
+   
+   stage->tri = clip_tri;
+   stage->line = clip_line;
+}
+
+
+
+static void clip_first_tri( struct draw_stage *stage,
+			    struct prim_header *header )
+{
+   clip_init_state( stage );
+   stage->tri( stage, header );
+}
+
+static void clip_first_line( struct draw_stage *stage,
+			     struct prim_header *header )
+{
+   clip_init_state( stage );
+   stage->line( stage, header );
+}
+
+
+static void clip_flush( struct draw_stage *stage, 
+			     unsigned flags )
 {
+   stage->tri = clip_first_tri;
+   stage->line = clip_first_line;
    stage->next->flush( stage->next, flags );
 }
 
@@ -420,12 +472,12 @@ struct draw_stage *draw_clip_stage( struct draw_context *draw )
 {
    struct clipper *clipper = CALLOC_STRUCT(clipper);
 
-   draw_alloc_tmps( &clipper->stage, MAX_CLIPPED_VERTICES );
+   draw_alloc_tmps( &clipper->stage, MAX_CLIPPED_VERTICES+1 );
 
    clipper->stage.draw = draw;
    clipper->stage.point = clip_point;
-   clipper->stage.line = clip_line;
-   clipper->stage.tri = clip_tri;
+   clipper->stage.line = clip_first_line;
+   clipper->stage.tri = clip_first_tri;
    clipper->stage.flush = clip_flush;
    clipper->stage.reset_stipple_counter = clip_reset_stipple_counter;
    clipper->stage.destroy = clip_destroy;
diff --git a/src/mesa/pipe/draw/draw_context.c b/src/mesa/pipe/draw/draw_context.c
index e8ca1f035b..b15f57c824 100644
--- a/src/mesa/pipe/draw/draw_context.c
+++ b/src/mesa/pipe/draw/draw_context.c
@@ -71,12 +71,15 @@ struct draw_context *draw_create( void )
     */
    {
       uint i;
-      char *tmp = (char*) MALLOC( Elements(draw->vcache.vertex) * MAX_VERTEX_SIZE );
+      const unsigned size = (MAX_VERTEX_SIZE + 0x0f) & ~0x0f;
+      char *tmp = align_malloc(Elements(draw->vcache.vertex) * size, 16);
 
       for (i = 0; i < Elements(draw->vcache.vertex); i++)
-	 draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * MAX_VERTEX_SIZE);
+	 draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * size);
    }
 
+   draw->shader_queue_flush = draw_vertex_shader_queue_flush;
+
    draw->convert_wide_points = TRUE;
    draw->convert_wide_lines = TRUE;
 
@@ -103,7 +106,7 @@ void draw_destroy( struct draw_context *draw )
    if (draw->pipeline.rasterize)
       draw->pipeline.rasterize->destroy( draw->pipeline.rasterize );
    tgsi_exec_machine_free_data(&draw->machine);
-   FREE( draw->vcache.vertex[0] ); /* Frees all the vertices. */
+   align_free( draw->vcache.vertex[0] ); /* Frees all the vertices. */
    FREE( draw );
 }
 
diff --git a/src/mesa/pipe/draw/draw_prim.c b/src/mesa/pipe/draw/draw_prim.c
index 243381aec0..51e2242719 100644
--- a/src/mesa/pipe/draw/draw_prim.c
+++ b/src/mesa/pipe/draw/draw_prim.c
@@ -30,6 +30,8 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
+#include "pipe/p_debug.h"
+
 #include "draw_private.h"
 #include "draw_context.h"
 
@@ -60,38 +62,55 @@ static void draw_prim_queue_flush( struct draw_context *draw )
    unsigned i;
 
    if (0)
-      fprintf(stdout,"Flushing with %d prims, %d verts\n",
-             draw->pq.queue_nr, draw->vs.queue_nr);
+      debug_printf("Flushing with %d prims, %d verts\n",
+                   draw->pq.queue_nr, draw->vs.queue_nr);
 
-   if (draw->pq.queue_nr == 0)
-      return;
+   assert (draw->pq.queue_nr != 0);
 
    /* NOTE: we cannot save draw->pipeline->first in a local var because
     * draw->pipeline->first is often changed by the first call to tri(),
     * line(), etc.
     */
-   switch (draw->reduced_prim) {
-   case RP_TRI:
-      for (i = 0; i < draw->pq.queue_nr; i++) {
-	 if (draw->pq.queue[i].reset_line_stipple)
-	    draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-
-	 draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+   if (draw->rasterizer->line_stipple_enable) {
+      switch (draw->reduced_prim) {
+      case RP_TRI:
+	 for (i = 0; i < draw->pq.queue_nr; i++) {
+	    if (draw->pq.queue[i].reset_line_stipple)
+	       draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	    
+	    draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+	 }
+	 break;
+      case RP_LINE:
+	 for (i = 0; i < draw->pq.queue_nr; i++) {
+	    if (draw->pq.queue[i].reset_line_stipple)
+	       draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	    
+	    draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+	 }
+	 break;
+      case RP_POINT:
+	 draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	 for (i = 0; i < draw->pq.queue_nr; i++)
+	    draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
       }
-      break;
-   case RP_LINE:
-      for (i = 0; i < draw->pq.queue_nr; i++) {
-	 if (draw->pq.queue[i].reset_line_stipple)
-	    draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-
-	 draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+   }
+   else {
+      switch (draw->reduced_prim) {
+      case RP_TRI:
+	 for (i = 0; i < draw->pq.queue_nr; i++) 
+	    draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
+      case RP_LINE:
+	 for (i = 0; i < draw->pq.queue_nr; i++) 
+	    draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
+      case RP_POINT:
+	 for (i = 0; i < draw->pq.queue_nr; i++)
+	    draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
       }
-      break;
-   case RP_POINT:
-      draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-      for (i = 0; i < draw->pq.queue_nr; i++)
-	 draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
-      break;
    }
 
    draw->pq.queue_nr = 0;   
@@ -103,16 +122,18 @@ static void draw_prim_queue_flush( struct draw_context *draw )
 void draw_do_flush( struct draw_context *draw, unsigned flags )
 {
    if (0)
-      fprintf(stdout,"Flushing with %d verts, %d prims\n",
-	      draw->vs.queue_nr,
-	      draw->pq.queue_nr );
+      debug_printf("Flushing with %d verts, %d prims\n",
+                   draw->vs.queue_nr,
+                   draw->pq.queue_nr );
 
 
    if (flags >= DRAW_FLUSH_SHADER_QUEUE) {
-      draw_vertex_shader_queue_flush(draw);
+      if (draw->vs.queue_nr)
+         (*draw->shader_queue_flush)(draw);
 
       if (flags >= DRAW_FLUSH_PRIM_QUEUE) {
-         draw_prim_queue_flush(draw);
+	 if (draw->pq.queue_nr)
+	    draw_prim_queue_flush(draw);
 
 	 if (flags >= DRAW_FLUSH_VERTEX_CACHE) {
             draw_vertex_cache_invalidate(draw);
@@ -138,11 +159,11 @@ static struct prim_header *get_queued_prim( struct draw_context *draw,
 					    unsigned nr_verts )
 {
    if (!draw_vertex_cache_check_space( draw, nr_verts )) {
-//      fprintf(stderr, "v");
+//      debug_printf("v");
       draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE );
    }
    else if (draw->pq.queue_nr == PRIM_QUEUE_LENGTH) {
-//      fprintf(stderr, "p");
+//      debug_printf("p");
       draw_do_flush( draw, DRAW_FLUSH_PRIM_QUEUE );
    }
 
@@ -230,7 +251,7 @@ static void do_ef_triangle( struct draw_context *draw,
 }
 
 
-static void do_quad( struct draw_context *draw,
+static void do_ef_quad( struct draw_context *draw,
 		     unsigned v0,
 		     unsigned v1,
 		     unsigned v2,
@@ -242,6 +263,16 @@ static void do_quad( struct draw_context *draw,
    do_ef_triangle( draw, 0, omitEdge3, v1, v2, v3 );
 }
 
+static void do_quad( struct draw_context *draw,
+		     unsigned v0,
+		     unsigned v1,
+		     unsigned v2,
+		     unsigned v3 )
+{
+   do_triangle( draw, v0, v1, v3 );
+   do_triangle( draw, v1, v2, v3 );
+}
+
 
 /**
  * Main entrypoint to draw some number of points/lines/triangles
@@ -251,8 +282,10 @@ draw_prim( struct draw_context *draw,
 	   unsigned prim, unsigned start, unsigned count )
 {
    unsigned i;
+   boolean unfilled = (draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
+		       draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL);
 
-//   _mesa_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count );
+//   debug_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count );
 
    switch (prim) {
    case PIPE_PRIM_POINTS:
@@ -288,24 +321,32 @@ draw_prim( struct draw_context *draw,
       break;
 
    case PIPE_PRIM_LINE_STRIP:
-      if (count >= 2) {
-	 for (i = 1; i < count; i++) {
-	    do_line( draw,
-		     i == 1,
-		     start + i - 1,
-		     start + i );
-	 }
+      for (i = 1; i < count; i++) {
+	 do_line( draw,
+		  i == 1,
+		  start + i - 1,
+		  start + i );
       }
       break;
 
    case PIPE_PRIM_TRIANGLES:
-      for (i = 0; i+2 < count; i += 3) {
-	 do_ef_triangle( draw,
-			 1, 
-			 ~0,
+      if (unfilled) {
+	 for (i = 0; i+2 < count; i += 3) {
+	    do_ef_triangle( draw,
+			    1, 
+			    ~0,
+			    start + i + 0,
+			    start + i + 1,
+			    start + i + 2 );
+	 }
+      } 
+      else {
+	 for (i = 0; i+2 < count; i += 3) {
+	    do_triangle( draw,
 			 start + i + 0,
 			 start + i + 1,
 			 start + i + 2 );
+	 }
       }
       break;
 
@@ -339,27 +380,49 @@ draw_prim( struct draw_context *draw,
 
 
    case PIPE_PRIM_QUADS:
-      for (i = 0; i+3 < count; i += 4) {
-	 do_quad( draw,
-		  start + i + 0,
-		  start + i + 1,
-		  start + i + 2,
-		  start + i + 3);
+      if (unfilled) {
+	 for (i = 0; i+3 < count; i += 4) {
+	    do_ef_quad( draw,
+			start + i + 0,
+			start + i + 1,
+			start + i + 2,
+			start + i + 3);
+	 }
+      }
+      else {
+	 for (i = 0; i+3 < count; i += 4) {
+	    do_quad( draw,
+		     start + i + 0,
+		     start + i + 1,
+		     start + i + 2,
+		     start + i + 3);
+	 }
       }
       break;
 
    case PIPE_PRIM_QUAD_STRIP:
-      for (i = 0; i+3 < count; i += 2) {
-	 do_quad( draw,
-		  start + i + 2,
-		  start + i + 0,
-		  start + i + 1,
-		  start + i + 3);
+      if (unfilled) {
+	 for (i = 0; i+3 < count; i += 2) {
+	    do_ef_quad( draw,
+			start + i + 2,
+			start + i + 0,
+			start + i + 1,
+			start + i + 3);
+	 }
+      }
+      else {
+	 for (i = 0; i+3 < count; i += 2) {
+	    do_quad( draw,
+		     start + i + 2,
+		     start + i + 0,
+		     start + i + 1,
+		     start + i + 3);
+	 }
       }
       break;
 
    case PIPE_PRIM_POLYGON:
-      if (count >= 3) {
+      if (unfilled) {
 	 unsigned ef_mask = (1<<2) | (1<<0);
 
 	 for (i = 0; i+2 < count; i++) {
@@ -377,6 +440,14 @@ draw_prim( struct draw_context *draw,
 	    ef_mask &= ~(1<<2);
 	 }
       }
+      else {
+	 for (i = 0; i+2 < count; i++) {
+	    do_triangle( draw,
+			 start + i + 1,
+			 start + i + 2,
+			 start + 0);
+	 }
+      }
       break;
 
    default:
diff --git a/src/mesa/pipe/draw/draw_private.h b/src/mesa/pipe/draw/draw_private.h
index 1e59f5bd8d..7782db0477 100644
--- a/src/mesa/pipe/draw/draw_private.h
+++ b/src/mesa/pipe/draw/draw_private.h
@@ -141,6 +141,10 @@ struct draw_vertex_shader {
 /* Internal function for vertex fetch.
  */
 typedef void (*fetch_func)(const void *ptr, float *attrib);
+typedef void (*full_fetch_func)( struct draw_context *draw,
+				 struct tgsi_exec_machine *machine,
+				 const unsigned *elts,
+				 unsigned count );
 
 
 
@@ -210,6 +214,7 @@ struct draw_context
       unsigned pitch[PIPE_ATTRIB_MAX];
       fetch_func fetch[PIPE_ATTRIB_MAX];
       unsigned nr_attrs;
+      full_fetch_func fetch_func;
    } vertex_fetch;
 
    /* Post-tnl vertex cache:
@@ -235,6 +240,11 @@ struct draw_context
       unsigned queue_nr;
    } vs;
 
+   /**
+    * Run the vertex shader on all vertices in the vertex queue.
+    */
+   void (*shader_queue_flush)(struct draw_context *draw);
+
    /* Prim pipeline queue:
     */
    struct {
@@ -249,6 +259,8 @@ struct draw_context
 #ifdef MESA_LLVM
    struct gallivm_cpu_engine *engine;
 #endif
+   
+   void *driver_private;
 };
 
 
@@ -287,10 +299,6 @@ extern void draw_vertex_shader_queue_flush_llvm( struct draw_context *draw );
 struct tgsi_exec_machine;
 
 extern void draw_update_vertex_fetch( struct draw_context *draw );
-extern void draw_vertex_fetch( struct draw_context *draw,
-			       struct tgsi_exec_machine *machine,
-			       const unsigned *elts,
-			       unsigned count );
 
 
 #define DRAW_FLUSH_SHADER_QUEUE              0x1 /* sized not to overflow, never raised */
diff --git a/src/mesa/pipe/draw/draw_validate.c b/src/mesa/pipe/draw/draw_validate.c
index 86d5a5f814..4375ebabbc 100644
--- a/src/mesa/pipe/draw/draw_validate.c
+++ b/src/mesa/pipe/draw/draw_validate.c
@@ -78,6 +78,11 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
       precalc_flat = 1;		/* only needed for triangles really */
       need_det = 1;
    }
+
+   if (draw->rasterizer->flatshade && precalc_flat) {
+      draw->pipeline.flatshade->next = next;
+      next = draw->pipeline.flatshade;
+   }
 	 
    if (draw->rasterizer->offset_cw ||
        draw->rasterizer->offset_ccw) {
@@ -110,13 +115,8 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
    {
       draw->pipeline.clip->next = next;
       next = draw->pipeline.clip;
-      precalc_flat = 1;		/* XXX: FIX ME! Only needed for clipped prims */
    }
 
-   if (draw->rasterizer->flatshade && precalc_flat) {
-      draw->pipeline.flatshade->next = next;
-      next = draw->pipeline.flatshade;
-   }
    
    draw->pipeline.first = next;
    return next;
diff --git a/src/mesa/pipe/draw/draw_vbuf.c b/src/mesa/pipe/draw/draw_vbuf.c
index 1e260c6156..be96c8fdeb 100644
--- a/src/mesa/pipe/draw/draw_vbuf.c
+++ b/src/mesa/pipe/draw/draw_vbuf.c
@@ -34,13 +34,14 @@
  */
 
 
-#include <assert.h>
-
-#include "pipe/draw/draw_vbuf.h"
-#include "pipe/draw/draw_private.h"
-#include "pipe/draw/draw_vertex.h"
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 
+#include "draw_vbuf.h"
+#include "draw_private.h"
+#include "draw_vertex.h"
+#include "draw_vf.h"
+
 
 /**
  * Vertex buffer emit stage.
@@ -55,6 +56,8 @@ struct vbuf_stage {
    /** Vertex size in bytes */
    unsigned vertex_size;
 
+   struct draw_vertex_fetch *vf;
+   
    /* FIXME: we have no guarantee that 'unsigned' is 32bit */
 
    /** Vertices in hardware format */
@@ -110,88 +113,175 @@ check_space( struct vbuf_stage *vbuf, unsigned nr )
 }
 
 
-/**
- * Extract the needed fields from post-transformed vertex and emit
- * a hardware(driver) vertex.
- * Recall that the vertices are constructed by the 'draw' module and
- * have a couple of slots at the beginning (1-dword header, 4-dword
- * clip pos) that we ignore here.  We only use the vertex->data[] fields.
- */
-static INLINE void 
-emit_vertex( struct vbuf_stage *vbuf,
-             struct vertex_header *vertex )
+#if 0
+static INLINE void
+dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
 {
-   const struct vertex_info *vinfo = vbuf->vinfo;
-
-   uint i;
-   uint count = 0;  /* for debug/sanity */
-   
    assert(vinfo == vbuf->render->get_vertex_info(vbuf->render));
-
-//   fprintf(stderr, "emit vertex %d to %p\n", 
-//           vbuf->nr_vertices, vbuf->vertex_ptr);
-
-   if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
-      if(vertex->vertex_id < vbuf->nr_vertices)
-	 return;
-      else
-	 fprintf(stderr, "Bad vertex id 0x%04x (>= 0x%04x)\n", 
-	         vertex->vertex_id, vbuf->nr_vertices);
-      return;
-   }
-      
-   vertex->vertex_id = vbuf->nr_vertices++;
+   unsigned i, j, k;
 
    for (i = 0; i < vinfo->num_attribs; i++) {
-      uint j = vinfo->src_index[i];
+      j = vinfo->src_index[i];
       switch (vinfo->emit[i]) {
       case EMIT_OMIT:
-         /* no-op */
+         debug_printf("EMIT_OMIT:");
          break;
       case EMIT_ALL:
-         /* just copy the whole vertex as-is to the vbuf */
          assert(i == 0);
-         memcpy(vbuf->vertex_ptr, vertex, vinfo->size * 4);
-         vbuf->vertex_ptr += vinfo->size;
-         return;
+         assert(j == 0);
+         debug_printf("EMIT_ALL:\t");
+         for(k = 0; k < vinfo->size*4; ++k)
+            debug_printf("%02x ", *data++);
+         break;
       case EMIT_1F:
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-         count++;
+         debug_printf("EMIT_1F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          break;
       case EMIT_1F_PSIZE:
-         *vbuf->vertex_ptr++ = fui(vbuf->stage.draw->rasterizer->point_size);
-         count++;
+         debug_printf("EMIT_1F_PSIZE:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          break;
       case EMIT_2F:
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
-         count += 2;
+         debug_printf("EMIT_2F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          break;
       case EMIT_3F:
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
-         count += 3;
+         debug_printf("EMIT_3F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         data += sizeof(float);
          break;
       case EMIT_4F:
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
-         *vbuf->vertex_ptr++ = fui(vertex->data[j][3]);
-         count += 4;
+         debug_printf("EMIT_4F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
          break;
       case EMIT_4UB:
-	 *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
-                                        float_to_ubyte( vertex->data[j][1] ),
-                                        float_to_ubyte( vertex->data[j][0] ),
-                                        float_to_ubyte( vertex->data[j][3] ));
-         count += 1;
+         debug_printf("EMIT_4UB:\t");
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
          break;
       default:
          assert(0);
       }
+      debug_printf("\n");
+   }
+   debug_printf("\n");
+}
+#endif
+
+
+/**
+ * Extract the needed fields from post-transformed vertex and emit
+ * a hardware(driver) vertex.
+ * Recall that the vertices are constructed by the 'draw' module and
+ * have a couple of slots at the beginning (1-dword header, 4-dword
+ * clip pos) that we ignore here.  We only use the vertex->data[] fields.
+ */
+static INLINE void 
+emit_vertex( struct vbuf_stage *vbuf,
+             struct vertex_header *vertex )
+{
+#if 0
+   debug_printf("emit vertex %d to %p\n", 
+           vbuf->nr_vertices, vbuf->vertex_ptr);
+#endif
+
+   if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
+      if(vertex->vertex_id < vbuf->nr_vertices)
+	 return;
+      else
+	 debug_printf("Bad vertex id 0x%04x (>= 0x%04x)\n", 
+	         vertex->vertex_id, vbuf->nr_vertices);
+      return;
+   }
+      
+   vertex->vertex_id = vbuf->nr_vertices++;
+
+   if(!vbuf->vf) {
+      const struct vertex_info *vinfo = vbuf->vinfo;
+      uint i;
+      uint count = 0;  /* for debug/sanity */
+      
+      assert(vinfo == vbuf->render->get_vertex_info(vbuf->render));
+
+      for (i = 0; i < vinfo->num_attribs; i++) {
+         uint j = vinfo->src_index[i];
+         switch (vinfo->emit[i]) {
+         case EMIT_OMIT:
+            /* no-op */
+            break;
+         case EMIT_ALL:
+            /* just copy the whole vertex as-is to the vbuf */
+            assert(i == 0);
+            assert(j == 0);
+            memcpy(vbuf->vertex_ptr, vertex, vinfo->size * 4);
+            vbuf->vertex_ptr += vinfo->size;
+            count += vinfo->size;
+            break;
+         case EMIT_1F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            count++;
+            break;
+         case EMIT_1F_PSIZE:
+            *vbuf->vertex_ptr++ = fui(vbuf->stage.draw->rasterizer->point_size);
+            count++;
+            break;
+         case EMIT_2F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            count += 2;
+            break;
+         case EMIT_3F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
+            count += 3;
+            break;
+         case EMIT_4F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][3]);
+            count += 4;
+            break;
+         case EMIT_4UB:
+            *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
+                                           float_to_ubyte( vertex->data[j][1] ),
+                                           float_to_ubyte( vertex->data[j][0] ),
+                                           float_to_ubyte( vertex->data[j][3] ));
+            count += 1;
+            break;
+         default:
+            assert(0);
+         }
+      }
+      assert(count == vinfo->size);
+#if 0
+      {
+	 static float data[256]; 
+	 draw_vf_emit_vertex(vbuf->vf, vertex, data);
+	 if(memcmp((uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size, data, vbuf->vertex_size)) {
+            debug_printf("With VF:\n");
+            dump_emitted_vertex(vbuf->vinfo, (uint8_t *)data);
+	    debug_printf("Without VF:\n");
+	    dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size);
+	    assert(0);
+	 }
+      }
+#endif
+   }
+   else {
+      draw_vf_emit_vertex(vbuf->vf, vertex, vbuf->vertex_ptr);
+   
+      vbuf->vertex_ptr += vbuf->vertex_size/4;
    }
-   assert(count == vinfo->size);
 }
 
 
@@ -269,6 +359,10 @@ vbuf_set_prim( struct vbuf_stage *vbuf, uint newprim )
 
    vbuf->vinfo = vinfo;
    vbuf->vertex_size = vertex_size;
+   if(vbuf->vf)
+      draw_vf_set_vertex_info(vbuf->vf, 
+                              vbuf->vinfo,
+                              vbuf->stage.draw->rasterizer->point_size);
    
    if (!vbuf->vertices)
       vbuf_alloc_vertices(vbuf);
@@ -423,7 +517,12 @@ static void vbuf_destroy( struct draw_stage *stage )
 {
    struct vbuf_stage *vbuf = vbuf_stage( stage );
 
-   align_free( vbuf->indices );
+   if(vbuf->indices)
+      align_free( vbuf->indices );
+   
+   if(vbuf->vf)
+      draw_vf_destroy( vbuf->vf );
+
    FREE( stage );
 }
 
@@ -436,6 +535,9 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
 {
    struct vbuf_stage *vbuf = CALLOC_STRUCT(vbuf_stage);
 
+   if(!vbuf)
+      return NULL;
+   
    vbuf->stage.draw = draw;
    vbuf->stage.point = vbuf_first_point;
    vbuf->stage.line = vbuf_first_line;
@@ -450,11 +552,16 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
    vbuf->max_indices = render->max_indices;
    vbuf->indices = (ushort *)
       align_malloc( vbuf->max_indices * sizeof(vbuf->indices[0]), 16 );
+   if(!vbuf->indices)
+      vbuf_destroy(&vbuf->stage);
    
    vbuf->vertices = NULL;
    vbuf->vertex_ptr = vbuf->vertices;
 
    vbuf->prim = ~0;
    
+   if(!GETENV("GALLIUM_NOVF"))
+      vbuf->vf = draw_vf_create();
+   
    return &vbuf->stage;
 }
diff --git a/src/mesa/pipe/draw/draw_vertex_fetch.c b/src/mesa/pipe/draw/draw_vertex_fetch.c
index fb64723a19..e13df04605 100644
--- a/src/mesa/pipe/draw/draw_vertex_fetch.c
+++ b/src/mesa/pipe/draw/draw_vertex_fetch.c
@@ -62,50 +62,244 @@ fetch_##NAME(const void *ptr, float *attrib)		\
    }							\
 }
 
+#define CVT_64_FLOAT   (float) ((double *) ptr)[i]
 #define CVT_32_FLOAT   ((float *) ptr)[i]
+
+#define CVT_8_USCALED  (float) ((unsigned char *) ptr)[i]
+#define CVT_16_USCALED (float) ((unsigned short *) ptr)[i]
+#define CVT_32_USCALED (float) ((unsigned int *) ptr)[i]
+
+#define CVT_8_SSCALED  (float) ((char *) ptr)[i]
+#define CVT_16_SSCALED (float) ((short *) ptr)[i]
 #define CVT_32_SSCALED (float) ((int *) ptr)[i]
+
 #define CVT_8_UNORM    (float) ((unsigned char *) ptr)[i] / 255.0f
+#define CVT_16_UNORM   (float) ((unsigned short *) ptr)[i] / 65535.0f
+#define CVT_32_UNORM   (float) ((unsigned int *) ptr)[i] / 4294967295.0f
+
+#define CVT_8_SNORM    (float) ((char *) ptr)[i] / 127.0f
+#define CVT_16_SNORM   (float) ((short *) ptr)[i] / 32767.0f
+#define CVT_32_SNORM   (float) ((int *) ptr)[i] / 2147483647.0f
+
+FETCH_ATTRIB( R64G64B64A64_FLOAT,   4, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64B64_FLOAT,      3, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64_FLOAT,         2, CVT_64_FLOAT )
+FETCH_ATTRIB( R64_FLOAT,            1, CVT_64_FLOAT )
 
 FETCH_ATTRIB( R32G32B32A32_FLOAT,   4, CVT_32_FLOAT )
 FETCH_ATTRIB( R32G32B32_FLOAT,      3, CVT_32_FLOAT )
 FETCH_ATTRIB( R32G32_FLOAT,         2, CVT_32_FLOAT )
 FETCH_ATTRIB( R32_FLOAT,            1, CVT_32_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_USCALED, 4, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32B32_USCALED,    3, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32_USCALED,       2, CVT_32_USCALED )
+FETCH_ATTRIB( R32_USCALED,          1, CVT_32_USCALED )
+
 FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED )
 FETCH_ATTRIB( R32G32B32_SSCALED,    3, CVT_32_SSCALED )
 FETCH_ATTRIB( R32G32_SSCALED,       2, CVT_32_SSCALED )
 FETCH_ATTRIB( R32_SSCALED,          1, CVT_32_SSCALED )
+
+FETCH_ATTRIB( R32G32B32A32_UNORM, 4, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32B32_UNORM,    3, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32_UNORM,       2, CVT_32_UNORM )
+FETCH_ATTRIB( R32_UNORM,          1, CVT_32_UNORM )
+
+FETCH_ATTRIB( R32G32B32A32_SNORM, 4, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32B32_SNORM,    3, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32_SNORM,       2, CVT_32_SNORM )
+FETCH_ATTRIB( R32_SNORM,          1, CVT_32_SNORM )
+
+FETCH_ATTRIB( R16G16B16A16_USCALED, 4, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16B16_USCALED,    3, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16_USCALED,       2, CVT_16_USCALED )
+FETCH_ATTRIB( R16_USCALED,          1, CVT_16_USCALED )
+
+FETCH_ATTRIB( R16G16B16A16_SSCALED, 4, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16B16_SSCALED,    3, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16_SSCALED,       2, CVT_16_SSCALED )
+FETCH_ATTRIB( R16_SSCALED,          1, CVT_16_SSCALED )
+
+FETCH_ATTRIB( R16G16B16A16_UNORM, 4, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16B16_UNORM,    3, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16_UNORM,       2, CVT_16_UNORM )
+FETCH_ATTRIB( R16_UNORM,          1, CVT_16_UNORM )
+
+FETCH_ATTRIB( R16G16B16A16_SNORM, 4, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16B16_SNORM,    3, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16_SNORM,       2, CVT_16_SNORM )
+FETCH_ATTRIB( R16_SNORM,          1, CVT_16_SNORM )
+
+FETCH_ATTRIB( R8G8B8A8_USCALED,   4, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8B8_USCALED,     3, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8_USCALED,       2, CVT_8_USCALED )
+FETCH_ATTRIB( R8_USCALED,         1, CVT_8_USCALED )
+
+FETCH_ATTRIB( R8G8B8A8_SSCALED,  4, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8B8_SSCALED,    3, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8_SSCALED,      2, CVT_8_SSCALED )
+FETCH_ATTRIB( R8_SSCALED,        1, CVT_8_SSCALED )
+
+FETCH_ATTRIB( R8G8B8A8_UNORM,  4, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8B8_UNORM,    3, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8_UNORM,      2, CVT_8_UNORM )
+FETCH_ATTRIB( R8_UNORM,        1, CVT_8_UNORM )
+
+FETCH_ATTRIB( R8G8B8A8_SNORM,  4, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8B8_SNORM,    3, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8_SNORM,      2, CVT_8_SNORM )
+FETCH_ATTRIB( R8_SNORM,        1, CVT_8_SNORM )
+
 FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
-FETCH_ATTRIB( R8G8B8A8_UNORM,       4, CVT_8_UNORM )
+//FETCH_ATTRIB( R8G8B8A8_UNORM,       4, CVT_8_UNORM )
 
 
 
 static fetch_func get_fetch_func( enum pipe_format format )
 {
+#if 0
+   {
+      char tmp[80];
+      pf_sprint_name(tmp, format);
+      debug_printf("%s: %s\n", __FUNCTION__, tmp);
+   }
+#endif
+
    switch (format) {
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      return fetch_R32G32B32A32_FLOAT;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      return fetch_R32G32B32_FLOAT;
-   case PIPE_FORMAT_R32G32_FLOAT:
-      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R64_FLOAT:
+      return fetch_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return fetch_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return fetch_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return fetch_R64G64B64A64_FLOAT;
+
    case PIPE_FORMAT_R32_FLOAT:
       return fetch_R32_FLOAT;
-   case PIPE_FORMAT_R32G32B32A32_SSCALED:
-      return fetch_R32G32B32A32_SSCALED;
-   case PIPE_FORMAT_R32G32B32_SSCALED:
-      return fetch_R32G32B32_SSCALED;
-   case PIPE_FORMAT_R32G32_SSCALED:
-      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return fetch_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return fetch_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return fetch_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return fetch_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return fetch_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return fetch_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return fetch_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return fetch_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return fetch_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return fetch_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return fetch_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return fetch_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return fetch_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return fetch_R32G32B32A32_SNORM;
+
    case PIPE_FORMAT_R32_SSCALED:
       return fetch_R32_SSCALED;
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return fetch_A8R8G8B8_UNORM;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return fetch_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return fetch_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return fetch_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return fetch_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return fetch_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return fetch_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return fetch_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return fetch_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return fetch_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return fetch_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return fetch_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return fetch_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return fetch_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return fetch_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return fetch_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return fetch_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return fetch_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return fetch_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return fetch_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return fetch_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return fetch_R8G8B8_UNORM;
    case PIPE_FORMAT_R8G8B8A8_UNORM:
       return fetch_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return fetch_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return fetch_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return fetch_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return fetch_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return fetch_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return fetch_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return fetch_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return fetch_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return fetch_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return fetch_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return fetch_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return fetch_R8G8B8A8_SSCALED;
+
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return fetch_A8R8G8B8_UNORM;
+
    case 0:
-      return NULL;
+      return NULL;		/* not sure why this is needed */
+
    default:
-      /* Lots of missing cases! */
       assert(0);
       return NULL;
    }
@@ -126,47 +320,108 @@ transpose_4x4( float *out, const float *in )
 }
 
 
-			       
-void draw_update_vertex_fetch( struct draw_context *draw )
+
+static void fetch_xyz_rgb( struct draw_context *draw,
+			   struct tgsi_exec_machine *machine,
+			   const unsigned *elts,
+			   unsigned count )
 {
-   unsigned nr_attrs, i;
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
 
-   /* this may happend during context init */
-   if (!draw->vertex_shader)
-      return;
+   assert(count <= 4);
 
-   nr_attrs = draw->vertex_shader->state->num_inputs;
+//   debug_printf("%s\n", __FUNCTION__);
 
-   for (i = 0; i < nr_attrs; i++) {
-      unsigned buf = draw->vertex_element[i].vertex_buffer_index;
-      enum pipe_format format  = draw->vertex_element[i].src_format;
+   /* loop over vertex attributes (vertex shader inputs)
+    */
 
-      draw->vertex_fetch.src_ptr[i] = (const ubyte *) draw->user.vbuffer[buf] + 
-						       draw->vertex_buffer[buf].buffer_offset + 
-						       draw->vertex_element[i].src_offset;
+   for (i = 0; i < 4; i++) {
+      {
+	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+	 float *out = &machine->Inputs[0].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
 
-      draw->vertex_fetch.pitch[i] = draw->vertex_buffer[buf].pitch;
-      draw->vertex_fetch.fetch[i] = get_fetch_func( format );
+      {
+	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
    }
+}
 
-   draw->vertex_fetch.nr_attrs = nr_attrs;
+
+
+
+static void fetch_xyz_rgb_st( struct draw_context *draw,
+			      struct tgsi_exec_machine *machine,
+			      const unsigned *elts,
+			      unsigned count )
+{
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
+
+   assert(count <= 4);
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+
+   for (i = 0; i < 4; i++) {
+      {
+	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+	 float *out = &machine->Inputs[0].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[2] + elts[i] * pitch[2]);
+	 float *out = &machine->Inputs[2].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = 0.0f;
+ 	 out[12] = 1.0f;
+      }
+   }
 }
 
 
+
+
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
-void draw_vertex_fetch( struct draw_context *draw,
-			struct tgsi_exec_machine *machine,
-			const unsigned *elts,
-			unsigned count )
+static void generic_vertex_fetch( struct draw_context *draw,
+				  struct tgsi_exec_machine *machine,
+				  const unsigned *elts,
+				  unsigned count )
 {
    unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
    unsigned attr;
 
    assert(count <= 4);
 
-//   _mesa_printf("%s %d\n", __FUNCTION__, count);
+//   debug_printf("%s %d\n", __FUNCTION__, count);
 
    /* loop over vertex attributes (vertex shader inputs)
     */
@@ -206,3 +461,50 @@ void draw_vertex_fetch( struct draw_context *draw,
    }
 }
 
+
+			       
+void draw_update_vertex_fetch( struct draw_context *draw )
+{
+   unsigned nr_attrs, i;
+
+//   debug_printf("%s\n", __FUNCTION__);
+   
+   /* this may happend during context init */
+   if (!draw->vertex_shader)
+      return;
+
+   nr_attrs = draw->vertex_shader->state->num_inputs;
+
+   for (i = 0; i < nr_attrs; i++) {
+      unsigned buf = draw->vertex_element[i].vertex_buffer_index;
+      enum pipe_format format  = draw->vertex_element[i].src_format;
+
+      draw->vertex_fetch.src_ptr[i] = (const ubyte *) draw->user.vbuffer[buf] + 
+						       draw->vertex_buffer[buf].buffer_offset + 
+						       draw->vertex_element[i].src_offset;
+
+      draw->vertex_fetch.pitch[i] = draw->vertex_buffer[buf].pitch;
+      draw->vertex_fetch.fetch[i] = get_fetch_func( format );
+   }
+
+   draw->vertex_fetch.nr_attrs = nr_attrs;
+
+   draw->vertex_fetch.fetch_func = generic_vertex_fetch;
+
+   switch (nr_attrs) {
+   case 2:
+      if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT)
+	 draw->vertex_fetch.fetch_func = fetch_xyz_rgb;
+      break;
+   case 3:
+      if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[2].src_format == PIPE_FORMAT_R32G32_FLOAT)
+	 draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
+      break;
+   default:
+      break;
+   }
+
+}
diff --git a/src/mesa/pipe/draw/draw_vertex_shader.c b/src/mesa/pipe/draw/draw_vertex_shader.c
index 3041974b9a..5ca93aa615 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader.c
@@ -110,7 +110,7 @@ run_vertex_program(struct draw_context *draw,
    machine->Inputs = ALIGN16_ASSIGN(inputs);
    machine->Outputs = ALIGN16_ASSIGN(outputs);
 
-   draw_vertex_fetch( draw, machine, elts, count );
+   draw->vertex_fetch.fetch_func( draw, machine, elts, count );
 
    /* run shader */
 #if defined(__i386__) || defined(__386__)
@@ -121,11 +121,16 @@ run_vertex_program(struct draw_context *draw,
          = (struct draw_vertex_shader *)draw->vertex_shader;
       codegen_function func
          = (codegen_function) x86_get_func( &shader->sse2_program );
-      func(
-         machine->Inputs,
-         machine->Outputs,
-         machine->Consts,
-         machine->Temps );
+
+      if (func)
+         func(
+            machine->Inputs,
+            machine->Outputs,
+            machine->Consts,
+            machine->Temps );
+      else
+         /* interpreter */
+         tgsi_exec_machine_run( machine );
    }
    else
 #endif
@@ -166,7 +171,7 @@ run_vertex_program(struct draw_context *draw,
       vOut[j]->data[0][3] = w;
 
 #if DBG_VS
-      printf("output[%d]win: %f %f %f %f\n", j,
+      debug_printf("output[%d]win: %f %f %f %f\n", j,
              vOut[j]->data[0][0],
              vOut[j]->data[0][1],
              vOut[j]->data[0][2],
@@ -181,7 +186,7 @@ run_vertex_program(struct draw_context *draw,
          vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
          vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
 #if DBG_VS
-         printf("output[%d][%d]: %f %f %f %f\n", j, slot,
+         debug_printf("output[%d][%d]: %f %f %f %f\n", j, slot,
                 vOut[j]->data[slot][0],
                 vOut[j]->data[slot][1],
                 vOut[j]->data[slot][2],
@@ -199,13 +204,15 @@ run_vertex_program(struct draw_context *draw,
 void
 draw_vertex_shader_queue_flush(struct draw_context *draw)
 {
-   unsigned i, j;
+   unsigned i;
+
+   assert(draw->vs.queue_nr != 0);
 
    /* XXX: do this on statechange: 
     */
    draw_update_vertex_fetch( draw );
 
-//   fprintf(stderr, " q(%d) ", draw->vs.queue_nr );
+//   debug_printf( " q(%d) ", draw->vs.queue_nr );
 #ifdef MESA_LLVM
    if (draw->vertex_shader->llvm_prog) {
       draw_vertex_shader_queue_flush_llvm(draw);
@@ -217,14 +224,18 @@ draw_vertex_shader_queue_flush(struct draw_context *draw)
    for (i = 0; i < draw->vs.queue_nr; i += 4) {
       struct vertex_header *dests[4];
       unsigned elts[4];
-      int n;
+      int j, n = MIN2(4, draw->vs.queue_nr - i);
 
-      for (j = 0; j < 4; j++) {
+      for (j = 0; j < n; j++) {
          elts[j] = draw->vs.queue[i + j].elt;
          dests[j] = draw->vs.queue[i + j].dest;
       }
 
-      n = MIN2(4, draw->vs.queue_nr - i);
+      for ( ; j < 4; j++) {
+	 elts[j] = elts[0];
+	 dests[j] = dests[0];
+      }
+
       assert(n > 0);
       assert(n <= 4);
 
@@ -263,7 +274,12 @@ draw_create_vertex_shader(struct draw_context *draw,
       struct pipe_shader_state *sh = (struct pipe_shader_state *) shader;
 
       x86_init_func( &vs->sse2_program );
-      tgsi_emit_sse2( (struct tgsi_token *) sh->tokens, &vs->sse2_program );
+      if (!tgsi_emit_sse2( (struct tgsi_token *) sh->tokens,
+                           &vs->sse2_program )) {
+         x86_release_func( (struct x86_function *) &vs->sse2_program );
+	 fprintf(stdout /*err*/,
+		 "tgsi_emit_sse2() failed, falling back to interpreter\n");
+      }
    }
 #endif
 
diff --git a/src/mesa/pipe/draw/draw_vertex_shader_llvm.c b/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
index 4228c4f388..63551c993e 100644
--- a/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
+++ b/src/mesa/pipe/draw/draw_vertex_shader_llvm.c
@@ -152,7 +152,7 @@ void draw_vertex_shader_queue_flush_llvm(struct draw_context *draw)
       z = vOut->clip[2] = dests[0][2];
       w = vOut->clip[3] = dests[0][3];
 #if DBG
-      printf("output %d: %f %f %f %f\n", 0, x, y, z, w);
+      debug_printf("output %d: %f %f %f %f\n", 0, x, y, z, w);
 #endif
 
       vOut->clipmask = compute_clipmask(vOut->clip, draw->plane, draw->nr_planes);
@@ -179,7 +179,7 @@ void draw_vertex_shader_queue_flush_llvm(struct draw_context *draw)
          vOut->data[slot][3] = dests[slot][3];
 
 #if DBG
-         printf("output %d: %f %f %f %f\n", slot,
+         debug_printf("output %d: %f %f %f %f\n", slot,
                 vOut->data[slot][0],
                 vOut->data[slot][1],
                 vOut->data[slot][2],
diff --git a/src/mesa/pipe/draw/draw_vf.c b/src/mesa/pipe/draw/draw_vf.c
new file mode 100644
index 0000000000..f23d7fcec5
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf.c
@@ -0,0 +1,428 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+
+#include <stddef.h>
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_util.h"
+
+#include "draw_vf.h"
+
+
+#define DRAW_VF_DBG 0
+
+
+/* TODO: remove this */
+extern void 
+_mesa_exec_free( void *addr );
+
+
+static boolean match_fastpath( struct draw_vertex_fetch *vf,
+				 const struct draw_vf_fastpath *fp)
+{
+   unsigned j;
+
+   if (vf->attr_count != fp->attr_count) 
+      return FALSE;
+
+   for (j = 0; j < vf->attr_count; j++) 
+      if (vf->attr[j].format != fp->attr[j].format ||
+	  vf->attr[j].inputsize != fp->attr[j].size ||
+	  vf->attr[j].vertoffset != fp->attr[j].offset) 
+	 return FALSE;
+      
+   if (fp->match_strides) {
+      if (vf->vertex_stride != fp->vertex_stride)
+	 return FALSE;
+
+      for (j = 0; j < vf->attr_count; j++) 
+	 if (vf->attr[j].inputstride != fp->attr[j].stride) 
+	    return FALSE;
+   }
+   
+   return TRUE;
+}
+
+static boolean search_fastpath_emit( struct draw_vertex_fetch *vf )
+{
+   struct draw_vf_fastpath *fp = vf->fastpath;
+
+   for ( ; fp ; fp = fp->next) {
+      if (match_fastpath(vf, fp)) {
+         vf->emit = fp->func;
+	 return TRUE;
+      }
+   }
+
+   return FALSE;
+}
+
+void draw_vf_register_fastpath( struct draw_vertex_fetch *vf,
+			     boolean match_strides )
+{
+   struct draw_vf_fastpath *fastpath = CALLOC_STRUCT(draw_vf_fastpath);
+   unsigned i;
+
+   fastpath->vertex_stride = vf->vertex_stride;
+   fastpath->attr_count = vf->attr_count;
+   fastpath->match_strides = match_strides;
+   fastpath->func = vf->emit;
+   fastpath->attr = (struct draw_vf_attr_type *)
+      MALLOC(vf->attr_count * sizeof(fastpath->attr[0]));
+
+   for (i = 0; i < vf->attr_count; i++) {
+      fastpath->attr[i].format = vf->attr[i].format;
+      fastpath->attr[i].stride = vf->attr[i].inputstride;
+      fastpath->attr[i].size = vf->attr[i].inputsize;
+      fastpath->attr[i].offset = vf->attr[i].vertoffset;
+   }
+
+   fastpath->next = vf->fastpath;
+   vf->fastpath = fastpath;
+}
+
+
+
+
+/***********************************************************************
+ * Build codegen functions or return generic ones:
+ */
+static void choose_emit_func( struct draw_vertex_fetch *vf, 
+			      unsigned count, 
+			      uint8_t *dest)
+{
+   vf->emit = NULL;
+   
+   /* Does this match an existing (hardwired, codegen or known-bad)
+    * fastpath?
+    */
+   if (search_fastpath_emit(vf)) {
+      /* Use this result.  If it is null, then it is already known
+       * that the current state will fail for codegen and there is no
+       * point trying again.
+       */
+   }
+   else if (vf->codegen_emit) {
+      vf->codegen_emit( vf );
+   }
+
+   if (!vf->emit) {
+      draw_vf_generate_hardwired_emit(vf);
+   }
+
+   /* Otherwise use the generic version:
+    */
+   if (!vf->emit)
+      vf->emit = draw_vf_generic_emit;
+
+   vf->emit( vf, count, dest );
+}
+
+
+
+
+
+/***********************************************************************
+ * Public entrypoints, mostly dispatch to the above:
+ */
+
+
+
+static unsigned 
+draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf, 
+                               const struct draw_vf_attr_map *map,
+                               unsigned nr, 
+                               unsigned vertex_stride )
+{
+   unsigned offset = 0;
+   unsigned i, j;
+
+   assert(nr < PIPE_ATTRIB_MAX);
+
+   for (j = 0, i = 0; i < nr; i++) {
+      const unsigned format = map[i].format;
+      if (format == DRAW_EMIT_PAD) {
+#if (DRAW_VF_DBG)
+	    debug_printf("%d: pad %d, offset %d\n", i,  
+			 map[i].offset, offset);  
+#endif
+
+	 offset += map[i].offset;
+
+      }
+      else {
+	 vf->attr[j].attrib = map[i].attrib;
+	 vf->attr[j].format = format;
+	 vf->attr[j].insert = draw_vf_format_info[format].insert;
+	 vf->attr[j].vertattrsize = draw_vf_format_info[format].attrsize;
+	 vf->attr[j].vertoffset = offset;
+	 vf->attr[j].isconst = draw_vf_format_info[format].isconst;
+	 if(vf->attr[j].isconst)
+	    memcpy(vf->attr[j].data, &map[i].data, vf->attr[j].vertattrsize);
+	 
+#if (DRAW_VF_DBG)
+	    debug_printf("%d: %s, offset %d\n", i,  
+			 draw_vf_format_info[format].name,
+			 vf->attr[j].vertoffset);   
+#endif
+
+	 offset += draw_vf_format_info[format].attrsize;
+	 j++;
+      }
+   }
+
+   vf->attr_count = j;
+   vf->vertex_stride = vertex_stride ? vertex_stride : offset;
+   vf->emit = choose_emit_func;
+
+   assert(vf->vertex_stride >= offset);
+   return vf->vertex_stride;
+}
+
+
+void draw_vf_set_vertex_info( struct draw_vertex_fetch *vf, 
+                              const struct vertex_info *vinfo,
+                              float point_size )
+{
+   unsigned i, j, k;
+   struct draw_vf_attr *a = vf->attr;
+   struct draw_vf_attr_map attrs[PIPE_MAX_SHADER_INPUTS];
+   unsigned count = 0;  /* for debug/sanity */
+   unsigned nr_attrs = 0;
+   
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      j = vinfo->src_index[i];
+      switch (vinfo->emit[i]) {
+      case EMIT_OMIT:
+         /* no-op */
+         break;
+      case EMIT_ALL: {
+         /* just copy the whole vertex as-is to the vbuf */
+	 unsigned s = vinfo->size;
+         assert(i == 0);
+         assert(j == 0);
+         /* copy the vertex header */
+         /* XXX: we actually don't copy the header, just pad it */
+	 attrs[nr_attrs].attrib = 0;
+	 attrs[nr_attrs].format = DRAW_EMIT_PAD;
+	 attrs[nr_attrs].offset = offsetof(struct vertex_header, data);
+	 s -= offsetof(struct vertex_header, data)/4;
+         count += offsetof(struct vertex_header, data)/4;
+	 nr_attrs++;
+	 /* copy the vertex data */
+         for(k = 0; k < (s & ~0x3); k += 4) {
+      	    attrs[nr_attrs].attrib = k/4;
+      	    attrs[nr_attrs].format = DRAW_EMIT_4F;
+      	    attrs[nr_attrs].offset = 0;
+      	    nr_attrs++;
+            count += 4;
+         }
+         /* tail */
+         /* XXX: actually, this shouldn't be needed */
+ 	 attrs[nr_attrs].attrib = k/4;
+  	 attrs[nr_attrs].offset = 0;
+         switch(s & 0x3) {
+         case 0:
+            break;
+         case 1:
+      	    attrs[nr_attrs].format = DRAW_EMIT_1F;
+      	    nr_attrs++;
+            count += 1;
+            break;
+         case 2:
+      	    attrs[nr_attrs].format = DRAW_EMIT_2F;
+      	    nr_attrs++;
+            count += 2;
+            break;
+         case 3:
+      	    attrs[nr_attrs].format = DRAW_EMIT_3F;
+      	    nr_attrs++;
+            count += 3;
+            break;
+         }
+         break;
+      }
+      case EMIT_1F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_1F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count++;
+         break;
+      case EMIT_1F_PSIZE:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_1F_CONST;
+	 attrs[nr_attrs].offset = 0;
+	 attrs[nr_attrs].data.f[0] = point_size;
+	 nr_attrs++;
+         count++;
+         break;
+      case EMIT_2F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_2F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 2;
+         break;
+      case EMIT_3F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_3F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 3;
+         break;
+      case EMIT_4F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_4F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 4;
+         break;
+      case EMIT_4UB:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_4UB_4F_BGRA;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 1;
+         break;
+      default:
+         assert(0);
+      }
+   }
+   
+   assert(count == vinfo->size);  
+   
+   draw_vf_set_vertex_attributes(vf, 
+                                 attrs, 
+                                 nr_attrs, 
+                                 vinfo->size * sizeof(float) );
+
+   for (j = 0; j < vf->attr_count; j++) {
+      a[j].inputsize = 4;
+      a[j].do_insert = a[j].insert[4 - 1];
+      if(a[j].isconst) {
+	 a[j].inputptr = a[j].data;
+	 a[j].inputstride = 0;
+      }
+   }
+}
+
+
+#if 0
+/* Set attribute pointers, adjusted for start position:
+ */
+void draw_vf_set_sources( struct draw_vertex_fetch *vf,
+		     GLvector4f * const sources[],
+		     unsigned start )
+{
+   struct draw_vf_attr *a = vf->attr;
+   unsigned j;
+   
+   for (j = 0; j < vf->attr_count; j++) {
+      const GLvector4f *vptr = sources[a[j].attrib];
+      
+      if ((a[j].inputstride != vptr->stride) ||
+	  (a[j].inputsize != vptr->size))
+	 vf->emit = choose_emit_func;
+      
+      a[j].inputstride = vptr->stride;
+      a[j].inputsize = vptr->size;
+      a[j].do_insert = a[j].insert[vptr->size - 1]; 
+      a[j].inputptr = ((uint8_t *)vptr->data) + start * vptr->stride;
+   }
+}
+#endif
+
+
+/**
+ * Emit a vertex to dest.  
+ */
+void draw_vf_emit_vertex( struct draw_vertex_fetch *vf,
+                          struct vertex_header *vertex,
+                          void *dest )
+{
+   struct draw_vf_attr *a = vf->attr;
+   unsigned j;
+   
+   for (j = 0; j < vf->attr_count; j++) {
+      if(!a[j].isconst) {
+	 a[j].inputptr = (uint8_t *)&vertex->data[a[j].attrib][0];
+	 a[j].inputstride = 0; /* XXX: one-vertex-max ATM */
+      }
+   }
+   
+   vf->emit( vf, 1, (uint8_t*) dest );
+}
+
+
+
+struct draw_vertex_fetch *draw_vf_create( void )
+{
+   struct draw_vertex_fetch *vf = CALLOC_STRUCT(draw_vertex_fetch);
+   unsigned i;
+
+   for (i = 0; i < PIPE_ATTRIB_MAX; i++)
+      vf->attr[i].vf = vf;
+
+   vf->identity[0] = 0.0;
+   vf->identity[1] = 0.0;
+   vf->identity[2] = 0.0;
+   vf->identity[3] = 1.0;
+
+   vf->codegen_emit = NULL;
+
+#ifdef USE_SSE_ASM
+   if (!GETENV("GALLIUM_NO_CODEGEN"))
+      vf->codegen_emit = draw_vf_generate_sse_emit;
+#endif
+
+   return vf;
+}
+
+
+void draw_vf_destroy( struct draw_vertex_fetch *vf )
+{
+   struct draw_vf_fastpath *fp, *tmp;
+
+   for (fp = vf->fastpath ; fp ; fp = tmp) {
+      tmp = fp->next;
+      FREE(fp->attr);
+
+      /* KW: At the moment, fp->func is constrained to be allocated by
+       * _mesa_exec_alloc(), as the hardwired fastpaths in
+       * t_vertex_generic.c are handled specially.  It would be nice
+       * to unify them, but this probably won't change until this
+       * module gets another overhaul.
+       */
+      //_mesa_exec_free((void *) fp->func);
+      FREE(fp);
+   }
+   
+   vf->fastpath = NULL;
+   FREE(vf);
+}
diff --git a/src/mesa/pipe/draw/draw_vf.h b/src/mesa/pipe/draw/draw_vf.h
new file mode 100644
index 0000000000..e694b98675
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+#ifndef DRAW_VF_H
+#define DRAW_VF_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+
+#include "draw_vertex.h"
+#include "draw_private.h" // for vertex_header
+
+
+enum draw_vf_attr_format {
+   DRAW_EMIT_1F,
+   DRAW_EMIT_2F,
+   DRAW_EMIT_3F,
+   DRAW_EMIT_4F,
+   DRAW_EMIT_3F_XYW,			/**< for projective texture */
+   DRAW_EMIT_1UB_1F,			/**< for fog coordinate */
+   DRAW_EMIT_3UB_3F_RGB,		/**< for specular color */
+   DRAW_EMIT_3UB_3F_BGR,		/**< for specular color */
+   DRAW_EMIT_4UB_4F_RGBA,		/**< for color */
+   DRAW_EMIT_4UB_4F_BGRA,		/**< for color */
+   DRAW_EMIT_4UB_4F_ARGB,		/**< for color */
+   DRAW_EMIT_4UB_4F_ABGR,		/**< for color */
+   DRAW_EMIT_1F_CONST,
+   DRAW_EMIT_2F_CONST,
+   DRAW_EMIT_3F_CONST,
+   DRAW_EMIT_4F_CONST,
+   DRAW_EMIT_PAD,			/**< leave a hole of 'offset' bytes */
+   DRAW_EMIT_MAX
+};
+
+struct draw_vf_attr_map 
+{
+   /** Input attribute number */
+   unsigned attrib;
+   
+   enum draw_vf_attr_format format;
+   
+   unsigned offset;
+   
+   /** 
+    * Constant data for DRAW_EMIT_*_CONST 
+    */
+   union {
+      uint8_t ub[4];
+      float f[4];
+   } data;
+};
+
+struct draw_vertex_fetch;
+
+
+
+#if 0
+unsigned 
+draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
+                               const struct draw_vf_attr_map *map,
+                               unsigned nr, 
+                               unsigned vertex_stride );
+#endif
+
+void draw_vf_set_vertex_info( struct draw_vertex_fetch *vf, 
+                              const struct vertex_info *vinfo,
+                              float point_size );
+
+#if 0
+void 
+draw_vf_set_sources( struct draw_vertex_fetch *vf,
+		     GLvector4f * const attrib[],
+		     unsigned start );
+#endif
+
+void 
+draw_vf_emit_vertex( struct draw_vertex_fetch *vf,
+                     struct vertex_header *vertex,
+                     void *dest );
+
+struct draw_vertex_fetch *
+draw_vf_create( void );
+
+void 
+draw_vf_destroy( struct draw_vertex_fetch *vf );
+
+
+
+/***********************************************************************
+ * Internal functions and structs:
+ */
+
+struct draw_vf_attr;
+
+typedef void (*draw_vf_extract_func)( const struct draw_vf_attr *a, 
+				      float *out, 
+				      const uint8_t *v );
+
+typedef void (*draw_vf_insert_func)( const struct draw_vf_attr *a, 
+				     uint8_t *v, 
+				     const float *in );
+
+typedef void (*draw_vf_emit_func)( struct draw_vertex_fetch *vf,
+      				   unsigned count, 
+      				   uint8_t *dest );
+
+
+
+/**
+ * Describes how to convert/move a vertex attribute from a vertex
+ * array to a vertex structure.
+ */
+struct draw_vf_attr
+{
+   struct draw_vertex_fetch *vf;
+
+   unsigned format;
+   unsigned inputsize;
+   unsigned inputstride;
+   unsigned vertoffset;      /**< position of the attrib in the vertex struct */
+   
+   boolean isconst;              /**< read from const data below */
+   uint8_t data[16];
+
+   unsigned attrib;          /**< which vertex attrib (0=position, etc) */
+   unsigned vertattrsize;    /**< size of the attribute in bytes */
+
+   uint8_t *inputptr;
+   const draw_vf_insert_func *insert;
+   draw_vf_insert_func do_insert;
+   draw_vf_extract_func extract;
+};
+
+struct draw_vertex_fetch
+{
+   struct draw_vf_attr attr[PIPE_ATTRIB_MAX];
+   unsigned attr_count;
+   unsigned vertex_stride;
+
+   draw_vf_emit_func emit;
+
+   /* Parameters and constants for codegen:
+    */
+   float identity[4];
+
+   struct draw_vf_fastpath *fastpath;
+   
+   void (*codegen_emit)( struct draw_vertex_fetch *vf );
+};
+
+
+struct draw_vf_attr_type {
+   unsigned format;
+   unsigned size;
+   unsigned stride;
+   unsigned offset;
+};
+
+struct draw_vf_fastpath {
+   unsigned vertex_stride;
+   unsigned attr_count;
+   boolean match_strides;
+
+   struct draw_vf_attr_type *attr;
+
+   draw_vf_emit_func func;
+   struct draw_vf_fastpath *next;
+};
+
+
+void 
+draw_vf_register_fastpath( struct draw_vertex_fetch *vtx,
+                           boolean match_strides );
+
+void 
+draw_vf_generic_emit( struct draw_vertex_fetch *vf,
+                      unsigned count,
+                      uint8_t *v );
+
+void 
+draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf );
+
+void 
+draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf );
+
+
+struct draw_vf_format_info {
+   const char *name;
+   draw_vf_insert_func insert[4];
+   const unsigned attrsize;
+   const boolean isconst;
+};
+
+extern const struct draw_vf_format_info 
+draw_vf_format_info[DRAW_EMIT_MAX];
+
+
+#endif
diff --git a/src/mesa/pipe/draw/draw_vf_generic.c b/src/mesa/pipe/draw/draw_vf_generic.c
new file mode 100644
index 0000000000..7a60a9db9c
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf_generic.c
@@ -0,0 +1,585 @@
+
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+
+#include "draw_vf.h"
+
+
+
+static INLINE void insert_4f_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[2];
+   out[3] = in[3];
+}
+
+static INLINE void insert_4f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[2];
+   out[3] = 1;
+}
+
+static INLINE void insert_4f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = 0;
+   out[3] = 1;
+}
+
+static INLINE void insert_4f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = 0;
+   out[2] = 0;
+   out[3] = 1;
+}
+
+static INLINE void insert_3f_xyw_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[3];
+}
+
+static INLINE void insert_3f_xyw_err( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a; (void) v; (void) in;
+   assert(0);
+}
+
+static INLINE void insert_3f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[2];
+}
+
+static INLINE void insert_3f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = 0;
+}
+
+static INLINE void insert_3f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = 0;
+   out[2] = 0;
+}
+
+
+static INLINE void insert_2f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+}
+
+static INLINE void insert_2f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = 0;
+}
+
+static INLINE void insert_1f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+
+   out[0] = in[0];
+}
+
+static INLINE void insert_null( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a; (void) v; (void) in;
+}
+
+static INLINE void insert_4ub_4f_rgba_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
+}
+
+static INLINE void insert_4ub_4f_rgba_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_rgba_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[2] = 0;
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_rgba_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   v[1] = 0;
+   v[2] = 0;
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_bgra_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
+}
+
+static INLINE void insert_4ub_4f_bgra_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_bgra_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[0] = 0;
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_bgra_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   v[1] = 0;
+   v[0] = 0;
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_argb_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
+}
+
+static INLINE void insert_4ub_4f_argb_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_argb_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   v[3] = 0x00;
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_argb_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   v[2] = 0x00;
+   v[3] = 0x00;
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_abgr_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
+}
+
+static INLINE void insert_4ub_4f_abgr_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_abgr_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   v[1] = 0x00;
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_abgr_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   v[2] = 0x00;
+   v[1] = 0x00;
+   v[0] = 0xff;
+}
+
+static INLINE void insert_3ub_3f_rgb_3( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+}
+
+static INLINE void insert_3ub_3f_rgb_2( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[2] = 0;
+}
+
+static INLINE void insert_3ub_3f_rgb_1( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   v[1] = 0;
+   v[2] = 0;
+}
+
+static INLINE void insert_3ub_3f_bgr_3( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+}
+
+static INLINE void insert_3ub_3f_bgr_2( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[0] = 0;
+}
+
+static INLINE void insert_3ub_3f_bgr_1( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   v[1] = 0;
+   v[0] = 0;
+}
+
+
+static INLINE void insert_1ub_1f_1( const struct draw_vf_attr *a, uint8_t *v, 
+				    const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+}
+
+
+const struct draw_vf_format_info draw_vf_format_info[DRAW_EMIT_MAX] = 
+{
+   { "1f",
+     { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
+     sizeof(float), FALSE },
+
+   { "2f",
+     { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
+     2 * sizeof(float), FALSE },
+
+   { "3f",
+     { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
+     3 * sizeof(float), FALSE },
+
+   { "4f",
+     { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
+     4 * sizeof(float), FALSE },
+
+   { "3f_xyw",
+     { insert_3f_xyw_err, insert_3f_xyw_err, insert_3f_xyw_err, 
+       insert_3f_xyw_4 },
+     3 * sizeof(float), FALSE },
+
+   { "1ub_1f",
+     { insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1 },
+     sizeof(uint8_t), FALSE },
+
+   { "3ub_3f_rgb",
+     { insert_3ub_3f_rgb_1, insert_3ub_3f_rgb_2, insert_3ub_3f_rgb_3,
+       insert_3ub_3f_rgb_3 },
+     3 * sizeof(uint8_t), FALSE },
+
+   { "3ub_3f_bgr",
+     { insert_3ub_3f_bgr_1, insert_3ub_3f_bgr_2, insert_3ub_3f_bgr_3,
+       insert_3ub_3f_bgr_3 },
+     3 * sizeof(uint8_t), FALSE },
+
+   { "4ub_4f_rgba",
+     { insert_4ub_4f_rgba_1, insert_4ub_4f_rgba_2, insert_4ub_4f_rgba_3, 
+       insert_4ub_4f_rgba_4 },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "4ub_4f_bgra",
+     { insert_4ub_4f_bgra_1, insert_4ub_4f_bgra_2, insert_4ub_4f_bgra_3,
+       insert_4ub_4f_bgra_4 },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "4ub_4f_argb",
+     { insert_4ub_4f_argb_1, insert_4ub_4f_argb_2, insert_4ub_4f_argb_3,
+       insert_4ub_4f_argb_4 },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "4ub_4f_abgr",
+     { insert_4ub_4f_abgr_1, insert_4ub_4f_abgr_2, insert_4ub_4f_abgr_3,
+       insert_4ub_4f_abgr_4 },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "1f_const",
+     { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
+     sizeof(float), TRUE },
+   
+   { "2f_const",
+     { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
+     2 * sizeof(float), TRUE },
+   
+   { "3f_const",
+     { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
+     3 * sizeof(float), TRUE },
+   
+   { "4f_const",
+     { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
+     4 * sizeof(float), TRUE },
+
+   { "pad",
+     { NULL, NULL, NULL, NULL },
+     0, FALSE },
+
+};
+
+
+
+    
+/***********************************************************************
+ * Hardwired fastpaths for emitting whole vertices or groups of
+ * vertices
+ */
+#define EMIT5(NR, F0, F1, F2, F3, F4, NAME)				\
+static void NAME( struct draw_vertex_fetch *vf,				\
+		  unsigned count,						\
+		  uint8_t *v )						\
+{									\
+   struct draw_vf_attr *a = vf->attr;				\
+   unsigned i;								\
+									\
+   for (i = 0 ; i < count ; i++, v += vf->vertex_stride) {		\
+      if (NR > 0) {							\
+	 F0( &a[0], v + a[0].vertoffset, (float *)a[0].inputptr );	\
+	 a[0].inputptr += a[0].inputstride;				\
+      }									\
+      									\
+      if (NR > 1) {							\
+	 F1( &a[1], v + a[1].vertoffset, (float *)a[1].inputptr );	\
+	 a[1].inputptr += a[1].inputstride;				\
+      }									\
+      									\
+      if (NR > 2) {							\
+	 F2( &a[2], v + a[2].vertoffset, (float *)a[2].inputptr );	\
+	 a[2].inputptr += a[2].inputstride;				\
+      }									\
+      									\
+      if (NR > 3) {							\
+	 F3( &a[3], v + a[3].vertoffset, (float *)a[3].inputptr );	\
+	 a[3].inputptr += a[3].inputstride;				\
+      }									\
+									\
+      if (NR > 4) {							\
+	 F4( &a[4], v + a[4].vertoffset, (float *)a[4].inputptr );	\
+	 a[4].inputptr += a[4].inputstride;				\
+      }									\
+   }									\
+}
+
+   
+#define EMIT2(F0, F1, NAME) EMIT5(2, F0, F1, insert_null, \
+				  insert_null, insert_null, NAME)
+
+#define EMIT3(F0, F1, F2, NAME) EMIT5(3, F0, F1, F2, insert_null, \
+				      insert_null, NAME)
+   
+#define EMIT4(F0, F1, F2, F3, NAME) EMIT5(4, F0, F1, F2, F3, \
+				          insert_null, NAME)
+   
+
+EMIT2(insert_3f_3, insert_4ub_4f_rgba_4, emit_xyz3_rgba4)
+
+EMIT3(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_xyzw4_rgba4_st2)
+
+EMIT4(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_xyzw4_rgba4_st2_st2)
+
+
+/* Use the codegen paths to select one of a number of hardwired
+ * fastpaths.
+ */
+void draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf )
+{
+   draw_vf_emit_func func = NULL;
+
+   /* Does it fit a hardwired fastpath?  Help! this is growing out of
+    * control!
+    */
+   switch (vf->attr_count) {
+   case 2:
+      if (vf->attr[0].do_insert == insert_3f_3 &&
+	  vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
+ 	 func = emit_xyz3_rgba4; 
+      }
+      break;
+   case 3:
+      if (vf->attr[2].do_insert == insert_2f_2) {
+	 if (vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
+	    if (vf->attr[0].do_insert == insert_4f_4) 
+	       func = emit_xyzw4_rgba4_st2;
+	 }
+      }
+      break;
+   case 4:
+      if (vf->attr[2].do_insert == insert_2f_2 &&
+	  vf->attr[3].do_insert == insert_2f_2) {
+	 if (vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
+	    if (vf->attr[0].do_insert == insert_4f_4) 
+	       func = emit_xyzw4_rgba4_st2_st2;
+	 }
+      }
+      break;
+   }
+
+   vf->emit = func;
+}
+
+/***********************************************************************
+ * Generic (non-codegen) functions for whole vertices or groups of
+ * vertices
+ */
+
+void draw_vf_generic_emit( struct draw_vertex_fetch *vf,
+		      unsigned count,
+		      uint8_t *v )
+{
+   struct draw_vf_attr *a = vf->attr;
+   const unsigned attr_count = vf->attr_count;
+   const unsigned stride = vf->vertex_stride;
+   unsigned i, j;
+
+   for (i = 0 ; i < count ; i++, v += stride) {
+      for (j = 0; j < attr_count; j++) {
+	 float *in = (float *)a[j].inputptr;
+	 a[j].inputptr += a[j].inputstride;
+	 a[j].do_insert( &a[j], v + a[j].vertoffset, in );
+      }
+   }
+}
+
+
diff --git a/src/mesa/pipe/draw/draw_vf_sse.c b/src/mesa/pipe/draw/draw_vf_sse.c
new file mode 100644
index 0000000000..1ad2ae756d
--- /dev/null
+++ b/src/mesa/pipe/draw/draw_vf_sse.c
@@ -0,0 +1,614 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+
+#include "simple_list.h"
+
+#include "pipe/p_compiler.h"
+
+#include "draw_vf.h"
+
+
+#if defined(USE_SSE_ASM)
+
+#include "x86/rtasm/x86sse.h"
+#include "x86/common_x86_asm.h"
+
+
+#define X    0
+#define Y    1
+#define Z    2
+#define W    3
+
+
+struct x86_program {
+   struct x86_function func;
+
+   struct draw_vertex_fetch *vf;
+   boolean inputs_safe;
+   boolean outputs_safe;
+   boolean have_sse2;
+   
+   struct x86_reg identity;
+   struct x86_reg chan0;
+};
+
+
+static struct x86_reg get_identity( struct x86_program *p )
+{
+   return p->identity;
+}
+
+static void emit_load4f_4( struct x86_program *p, 			   
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movups(&p->func, dest, arg0);
+}
+
+static void emit_load4f_3( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Have to jump through some hoops:
+    *
+    * c 0 0 0
+    * c 0 0 1
+    * 0 0 c 1
+    * a b c 1
+    */
+   sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
+   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
+   sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );
+   sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_load4f_2( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Initialize from identity, then pull in low two words:
+    */
+   sse_movups(&p->func, dest, get_identity(p));
+   sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_load4f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Pull in low word, then swizzle in identity */
+   sse_movss(&p->func, dest, arg0);
+   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
+}
+
+
+
+static void emit_load3f_3( struct x86_program *p, 			   
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Over-reads by 1 dword - potential SEGV if input is a vertex
+    * array.
+    */
+   if (p->inputs_safe) {
+      sse_movups(&p->func, dest, arg0);
+   } 
+   else {
+      /* c 0 0 0
+       * c c c c
+       * a b c c 
+       */
+      sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
+      sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
+      sse_movlps(&p->func, dest, arg0);
+   }
+}
+
+static void emit_load3f_2( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   emit_load4f_2(p, dest, arg0);
+}
+
+static void emit_load3f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   emit_load4f_1(p, dest, arg0);
+}
+
+static void emit_load2f_2( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_load2f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   emit_load4f_1(p, dest, arg0);
+}
+
+static void emit_load1f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movss(&p->func, dest, arg0);
+}
+
+static void (*load[4][4])( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 ) = {
+   { emit_load1f_1, 
+     emit_load1f_1, 
+     emit_load1f_1, 
+     emit_load1f_1 },
+
+   { emit_load2f_1, 
+     emit_load2f_2, 
+     emit_load2f_2, 
+     emit_load2f_2 },
+
+   { emit_load3f_1, 
+     emit_load3f_2, 
+     emit_load3f_3, 
+     emit_load3f_3 },
+
+   { emit_load4f_1, 
+     emit_load4f_2, 
+     emit_load4f_3, 
+     emit_load4f_4 } 
+};
+
+static void emit_load( struct x86_program *p,
+		       struct x86_reg dest,
+		       unsigned sz,
+		       struct x86_reg src,
+		       unsigned src_sz)
+{
+   load[sz-1][src_sz-1](p, dest, src);
+}
+
+static void emit_store4f( struct x86_program *p, 			   
+			  struct x86_reg dest,
+			  struct x86_reg arg0 )
+{
+   sse_movups(&p->func, dest, arg0);
+}
+
+static void emit_store3f( struct x86_program *p, 
+			  struct x86_reg dest,
+			  struct x86_reg arg0 )
+{
+   if (p->outputs_safe) {
+      /* Emit the extra dword anyway.  This may hurt writecombining,
+       * may cause other problems.
+       */
+      sse_movups(&p->func, dest, arg0);
+   }
+   else {
+      /* Alternate strategy - emit two, shuffle, emit one.
+       */
+      sse_movlps(&p->func, dest, arg0);
+      sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
+      sse_movss(&p->func, x86_make_disp(dest,8), arg0);
+   }
+}
+
+static void emit_store2f( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_store1f( struct x86_program *p, 
+			  struct x86_reg dest,
+			  struct x86_reg arg0 )
+{
+   sse_movss(&p->func, dest, arg0);
+}
+
+
+static void (*store[4])( struct x86_program *p, 
+			 struct x86_reg dest,
+			 struct x86_reg arg0 ) = 
+{
+   emit_store1f, 
+   emit_store2f, 
+   emit_store3f, 
+   emit_store4f 
+};
+
+static void emit_store( struct x86_program *p,
+			struct x86_reg dest,
+			unsigned sz,
+			struct x86_reg temp )
+
+{
+   store[sz-1](p, dest, temp);
+}
+
+static void emit_pack_store_4ub( struct x86_program *p,
+				 struct x86_reg dest,
+				 struct x86_reg temp )
+{
+   /* Scale by 255.0
+    */
+   sse_mulps(&p->func, temp, p->chan0);
+
+   if (p->have_sse2) {
+      sse2_cvtps2dq(&p->func, temp, temp);
+      sse2_packssdw(&p->func, temp, temp);
+      sse2_packuswb(&p->func, temp, temp);
+      sse_movss(&p->func, dest, temp);
+   }
+   else {
+      struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
+      struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);
+      sse_cvtps2pi(&p->func, mmx0, temp);
+      sse_movhlps(&p->func, temp, temp);
+      sse_cvtps2pi(&p->func, mmx1, temp);
+      mmx_packssdw(&p->func, mmx0, mmx1);
+      mmx_packuswb(&p->func, mmx0, mmx0);
+      mmx_movd(&p->func, dest, mmx0);
+   }
+}
+
+static int get_offset( const void *a, const void *b )
+{
+   return (const char *)b - (const char *)a;
+}
+
+/* Not much happens here.  Eventually use this function to try and
+ * avoid saving/reloading the source pointers each vertex (if some of
+ * them can fit in registers).
+ */
+static void get_src_ptr( struct x86_program *p,
+			 struct x86_reg srcREG,
+			 struct x86_reg vfREG,
+			 struct draw_vf_attr *a )
+{
+   struct draw_vertex_fetch *vf = p->vf;
+   struct x86_reg ptr_to_src = x86_make_disp(vfREG, get_offset(vf, &a->inputptr));
+
+   /* Load current a[j].inputptr
+    */
+   x86_mov(&p->func, srcREG, ptr_to_src);
+}
+
+static void update_src_ptr( struct x86_program *p,
+			 struct x86_reg srcREG,
+			 struct x86_reg vfREG,
+			 struct draw_vf_attr *a )
+{
+   if (a->inputstride) {
+      struct draw_vertex_fetch *vf = p->vf;
+      struct x86_reg ptr_to_src = x86_make_disp(vfREG, get_offset(vf, &a->inputptr));
+
+      /* add a[j].inputstride (hardcoded value - could just as easily
+       * pull the stride value from memory each time).
+       */
+      x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));
+      
+      /* save new value of a[j].inputptr 
+       */
+      x86_mov(&p->func, ptr_to_src, srcREG);
+   }
+}
+
+
+/* Lots of hardcoding
+ *
+ * EAX -- pointer to current output vertex
+ * ECX -- pointer to current attribute 
+ * 
+ */
+static boolean build_vertex_emit( struct x86_program *p )
+{
+   struct draw_vertex_fetch *vf = p->vf;
+   unsigned j = 0;
+
+   struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
+   struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
+   struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
+   struct x86_reg vfESI = x86_make_reg(file_REG32, reg_SI);
+   struct x86_reg temp = x86_make_reg(file_XMM, 0);
+   uint8_t *fixup, *label;
+
+   /* Push a few regs?
+    */
+   x86_push(&p->func, countEBP);
+   x86_push(&p->func, vfESI);
+
+
+   /* Get vertex count, compare to zero
+    */
+   x86_xor(&p->func, srcECX, srcECX);
+   x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
+   x86_cmp(&p->func, countEBP, srcECX);
+   fixup = x86_jcc_forward(&p->func, cc_E);
+
+   /* Initialize destination register. 
+    */
+   x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));
+
+   /* Move argument 1 (vf) into a reg:
+    */
+   x86_mov(&p->func, vfESI, x86_fn_arg(&p->func, 1));
+
+   
+   /* always load, needed or not:
+    */
+   sse_movups(&p->func, p->identity, x86_make_disp(vfESI, get_offset(vf, &vf->identity[0])));
+
+   /* Note address for loop jump */
+   label = x86_get_label(&p->func);
+
+   /* Emit code for each of the attributes.  Currently routes
+    * everything through SSE registers, even when it might be more
+    * efficient to stick with regular old x86.  No optimization or
+    * other tricks - enough new ground to cover here just getting
+    * things working.
+    */
+   while (j < vf->attr_count) {
+      struct draw_vf_attr *a = &vf->attr[j];
+      struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);
+
+      /* Now, load an XMM reg from src, perhaps transform, then save.
+       * Could be shortcircuited in specific cases:
+       */
+      switch (a->format) {
+      case DRAW_EMIT_1F:
+      case DRAW_EMIT_1F_CONST:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
+	 emit_store(p, dest, 1, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_2F:
+      case DRAW_EMIT_2F_CONST:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+	 emit_store(p, dest, 2, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_3F:
+      case DRAW_EMIT_3F_CONST:
+	 /* Potentially the worst case - hardcode 2+1 copying:
+	  */
+	 if (0) {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	    emit_store(p, dest, 3, temp);
+	    update_src_ptr(p, srcECX, vfESI, a);
+	 }
+	 else {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+	    emit_store(p, dest, 2, temp);
+	    if (a->inputsize > 2) {
+	       emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
+	       emit_store(p, x86_make_disp(dest,8), 1, temp);
+	    }
+	    else {
+	       sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
+	    }
+	    update_src_ptr(p, srcECX, vfESI, a);
+	 }
+	 break;
+      case DRAW_EMIT_4F:
+      case DRAW_EMIT_4F_CONST:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 emit_store(p, dest, 4, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_3F_XYW:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
+	 emit_store(p, dest, 3, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+
+      case DRAW_EMIT_1UB_1F:	 
+	 /* Test for PAD3 + 1UB:
+	  */
+	 if (j > 0 &&
+	     a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
+	 {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
+	    sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
+	    emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
+	    update_src_ptr(p, srcECX, vfESI, a);
+	 }
+	 else {
+	    debug_printf("Can't emit 1ub %x %x %d\n", 
+	            a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
+	    return FALSE;
+	 }
+	 break;
+      case DRAW_EMIT_3UB_3F_RGB:
+      case DRAW_EMIT_3UB_3F_BGR:
+	 /* Test for 3UB + PAD1:
+	  */
+	 if (j == vf->attr_count - 1 ||
+	     a[1].vertoffset >= a->vertoffset + 4) {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	    if (a->format == DRAW_EMIT_3UB_3F_BGR)
+	       sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
+	    emit_pack_store_4ub(p, dest, temp);
+	    update_src_ptr(p, srcECX, vfESI, a);
+	 }
+	 /* Test for 3UB + 1UB:
+	  */
+	 else if (j < vf->attr_count - 1 &&
+		  a[1].format == DRAW_EMIT_1UB_1F &&
+		  a[1].vertoffset == a->vertoffset + 3) {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	    update_src_ptr(p, srcECX, vfESI, a);
+
+	    /* Make room for incoming value:
+	     */
+	    sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
+
+	    get_src_ptr(p, srcECX, vfESI, &a[1]);
+	    emit_load(p, temp, 1, x86_deref(srcECX), a[1].inputsize);
+	    update_src_ptr(p, srcECX, vfESI, &a[1]);
+
+	    /* Rearrange and possibly do BGR conversion:
+	     */
+	    if (a->format == DRAW_EMIT_3UB_3F_BGR)
+	       sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
+	    else
+	       sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));
+
+	    emit_pack_store_4ub(p, dest, temp);
+	    j++;		/* NOTE: two attrs consumed */
+	 }
+	 else {
+	    debug_printf("Can't emit 3ub\n");
+	 }
+	 return FALSE;	/* add this later */
+	 break;
+
+      case DRAW_EMIT_4UB_4F_RGBA:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_4UB_4F_BGRA:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_4UB_4F_ARGB:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_4UB_4F_ABGR:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      default:
+	 debug_printf("unknown a[%d].format %d\n", j, a->format);
+	 return FALSE;	/* catch any new opcodes */
+      }
+      
+      /* Increment j by at least 1 - may have been incremented above also:
+       */
+      j++;
+   }
+
+   /* Next vertex:
+    */
+   x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vf->vertex_stride));
+
+   /* decr count, loop if not zero
+    */
+   x86_dec(&p->func, countEBP);
+   x86_test(&p->func, countEBP, countEBP); 
+   x86_jcc(&p->func, cc_NZ, label);
+
+   /* Exit mmx state?
+    */
+   if (p->func.need_emms)
+      mmx_emms(&p->func);
+
+   /* Land forward jump here:
+    */
+   x86_fixup_fwd_jump(&p->func, fixup);
+
+   /* Pop regs and return
+    */
+   x86_pop(&p->func, x86_get_base_reg(vfESI));
+   x86_pop(&p->func, countEBP);
+   x86_ret(&p->func);
+
+   vf->emit = (draw_vf_emit_func)x86_get_func(&p->func);
+   return TRUE;
+}
+
+
+
+void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
+{
+   struct x86_program p;   
+
+   if (!cpu_has_xmm) {
+      vf->codegen_emit = NULL;
+      return;
+   }
+
+   memset(&p, 0, sizeof(p));
+
+   p.vf = vf;
+   p.inputs_safe = 0;		/* for now */
+   p.outputs_safe = 1;		/* for now */
+   p.have_sse2 = cpu_has_xmm2;
+   p.identity = x86_make_reg(file_XMM, 6);
+   p.chan0 = x86_make_reg(file_XMM, 7);
+
+   x86_init_func(&p.func);
+
+   if (build_vertex_emit(&p)) {
+      draw_vf_register_fastpath( vf, TRUE );
+   }
+   else {
+      /* Note the failure so that we don't keep trying to codegen an
+       * impossible state:
+       */
+      draw_vf_register_fastpath( vf, FALSE );
+      x86_release_func(&p.func);
+   }
+}
+
+#else
+
+void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
+{
+   /* Dummy version for when USE_SSE_ASM not defined */
+}
+
+#endif
diff --git a/src/mesa/pipe/failover/fo_context.c b/src/mesa/pipe/failover/fo_context.c
index cf6c9fed50..7ce4a7df17 100644
--- a/src/mesa/pipe/failover/fo_context.c
+++ b/src/mesa/pipe/failover/fo_context.c
@@ -114,6 +114,8 @@ struct pipe_context *failover_create( struct pipe_context *hw,
    if (failover == NULL)
       return NULL;
 
+   failover->hw = hw;
+   failover->sw = sw;
    failover->pipe.winsys = hw->winsys;
    failover->pipe.destroy = failover_destroy;
    failover->pipe.is_format_supported = hw->is_format_supported;
diff --git a/src/mesa/pipe/failover/fo_state.c b/src/mesa/pipe/failover/fo_state.c
index fa700b9674..0fc5568da1 100644
--- a/src/mesa/pipe/failover/fo_state.c
+++ b/src/mesa/pipe/failover/fo_state.c
@@ -54,8 +54,8 @@ failover_create_blend_state( struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_blend_state(pipe, blend);
-   state->hw_state = failover->hw->create_blend_state(pipe, blend);
+   state->sw_state = failover->sw->create_blend_state(failover->sw, blend);
+   state->hw_state = failover->hw->create_blend_state(failover->hw, blend);
 
    return state;
 }
@@ -68,6 +68,7 @@ failover_bind_blend_state( struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state *)blend;
    failover->blend = state;
    failover->dirty |= FO_NEW_BLEND;
+   failover->sw->bind_blend_state( failover->sw, state->sw_state );
    failover->hw->bind_blend_state( failover->hw, state->hw_state );
 }
 
@@ -78,8 +79,8 @@ failover_delete_blend_state( struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)blend;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_blend_state(pipe, state->sw_state);
-   failover->hw->delete_blend_state(pipe, state->hw_state);
+   failover->sw->delete_blend_state(failover->sw, state->sw_state);
+   failover->hw->delete_blend_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -93,6 +94,7 @@ failover_set_blend_color( struct pipe_context *pipe,
 
    failover->blend_color = *blend_color;
    failover->dirty |= FO_NEW_BLEND_COLOR;
+   failover->sw->set_blend_color( failover->sw, blend_color );
    failover->hw->set_blend_color( failover->hw, blend_color );
 }
 
@@ -104,6 +106,7 @@ failover_set_clip_state( struct pipe_context *pipe,
 
    failover->clip = *clip;
    failover->dirty |= FO_NEW_CLIP;
+   failover->sw->set_clip_state( failover->sw, clip );
    failover->hw->set_clip_state( failover->hw, clip );
 }
 
@@ -115,8 +118,8 @@ failover_create_depth_stencil_state(struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_depth_stencil_alpha_state(pipe, templ);
-   state->hw_state = failover->hw->create_depth_stencil_alpha_state(pipe, templ);
+   state->sw_state = failover->sw->create_depth_stencil_alpha_state(failover->sw, templ);
+   state->hw_state = failover->hw->create_depth_stencil_alpha_state(failover->hw, templ);
 
    return state;
 }
@@ -129,6 +132,7 @@ failover_bind_depth_stencil_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state *)depth_stencil;
    failover->depth_stencil = state;
    failover->dirty |= FO_NEW_DEPTH_STENCIL;
+   failover->sw->bind_depth_stencil_alpha_state(failover->sw, state->sw_state);
    failover->hw->bind_depth_stencil_alpha_state(failover->hw, state->hw_state);
 }
 
@@ -139,8 +143,8 @@ failover_delete_depth_stencil_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)ds;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_depth_stencil_alpha_state(pipe, state->sw_state);
-   failover->hw->delete_depth_stencil_alpha_state(pipe, state->hw_state);
+   failover->sw->delete_depth_stencil_alpha_state(failover->sw, state->sw_state);
+   failover->hw->delete_depth_stencil_alpha_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -154,6 +158,7 @@ failover_set_framebuffer_state(struct pipe_context *pipe,
 
    failover->framebuffer = *framebuffer;
    failover->dirty |= FO_NEW_FRAMEBUFFER;
+   failover->sw->set_framebuffer_state( failover->sw, framebuffer );
    failover->hw->set_framebuffer_state( failover->hw, framebuffer );
 }
 
@@ -165,8 +170,8 @@ failover_create_fs_state(struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_fs_state(pipe, templ);
-   state->hw_state = failover->hw->create_fs_state(pipe, templ);
+   state->sw_state = failover->sw->create_fs_state(failover->sw, templ);
+   state->hw_state = failover->hw->create_fs_state(failover->hw, templ);
 
    return state;
 }
@@ -178,6 +183,7 @@ failover_bind_fs_state(struct pipe_context *pipe, void *fs)
    struct fo_state *state = (struct fo_state*)fs;
    failover->fragment_shader = state;
    failover->dirty |= FO_NEW_FRAGMENT_SHADER;
+   failover->sw->bind_fs_state(failover->sw, state->sw_state);
    failover->hw->bind_fs_state(failover->hw, state->hw_state);
 }
 
@@ -188,8 +194,8 @@ failover_delete_fs_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)fs;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_fs_state(pipe, state->sw_state);
-   failover->hw->delete_fs_state(pipe, state->hw_state);
+   failover->sw->delete_fs_state(failover->sw, state->sw_state);
+   failover->hw->delete_fs_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -202,8 +208,8 @@ failover_create_vs_state(struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_vs_state(pipe, templ);
-   state->hw_state = failover->hw->create_vs_state(pipe, templ);
+   state->sw_state = failover->sw->create_vs_state(failover->sw, templ);
+   state->hw_state = failover->hw->create_vs_state(failover->hw, templ);
 
    return state;
 }
@@ -217,6 +223,7 @@ failover_bind_vs_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)vs;
    failover->vertex_shader = state;
    failover->dirty |= FO_NEW_VERTEX_SHADER;
+   failover->sw->bind_vs_state(failover->sw, state->sw_state);
    failover->hw->bind_vs_state(failover->hw, state->hw_state);
 }
 
@@ -227,8 +234,8 @@ failover_delete_vs_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)vs;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_vs_state(pipe, state->sw_state);
-   failover->hw->delete_vs_state(pipe, state->hw_state);
+   failover->sw->delete_vs_state(failover->sw, state->sw_state);
+   failover->hw->delete_vs_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -242,6 +249,7 @@ failover_set_polygon_stipple( struct pipe_context *pipe,
 
    failover->poly_stipple = *stipple;
    failover->dirty |= FO_NEW_STIPPLE;
+   failover->sw->set_polygon_stipple( failover->sw, stipple );
    failover->hw->set_polygon_stipple( failover->hw, stipple );
 }
 
@@ -253,8 +261,8 @@ failover_create_rasterizer_state(struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_rasterizer_state(pipe, templ);
-   state->hw_state = failover->hw->create_rasterizer_state(pipe, templ);
+   state->sw_state = failover->sw->create_rasterizer_state(failover->sw, templ);
+   state->hw_state = failover->hw->create_rasterizer_state(failover->hw, templ);
 
    return state;
 }
@@ -268,6 +276,7 @@ failover_bind_rasterizer_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)raster;
    failover->rasterizer = state;
    failover->dirty |= FO_NEW_RASTERIZER;
+   failover->sw->bind_rasterizer_state(failover->sw, state->sw_state);
    failover->hw->bind_rasterizer_state(failover->hw, state->hw_state);
 }
 
@@ -278,8 +287,8 @@ failover_delete_rasterizer_state(struct pipe_context *pipe,
    struct fo_state *state = (struct fo_state*)raster;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_rasterizer_state(pipe, state->sw_state);
-   failover->hw->delete_rasterizer_state(pipe, state->hw_state);
+   failover->sw->delete_rasterizer_state(failover->sw, state->sw_state);
+   failover->hw->delete_rasterizer_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -294,6 +303,7 @@ failover_set_scissor_state( struct pipe_context *pipe,
 
    failover->scissor = *scissor;
    failover->dirty |= FO_NEW_SCISSOR;
+   failover->sw->set_scissor_state( failover->sw, scissor );
    failover->hw->set_scissor_state( failover->hw, scissor );
 }
 
@@ -305,8 +315,8 @@ failover_create_sampler_state(struct pipe_context *pipe,
    struct fo_state *state = malloc(sizeof(struct fo_state));
    struct failover_context *failover = failover_context(pipe);
 
-   state->sw_state = failover->sw->create_sampler_state(pipe, templ);
-   state->hw_state = failover->hw->create_sampler_state(pipe, templ);
+   state->sw_state = failover->sw->create_sampler_state(failover->sw, templ);
+   state->hw_state = failover->hw->create_sampler_state(failover->hw, templ);
 
    return state;
 }
@@ -320,6 +330,8 @@ failover_bind_sampler_state(struct pipe_context *pipe,
    failover->sampler[unit] = state;
    failover->dirty |= FO_NEW_SAMPLER;
    failover->dirty_sampler |= (1<<unit);
+   failover->sw->bind_sampler_state(failover->sw, unit,
+                                    state->sw_state);
    failover->hw->bind_sampler_state(failover->hw, unit,
                                     state->hw_state);
 }
@@ -330,8 +342,8 @@ failover_delete_sampler_state(struct pipe_context *pipe, void *sampler)
    struct fo_state *state = (struct fo_state*)sampler;
    struct failover_context *failover = failover_context(pipe);
 
-   failover->sw->delete_sampler_state(pipe, state->sw_state);
-   failover->hw->delete_sampler_state(pipe, state->hw_state);
+   failover->sw->delete_sampler_state(failover->sw, state->sw_state);
+   failover->hw->delete_sampler_state(failover->hw, state->hw_state);
    state->sw_state = 0;
    state->hw_state = 0;
    free(state);
@@ -348,6 +360,7 @@ failover_set_sampler_texture(struct pipe_context *pipe,
    failover->texture[unit] = texture;
    failover->dirty |= FO_NEW_TEXTURE;
    failover->dirty_texture |= (1<<unit);
+   failover->sw->set_sampler_texture( failover->sw, unit, texture );
    failover->hw->set_sampler_texture( failover->hw, unit, texture );
 }
 
@@ -360,6 +373,7 @@ failover_set_viewport_state( struct pipe_context *pipe,
 
    failover->viewport = *viewport; 
    failover->dirty |= FO_NEW_VIEWPORT;
+   failover->sw->set_viewport_state( failover->sw, viewport );
    failover->hw->set_viewport_state( failover->hw, viewport );
 }
 
@@ -374,6 +388,7 @@ failover_set_vertex_buffer(struct pipe_context *pipe,
    failover->vertex_buffer[unit] = *vertex_buffer;
    failover->dirty |= FO_NEW_VERTEX_BUFFER;
    failover->dirty_vertex_buffer |= (1<<unit);
+   failover->sw->set_vertex_buffer( failover->sw, unit, vertex_buffer );
    failover->hw->set_vertex_buffer( failover->hw, unit, vertex_buffer );
 }
 
@@ -388,9 +403,24 @@ failover_set_vertex_element(struct pipe_context *pipe,
    failover->vertex_element[unit] = *vertex_element;
    failover->dirty |= FO_NEW_VERTEX_ELEMENT;
    failover->dirty_vertex_element |= (1<<unit);
+   failover->sw->set_vertex_element( failover->sw, unit, vertex_element );
    failover->hw->set_vertex_element( failover->hw, unit, vertex_element );
 }
 
+void
+failover_set_constant_buffer(struct pipe_context *pipe,
+                             uint shader, uint index,
+                             const struct pipe_constant_buffer *buf)
+{
+   struct failover_context *failover = failover_context(pipe);
+
+   assert(shader < PIPE_SHADER_TYPES);
+   assert(index == 0);
+
+   failover->sw->set_constant_buffer(failover->sw, shader, index, buf);
+   failover->hw->set_constant_buffer(failover->hw, shader, index, buf);
+}
+
 
 void
 failover_init_state_functions( struct failover_context *failover )
@@ -423,4 +453,5 @@ failover_init_state_functions( struct failover_context *failover )
    failover->pipe.set_viewport_state = failover_set_viewport_state;
    failover->pipe.set_vertex_buffer = failover_set_vertex_buffer;
    failover->pipe.set_vertex_element = failover_set_vertex_element;
+   failover->pipe.set_constant_buffer = failover_set_constant_buffer;
 }
diff --git a/src/mesa/pipe/i915simple/SConscript b/src/mesa/pipe/i915simple/SConscript
new file mode 100644
index 0000000000..f5fb96b995
--- /dev/null
+++ b/src/mesa/pipe/i915simple/SConscript
@@ -0,0 +1,29 @@
+Import('*')
+
+env = env.Clone()
+
+i915simple = env.ConvenienceLibrary(
+	target = 'i915simple',
+	source = [
+		'i915_blit.c',
+		'i915_clear.c',
+		'i915_context.c',
+		'i915_debug.c',
+		'i915_debug_fp.c',
+		'i915_flush.c',
+		'i915_fpc_emit.c',
+		'i915_fpc_translate.c',
+		'i915_prim_emit.c',
+		'i915_prim_vbuf.c',
+		'i915_state.c',
+		'i915_state_derived.c',
+		'i915_state_dynamic.c',
+		'i915_state_emit.c',
+		'i915_state_immediate.c',
+		'i915_state_sampler.c',
+		'i915_strings.c',
+		'i915_surface.c',
+		'i915_texture.c',
+	])
+
+Export('i915simple')
diff --git a/src/mesa/pipe/i915simple/i915_fpc_translate.c b/src/mesa/pipe/i915simple/i915_fpc_translate.c
index 0185512aeb..868f0c7e04 100644
--- a/src/mesa/pipe/i915simple/i915_fpc_translate.c
+++ b/src/mesa/pipe/i915simple/i915_fpc_translate.c
@@ -100,7 +100,7 @@ negate(int reg, int x, int y, int z, int w)
 static void
 i915_use_passthrough_shader(struct i915_context *i915)
 {
-   fprintf(stderr, "**** Using i915 pass-through fragment shader\n");
+   debug_printf("**** Using i915 pass-through fragment shader\n");
 
    i915->current.program = (uint *) MALLOC(sizeof(passthrough));
    if (i915->current.program) {
@@ -119,12 +119,12 @@ i915_program_error(struct i915_fp_compile *p, const char *msg, ...)
    va_list args;
    char buffer[1024];
 
-   fprintf(stderr, "i915_program_error: ");
+   debug_printf("i915_program_error: ");
    va_start( args, msg );  
    vsprintf( buffer, msg, args );
    va_end( args );
-   fprintf(stderr, buffer);
-   fprintf(stderr, "\n");
+   debug_printf(buffer);
+   debug_printf("\n");
 
    p->error = 1;
 }
@@ -169,7 +169,7 @@ src_vector(struct i915_fp_compile *p,
 
       switch (sem_name) {
       case TGSI_SEMANTIC_POSITION:
-         fprintf(stderr, "SKIP SEM POS\n");
+         debug_printf("SKIP SEM POS\n");
          /*
          assert(p->wpos_tex != -1);
          src = i915_emit_decl(p, REG_TYPE_T, p->wpos_tex, D0_CHANNEL_ALL);
@@ -913,7 +913,7 @@ i915_translate_instructions(struct i915_fp_compile *p,
             ind = parse.FullToken.FullDeclaration.u.DeclarationRange.First;
             sem = parse.FullToken.FullDeclaration.Semantic.SemanticName;
             semi = parse.FullToken.FullDeclaration.Semantic.SemanticIndex;
-            /*printf("FS Input DECL [%u] sem %u\n", ind, sem);*/
+            /*debug_printf("FS Input DECL [%u] sem %u\n", ind, sem);*/
             p->input_semantic_name[ind] = sem;
             p->input_semantic_index[ind] = semi;
          }
@@ -924,7 +924,7 @@ i915_translate_instructions(struct i915_fp_compile *p,
             ind = parse.FullToken.FullDeclaration.u.DeclarationRange.First;
             sem = parse.FullToken.FullDeclaration.Semantic.SemanticName;
             semi = parse.FullToken.FullDeclaration.Semantic.SemanticIndex;
-            /*printf("FS Output DECL [%u] sem %u\n", ind, sem);*/
+            /*debug_printf("FS Output DECL [%u] sem %u\n", ind, sem);*/
             p->output_semantic_name[ind] = sem;
             p->output_semantic_index[ind] = semi;
          }
diff --git a/src/mesa/pipe/i915simple/i915_prim_vbuf.c b/src/mesa/pipe/i915simple/i915_prim_vbuf.c
index 39154b2488..e069773fd4 100644
--- a/src/mesa/pipe/i915simple/i915_prim_vbuf.c
+++ b/src/mesa/pipe/i915simple/i915_prim_vbuf.c
@@ -38,9 +38,8 @@
  */
 
 
-#include <assert.h>
-
 #include "pipe/draw/draw_vbuf.h"
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_inlines.h"
 #include "pipe/p_winsys.h"
diff --git a/src/mesa/pipe/i915simple/i915_state_derived.c b/src/mesa/pipe/i915simple/i915_state_derived.c
index 62741e30f8..653983e4a9 100644
--- a/src/mesa/pipe/i915simple/i915_state_derived.c
+++ b/src/mesa/pipe/i915simple/i915_state_derived.c
@@ -87,7 +87,7 @@ static void calculate_vertex_layout( struct i915_context *i915 )
          }
          break;
       case TGSI_SEMANTIC_FOG:
-         fprintf(stderr, "i915 fogcoord not implemented yet\n");
+         debug_printf("i915 fogcoord not implemented yet\n");
          draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_PERSPECTIVE, src++);
          break;
       default:
diff --git a/src/mesa/pipe/i915simple/i915_state_emit.c b/src/mesa/pipe/i915simple/i915_state_emit.c
index 657f523893..3339287f49 100644
--- a/src/mesa/pipe/i915simple/i915_state_emit.c
+++ b/src/mesa/pipe/i915simple/i915_state_emit.c
@@ -107,7 +107,7 @@ i915_emit_hardware_state(struct i915_context *i915 )
                            ) * 3/2; /* plus 50% margin */
 
 #if 0
-   fprintf (stderr, "i915_emit_hardware_state: %d dwords, %d relocs\n", dwords, relocs);
+   debug_printf("i915_emit_hardware_state: %d dwords, %d relocs\n", dwords, relocs);
 #endif
    
    if(!BEGIN_BATCH(dwords, relocs)) {
diff --git a/src/mesa/pipe/i915simple/i915_state_immediate.c b/src/mesa/pipe/i915simple/i915_state_immediate.c
index 752d25f233..07031fc6c5 100644
--- a/src/mesa/pipe/i915simple/i915_state_immediate.c
+++ b/src/mesa/pipe/i915simple/i915_state_immediate.c
@@ -97,7 +97,7 @@ static void upload_S2S4(struct i915_context *i915)
       LIS2 = i915->current.vertex_info.hwfmt[1];
       LIS4 = i915->current.vertex_info.hwfmt[0];
       /*
-      printf("LIS2: 0x%x  LIS4: 0x%x\n", LIS2, LIS4);
+      debug_printf("LIS2: 0x%x  LIS4: 0x%x\n", LIS2, LIS4);
       */
       assert(LIS4); /* should never be zero? */
    }
diff --git a/src/mesa/pipe/i915simple/i915_state_sampler.c b/src/mesa/pipe/i915simple/i915_state_sampler.c
index 59408b6ba0..0dbbc5241d 100644
--- a/src/mesa/pipe/i915simple/i915_state_sampler.c
+++ b/src/mesa/pipe/i915simple/i915_state_sampler.c
@@ -169,7 +169,7 @@ translate_texture_format(enum pipe_format pipeFormat)
    case PIPE_FORMAT_S8Z24_UNORM:
       return (MAPSURF_32BIT | MT_32BIT_xL824);
    default:
-      fprintf(stderr, "i915: translate_texture_format() bad image format %x\n",
+      debug_printf("i915: translate_texture_format() bad image format %x\n",
               pipeFormat);
       assert(0);
       return 0;
diff --git a/src/mesa/pipe/i915simple/i915_texture.c b/src/mesa/pipe/i915simple/i915_texture.c
index 61944fe7d9..6faeab134a 100644
--- a/src/mesa/pipe/i915simple/i915_texture.c
+++ b/src/mesa/pipe/i915simple/i915_texture.c
@@ -477,17 +477,17 @@ i945_miptree_layout(struct pipe_context *pipe, struct i915_texture * tex)
    return TRUE;
 }
 
-void
-i915_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
+
+struct pipe_texture *
+i915_texture_create(struct pipe_context *pipe,
+                    const struct pipe_texture *templat)
 {
-   struct i915_texture *tex = REALLOC(*pt, sizeof(struct pipe_texture),
-				      sizeof(struct i915_texture));
+   struct i915_texture *tex = CALLOC_STRUCT(i915_texture);
 
    if (tex) {
       struct i915_context *i915 = i915_context(pipe);
 
-      memset(&tex->base + 1, 0,
-	     sizeof(struct i915_texture) - sizeof(struct pipe_texture));
+      tex->base = *templat;
 
       if (i915->flags.is_i945 ? i945_miptree_layout(pipe, tex) :
 	  i915_miptree_layout(pipe, tex))
@@ -498,13 +498,14 @@ i915_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
 
       if (!tex->buffer) {
 	 FREE(tex);
-	 tex = NULL;
+	 return NULL;
       }
    }
 
-   *pt = &tex->base;
+   return &tex->base;
 }
 
+
 void
 i915_texture_release(struct pipe_context *pipe, struct pipe_texture **pt)
 {
diff --git a/src/mesa/pipe/i915simple/i915_texture.h b/src/mesa/pipe/i915simple/i915_texture.h
index 84a0502e81..330d111dc7 100644
--- a/src/mesa/pipe/i915simple/i915_texture.h
+++ b/src/mesa/pipe/i915simple/i915_texture.h
@@ -6,8 +6,9 @@ struct pipe_context;
 struct pipe_texture;
 
 
-extern void
-i915_texture_create(struct pipe_context *pipe, struct pipe_texture **pt);
+struct pipe_texture *
+i915_texture_create(struct pipe_context *pipe,
+                    const struct pipe_texture *templat);
 
 extern void
 i915_texture_release(struct pipe_context *pipe, struct pipe_texture **pt);
diff --git a/src/mesa/pipe/i965simple/SConscript b/src/mesa/pipe/i965simple/SConscript
new file mode 100644
index 0000000000..74621de84c
--- /dev/null
+++ b/src/mesa/pipe/i965simple/SConscript
@@ -0,0 +1,55 @@
+Import('*')
+
+env = env.Clone()
+
+i965simple = env.ConvenienceLibrary(
+	target = 'i965simple',
+	source = [
+		'brw_blit.c',
+		'brw_cc.c',
+		'brw_clip.c',
+		'brw_clip_line.c',
+		'brw_clip_point.c',
+		'brw_clip_state.c',
+		'brw_clip_tri.c',
+		'brw_clip_util.c',
+		'brw_context.c',
+		'brw_curbe.c',
+		'brw_draw.c',
+		'brw_draw_upload.c',
+		'brw_eu.c',
+		'brw_eu_debug.c',
+		'brw_eu_emit.c',
+		'brw_eu_util.c',
+		'brw_flush.c',
+		'brw_gs.c',
+		'brw_gs_emit.c',
+		'brw_gs_state.c',
+		'brw_misc_state.c',
+		'brw_sf.c',
+		'brw_sf_emit.c',
+		'brw_sf_state.c',
+		'brw_shader_info.c',
+		'brw_state.c',
+		'brw_state_batch.c',
+		'brw_state_cache.c',
+		'brw_state_pool.c',
+		'brw_state_upload.c',
+		'brw_strings.c',
+		'brw_surface.c',
+		'brw_tex_layout.c',
+		'brw_urb.c',
+		'brw_util.c',
+		'brw_vs.c',
+		'brw_vs_emit.c',
+		'brw_vs_state.c',
+		'brw_wm.c',
+		'brw_wm_decl.c',
+		'brw_wm_glsl.c',
+		'brw_wm_iz.c',
+		'brw_wm_sampler_state.c',
+		'brw_wm_state.c',
+		'brw_wm_surface_state.c',
+	])
+
+Export('i965simple')
diff --git a/src/mesa/pipe/i965simple/brw_cc.c b/src/mesa/pipe/i965simple/brw_cc.c
index dcee731895..337e4f95f6 100644
--- a/src/mesa/pipe/i965simple/brw_cc.c
+++ b/src/mesa/pipe/i965simple/brw_cc.c
@@ -58,7 +58,7 @@ static int brw_translate_compare_func(int func)
       return BRW_COMPAREFUNCTION_ALWAYS;
    }
 
-   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, func);
+   debug_printf("Unknown value in %s: %x\n", __FUNCTION__, func);
    return BRW_COMPAREFUNCTION_ALWAYS;
 }
 
diff --git a/src/mesa/pipe/i965simple/brw_curbe.c b/src/mesa/pipe/i965simple/brw_curbe.c
index 2733eb4e75..52bbd525c1 100644
--- a/src/mesa/pipe/i965simple/brw_curbe.c
+++ b/src/mesa/pipe/i965simple/brw_curbe.c
@@ -273,10 +273,10 @@ static void upload_constant_buffer(struct brw_context *brw)
 
    if (1) {
       for (i = 0; i < sz; i+=4)
-	 _mesa_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
+	 debug_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4,
 		      buf[i+0], buf[i+1], buf[i+2], buf[i+3]);
 
-      _mesa_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
+      debug_printf("last_buf %p buf %p sz %d/%d cmp %d\n",
 		   brw->curbe.last_buf, buf,
 		   bufsz, brw->curbe.last_bufsz,
 		   brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1);
@@ -299,7 +299,7 @@ static void upload_constant_buffer(struct brw_context *brw)
 			  bufsz,
 			  1 << 6,
 			  &brw->curbe.gs_offset)) {
-	 _mesa_printf("out of GS memory for curbe\n");
+	 debug_printf("out of GS memory for curbe\n");
 	 assert(0);
 	 return;
       }
diff --git a/src/mesa/pipe/i965simple/brw_eu_debug.c b/src/mesa/pipe/i965simple/brw_eu_debug.c
index be692f6502..4a94ddefa6 100644
--- a/src/mesa/pipe/i965simple/brw_eu_debug.c
+++ b/src/mesa/pipe/i965simple/brw_eu_debug.c
@@ -30,6 +30,8 @@
   */
     
 
+#include "pipe/p_debug.h"
+
 #include "brw_eu.h"
 
 void brw_print_reg( struct brw_reg hwreg )
@@ -52,7 +54,7 @@ void brw_print_reg( struct brw_reg hwreg )
       "f"
    };
 
-   _mesa_printf("%s%s", 
+   debug_printf("%s%s", 
 		hwreg.abs ? "abs/" : "",
 		hwreg.negate ? "-" : "");
      
@@ -63,17 +65,17 @@ void brw_print_reg( struct brw_reg hwreg )
        hwreg.width == BRW_WIDTH_8 &&
        hwreg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
        hwreg.type == BRW_REGISTER_TYPE_F) {
-      _mesa_printf("vec%d", hwreg.nr);
+      debug_printf("vec%d", hwreg.nr);
    }
    else if (hwreg.file == BRW_GENERAL_REGISTER_FILE &&
 	    hwreg.vstride == BRW_VERTICAL_STRIDE_0 &&
 	    hwreg.width == BRW_WIDTH_1 &&
 	    hwreg.hstride == BRW_HORIZONTAL_STRIDE_0 &&
 	    hwreg.type == BRW_REGISTER_TYPE_F) {      
-      _mesa_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4);
+      debug_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4);
    }
    else {
-      _mesa_printf("%s%d.%d<%d;%d,%d>:%s", 
+      debug_printf("%s%d.%d<%d;%d,%d>:%s", 
 		   file[hwreg.file],
 		   hwreg.nr,
 		   hwreg.subnr / type_sz(hwreg.type),
diff --git a/src/mesa/pipe/i965simple/brw_eu_emit.c b/src/mesa/pipe/i965simple/brw_eu_emit.c
index 2423536dd1..400a80b6fb 100644
--- a/src/mesa/pipe/i965simple/brw_eu_emit.c
+++ b/src/mesa/pipe/i965simple/brw_eu_emit.c
@@ -953,7 +953,7 @@ void brw_SAMPLE(struct brw_compile *p,
    boolean need_stall = 0;
 
    if(writemask == 0) {
-/*       _mesa_printf("%s: zero writemask??\n", __FUNCTION__); */
+/*       debug_printf("%s: zero writemask??\n", __FUNCTION__); */
       return;
    }
 
@@ -985,7 +985,7 @@ void brw_SAMPLE(struct brw_compile *p,
 
       if (newmask != writemask) {
 	 need_stall = 1;
-/* 	 _mesa_printf("need stall %x %x\n", newmask , writemask); */
+/* 	 debug_printf("need stall %x %x\n", newmask , writemask); */
       }
       else {
 	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
diff --git a/src/mesa/pipe/i965simple/brw_sf.c b/src/mesa/pipe/i965simple/brw_sf.c
index b89b2e4087..7c83b81c85 100644
--- a/src/mesa/pipe/i965simple/brw_sf.c
+++ b/src/mesa/pipe/i965simple/brw_sf.c
@@ -175,7 +175,7 @@ static void upload_sf_prog( struct brw_context *brw )
 	    //int semantic = parse.FullToken.FullDeclaration.Semantic.SemanticName;
 	    //int semantic_index = parse.FullToken.FullDeclaration.Semantic.SemanticIndex;
 
-	    fprintf(stderr, "fs input %d..%d interp mode %d\n", first, last, interp_mode);
+	    debug_printf("fs input %d..%d interp mode %d\n", first, last, interp_mode);
 	    
 	    switch (interp_mode) {
 	    case TGSI_INTERPOLATE_CONSTANT:
@@ -213,9 +213,9 @@ static void upload_sf_prog( struct brw_context *brw )
    key.linear_mask |= 1;
    key.const_mask <<= 1;
 
-   fprintf(stderr, "key.persp_mask: %x\n", key.persp_mask);
-   fprintf(stderr, "key.linear_mask: %x\n", key.linear_mask);
-   fprintf(stderr, "key.const_mask: %x\n", key.const_mask);
+   debug_printf("key.persp_mask: %x\n", key.persp_mask);
+   debug_printf("key.linear_mask: %x\n", key.linear_mask);
+   debug_printf("key.const_mask: %x\n", key.const_mask);
 
 
 //   key.do_point_sprite = brw->attribs.Point->PointSprite;
diff --git a/src/mesa/pipe/i965simple/brw_sf_emit.c b/src/mesa/pipe/i965simple/brw_sf_emit.c
index 6ff5254ff7..78d6fa5e9e 100644
--- a/src/mesa/pipe/i965simple/brw_sf_emit.c
+++ b/src/mesa/pipe/i965simple/brw_sf_emit.c
@@ -137,8 +137,8 @@ static boolean calculate_masks( struct brw_sf_compile *c,
    unsigned persp_mask = c->key.persp_mask;
    unsigned linear_mask = c->key.linear_mask;
 
-   fprintf(stderr, "persp_mask: %x\n", persp_mask);
-   fprintf(stderr, "linear_mask: %x\n", linear_mask);
+   debug_printf("persp_mask: %x\n", persp_mask);
+   debug_printf("linear_mask: %x\n", linear_mask);
 
    *pc_persp = 0;
    *pc_linear = 0;
@@ -162,9 +162,9 @@ static boolean calculate_masks( struct brw_sf_compile *c,
 	 *pc_linear |= 0xf0;
    }
 
-   fprintf(stderr, "pc: %x\n", *pc);
-   fprintf(stderr, "pc_persp: %x\n", *pc_persp);
-   fprintf(stderr, "pc_linear: %x\n", *pc_linear);
+   debug_printf("pc: %x\n", *pc);
+   debug_printf("pc_persp: %x\n", *pc_persp);
+   debug_printf("pc_linear: %x\n", *pc_linear);
    
 
    return is_last_attr;
@@ -177,7 +177,7 @@ void brw_emit_tri_setup( struct brw_sf_compile *c )
    struct brw_compile *p = &c->func;
    unsigned i;
 
-   fprintf(stderr, "%s START ==============\n", __FUNCTION__);
+   debug_printf("%s START ==============\n", __FUNCTION__);
 
    c->nr_verts = 3;
    alloc_regs(c);
@@ -250,7 +250,7 @@ void brw_emit_tri_setup( struct brw_sf_compile *c )
       }
    }
 
-   fprintf(stderr, "%s DONE ==============\n", __FUNCTION__);
+   debug_printf("%s DONE ==============\n", __FUNCTION__);
 
 }
 
diff --git a/src/mesa/pipe/i965simple/brw_state.c b/src/mesa/pipe/i965simple/brw_state.c
index daf14ff4ff..95dfce88e4 100644
--- a/src/mesa/pipe/i965simple/brw_state.c
+++ b/src/mesa/pipe/i965simple/brw_state.c
@@ -225,7 +225,7 @@ static void brw_bind_vs_state(struct pipe_context *pipe, void *vs)
    brw->attribs.VertexProgram = (struct brw_vertex_program *)vs;
    brw->state.dirty.brw |= BRW_NEW_VS;
 
-   printf("YYYYYYYYYYYYY BINDING VERTEX SHADER\n");
+   debug_printf("YYYYYYYYYYYYY BINDING VERTEX SHADER\n");
 }
 
 static void brw_delete_vs_state(struct pipe_context *pipe, void *shader)
diff --git a/src/mesa/pipe/i965simple/brw_state_cache.c b/src/mesa/pipe/i965simple/brw_state_cache.c
index c5738733f4..b3a5124461 100644
--- a/src/mesa/pipe/i965simple/brw_state_cache.c
+++ b/src/mesa/pipe/i965simple/brw_state_cache.c
@@ -149,7 +149,7 @@ unsigned brw_upload_cache( struct brw_cache *cache,
    if (!brw_pool_alloc(cache->pool, data_size, 1 << 6, &offset)) {
       /* Should not be possible:
        */
-      printf("brw_pool_alloc failed\n");
+      debug_printf("brw_pool_alloc failed\n");
       exit(1);
    }
 
@@ -177,7 +177,7 @@ unsigned brw_upload_cache( struct brw_cache *cache,
    }
 
    if (BRW_DEBUG & DEBUG_STATE)
-      printf("upload %s: %d bytes to pool buffer %p offset %x\n",
+      debug_printf("upload %s: %d bytes to pool buffer %p offset %x\n",
              cache->name, 
 	     data_size,
              (void*)cache->pool->buffer,
@@ -416,7 +416,7 @@ void brw_clear_all_caches( struct brw_context *brw )
    int i;
 
    if (BRW_DEBUG & DEBUG_STATE)
-      fprintf(stderr, "%s\n", __FUNCTION__);
+      debug_printf("%s\n", __FUNCTION__);
 
    for (i = 0; i < BRW_MAX_CACHE; i++)
       clear_cache(&brw->cache[i]);
diff --git a/src/mesa/pipe/i965simple/brw_state_pool.c b/src/mesa/pipe/i965simple/brw_state_pool.c
index 7c67f0ee25..f3174bfe0a 100644
--- a/src/mesa/pipe/i965simple/brw_state_pool.c
+++ b/src/mesa/pipe/i965simple/brw_state_pool.c
@@ -58,7 +58,7 @@ boolean brw_pool_alloc( struct brw_mem_pool *pool,
    size = align(size, 4);
 
    if (pool->offset + fixup + size >= pool->size) {
-      printf("%s failed\n", __FUNCTION__);
+      debug_printf("%s failed\n", __FUNCTION__);
       assert(0);
       exit(0);
    }
@@ -74,7 +74,7 @@ static
 void brw_invalidate_pool( struct brw_mem_pool *pool )
 {
    if (BRW_DEBUG & DEBUG_STATE)
-      printf("\n\n\n %s \n\n\n", __FUNCTION__);
+      debug_printf("\n\n\n %s \n\n\n", __FUNCTION__);
 
    pool->offset = 0;
 
diff --git a/src/mesa/pipe/i965simple/brw_tex_layout.c b/src/mesa/pipe/i965simple/brw_tex_layout.c
index b8b6b579e2..405fd1f794 100644
--- a/src/mesa/pipe/i965simple/brw_tex_layout.c
+++ b/src/mesa/pipe/i965simple/brw_tex_layout.c
@@ -299,15 +299,14 @@ static boolean brw_miptree_layout(struct pipe_context *pipe, struct brw_texture
    return TRUE;
 }
 
-void
-brw_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
+
+struct pipe_texture *
+brw_texture_create(struct pipe_context *pipe, const struct pipe_texture *templat)
 {
-   struct brw_texture *tex = REALLOC(*pt, sizeof(struct pipe_texture),
-                                     sizeof(struct brw_texture));
+   struct brw_texture *tex = CALLOC_STRUCT(brw_texture);
 
    if (tex) {
-      memset(&tex->base + 1, 0,
-	     sizeof(struct brw_texture) - sizeof(struct pipe_texture));
+      tex->base = *templat;
 
       if (brw_miptree_layout(pipe, tex))
 	 tex->buffer = pipe->winsys->buffer_create(pipe->winsys, 64,
@@ -317,11 +316,11 @@ brw_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
 
       if (!tex->buffer) {
 	 FREE(tex);
-	 tex = NULL;
+         return NULL;
       }
    }
 
-   *pt = &tex->base;
+   return &tex->base;
 }
 
 void
diff --git a/src/mesa/pipe/i965simple/brw_tex_layout.h b/src/mesa/pipe/i965simple/brw_tex_layout.h
index 15e275058a..cfd6b1ef3a 100644
--- a/src/mesa/pipe/i965simple/brw_tex_layout.h
+++ b/src/mesa/pipe/i965simple/brw_tex_layout.h
@@ -6,8 +6,8 @@
 struct pipe_context;
 struct pipe_texture;
 
-extern void
-brw_texture_create(struct pipe_context *pipe, struct pipe_texture **pt);
+extern struct pipe_texture *
+brw_texture_create(struct pipe_context *pipe, const struct pipe_texture *templat);
 
 extern void
 brw_texture_release(struct pipe_context *pipe, struct pipe_texture **pt);
diff --git a/src/mesa/pipe/i965simple/brw_urb.c b/src/mesa/pipe/i965simple/brw_urb.c
index b284526aa6..101a4367b9 100644
--- a/src/mesa/pipe/i965simple/brw_urb.c
+++ b/src/mesa/pipe/i965simple/brw_urb.c
@@ -120,18 +120,18 @@ static void recalculate_urb_fence( struct brw_context *brw )
 	     * entries and the values for minimum nr of entries
 	     * provided above.
 	     */
-	    fprintf(stderr, "couldn't calculate URB layout!\n");
+	    debug_printf("couldn't calculate URB layout!\n");
 	    exit(1);
 	 }
 
 	 if (BRW_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS))
-	    printf("URB CONSTRAINED\n");
+	    debug_printf("URB CONSTRAINED\n");
       }
       else
 	 brw->urb.constrained = 0;
 
       if (BRW_DEBUG & DEBUG_URB)
-	 printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
+	 debug_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
 		      brw->urb.vs_start,
 		      brw->urb.gs_start,
 		      brw->urb.clip_start,
diff --git a/src/mesa/pipe/i965simple/brw_vs_emit.c b/src/mesa/pipe/i965simple/brw_vs_emit.c
index b32c233dd2..98915ba101 100644
--- a/src/mesa/pipe/i965simple/brw_vs_emit.c
+++ b/src/mesa/pipe/i965simple/brw_vs_emit.c
@@ -1228,7 +1228,7 @@ static void process_instruction(struct brw_vs_compile *c,
    case TGSI_OPCODE_ENDSUB:
       break;
    default:
-      printf("Unsupport opcode %d in vertex shader\n", inst->Instruction.Opcode);
+      debug_printf("Unsupport opcode %d in vertex shader\n", inst->Instruction.Opcode);
       break;
    }
 
diff --git a/src/mesa/pipe/i965simple/brw_wm.c b/src/mesa/pipe/i965simple/brw_wm.c
index 0ee0fbed51..539b170744 100644
--- a/src/mesa/pipe/i965simple/brw_wm.c
+++ b/src/mesa/pipe/i965simple/brw_wm.c
@@ -57,7 +57,7 @@ static void do_wm_prog( struct brw_context *brw,
    c->pixel_w = brw_null_reg();
 
 
-   fprintf(stderr, "XXXXXXXX FP\n");
+   debug_printf("XXXXXXXX FP\n");
    
    brw_wm_glsl_emit(c);
 
diff --git a/src/mesa/pipe/i965simple/brw_wm_glsl.c b/src/mesa/pipe/i965simple/brw_wm_glsl.c
index f4b5c13c06..d95645d108 100644
--- a/src/mesa/pipe/i965simple/brw_wm_glsl.c
+++ b/src/mesa/pipe/i965simple/brw_wm_glsl.c
@@ -982,7 +982,7 @@ static void brw_wm_emit_instruction( struct brw_wm_compile *c,
       break;
 
    default:
-      _mesa_printf("unsupported IR in fragment shader %d\n",
+      debug_printf("unsupported IR in fragment shader %d\n",
 		   inst->Instruction.Opcode);
    }
 #if 0
diff --git a/src/mesa/pipe/i965simple/brw_wm_sampler_state.c b/src/mesa/pipe/i965simple/brw_wm_sampler_state.c
index cfb430eb09..de42ffc5b1 100644
--- a/src/mesa/pipe/i965simple/brw_wm_sampler_state.c
+++ b/src/mesa/pipe/i965simple/brw_wm_sampler_state.c
@@ -71,7 +71,7 @@ static int intel_translate_shadow_compare_func(unsigned func)
        return COMPAREFUNC_NEVER;
    }
 
-   fprintf(stderr, "Unknown value in %s: %x\n", __FUNCTION__, func);
+   debug_printf("Unknown value in %s: %x\n", __FUNCTION__, func);
    return COMPAREFUNC_NEVER;
 }
 
diff --git a/src/mesa/pipe/p_compiler.h b/src/mesa/pipe/p_compiler.h
index e939d9cd9b..30cd729c56 100644
--- a/src/mesa/pipe/p_compiler.h
+++ b/src/mesa/pipe/p_compiler.h
@@ -28,10 +28,9 @@
 #ifndef P_COMPILER_H
 #define P_COMPILER_H
 
-#include <assert.h>
+
 #include <stdlib.h>
 #include <string.h>
-#include <stdio.h>
 
 
 #if defined(_WIN32) && !defined(__WIN32__)
diff --git a/src/mesa/pipe/p_context.h b/src/mesa/pipe/p_context.h
index 0dda06c53b..92a1cd70c4 100644
--- a/src/mesa/pipe/p_context.h
+++ b/src/mesa/pipe/p_context.h
@@ -199,8 +199,8 @@ struct pipe_context {
    /*
     * Texture functions
     */
-   void (*texture_create)(struct pipe_context *pipe,
-			  struct pipe_texture **pt);
+   struct pipe_texture * (*texture_create)(struct pipe_context *pipe,
+                                           const struct pipe_texture *templat);
 
    void (*texture_release)(struct pipe_context *pipe,
 			   struct pipe_texture **pt);
diff --git a/src/mesa/pipe/p_debug.h b/src/mesa/pipe/p_debug.h
new file mode 100644
index 0000000000..2a11627b36
--- /dev/null
+++ b/src/mesa/pipe/p_debug.h
@@ -0,0 +1,86 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * @file
+ * Cross-platform debugging helpers.
+ * 
+ * For now it just has assert and printf replacements, but it might be extended 
+ * with stack trace reports and more advanced logging in the near future. 
+ * 
+ * @author Jose Fonseca <jrfonseca@tungstengraphics.com>
+ */
+
+#ifndef P_DEBUG_H_
+#define P_DEBUG_H_
+
+
+#include <stdarg.h>
+
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+
+#ifdef DBG
+#ifndef DEBUG
+#define DEBUG 1
+#endif
+#else
+#ifndef NDEBUG
+#define NDEBUG 1
+#endif
+#endif
+
+
+void debug_printf(const char *format, ...);
+
+void debug_vprintf(const char *format, va_list ap);
+
+void debug_assert_fail(const char *expr, const char *file, unsigned line);
+
+
+/** Assert macro */
+#ifdef DEBUG
+#define debug_assert(expr) ((expr) ? (void)0 : debug_assert_fail(#expr, __FILE__, __LINE__))
+#else
+#define debug_assert(expr) ((void)0)
+#endif
+
+
+#ifdef assert
+#undef assert
+#endif
+#define assert(expr) debug_assert(expr)
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* P_DEBUG_H_ */
diff --git a/src/mesa/pipe/p_defines.h b/src/mesa/pipe/p_defines.h
index 85adf2d61d..0bf53ecb79 100644
--- a/src/mesa/pipe/p_defines.h
+++ b/src/mesa/pipe/p_defines.h
@@ -265,6 +265,6 @@ enum pipe_texture_target {
 #define PIPE_CAP_MAX_POINT_WIDTH_AA      17
 #define PIPE_CAP_MAX_TEXTURE_ANISOTROPY  18
 #define PIPE_CAP_MAX_TEXTURE_LOD_BIAS    19
-
+#define PIPE_CAP_BITMAP_TEXCOORD_BIAS    20
 
 #endif
diff --git a/src/mesa/pipe/p_format.h b/src/mesa/pipe/p_format.h
index 9f60cdbb04..c9ad324315 100644
--- a/src/mesa/pipe/p_format.h
+++ b/src/mesa/pipe/p_format.h
@@ -28,7 +28,10 @@
 #ifndef PIPE_FORMAT_H
 #define PIPE_FORMAT_H
 
+#include <stdio.h> // for sprintf
+
 #include "p_compiler.h"
+#include "p_debug.h"
 
 /**
  * The PIPE_FORMAT is a 32-bit wide bitfield that encodes all the information
diff --git a/src/mesa/pipe/p_shader_tokens.h b/src/mesa/pipe/p_shader_tokens.h
index e9d1d66bda..3ce35310f6 100644
--- a/src/mesa/pipe/p_shader_tokens.h
+++ b/src/mesa/pipe/p_shader_tokens.h
@@ -626,7 +626,7 @@ struct tgsi_src_register_ext
 
 /*
  * If tgsi_src_register_ext::Type is TGSI_SRC_REGISTER_EXT_TYPE_SWZ,
- * it should be cast to tgsi_src_register_ext_extswz.
+ * it should be cast to tgsi_src_register_ext_swz.
  * 
  * If tgsi_src_register_ext::Type is TGSI_SRC_REGISTER_EXT_TYPE_MOD,
  * it should be cast to tgsi_src_register_ext_mod.
diff --git a/src/mesa/pipe/p_util.h b/src/mesa/pipe/p_util.h
index 059528787d..469920efee 100644
--- a/src/mesa/pipe/p_util.h
+++ b/src/mesa/pipe/p_util.h
@@ -29,6 +29,7 @@
 #define P_UTIL_H
 
 #include "p_compiler.h"
+#include "p_debug.h"
 #include <math.h>
 
 
@@ -183,6 +184,20 @@ align_free(void *ptr)
 
 
 
+/**
+ * Duplicate of a block of memory
+ */
+static INLINE void *
+mem_dup(const void *src, uint size)
+{
+   void *dup = malloc(size);
+   if (dup)
+      memcpy(dup, src, size);
+   return dup;
+}
+
+
+
 #define CLAMP( X, MIN, MAX )  ( (X)<(MIN) ? (MIN) : ((X)>(MAX) ? (MAX) : (X)) )
 #define MIN2( A, B )   ( (A)<(B) ? (A) : (B) )
 #define MAX2( A, B )   ( (A)>(B) ? (A) : (B) )
@@ -381,10 +396,6 @@ static INLINE int align(int value, int alignment)
    return (value + alignment - 1) & ~(alignment - 1);
 }
 
-/* Convenient...
- */
-extern void _mesa_printf(const char *str, ...);
-
 
 /* util/p_util.c
  */
diff --git a/src/mesa/pipe/pipebuffer/pb_buffer.h b/src/mesa/pipe/pipebuffer/pb_buffer.h
index 17551b3b50..97beb5f72a 100644
--- a/src/mesa/pipe/pipebuffer/pb_buffer.h
+++ b/src/mesa/pipe/pipebuffer/pb_buffer.h
@@ -44,10 +44,8 @@
 #define PB_BUFFER_H_
 
 
-#include <assert.h>
-#include <stdlib.h>
-
 #include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
 #include "pipe/p_state.h"
 #include "pipe/p_inlines.h"
 
diff --git a/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c b/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c
index 349647fe6e..f4fc3f6d71 100644
--- a/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c
+++ b/src/mesa/pipe/pipebuffer/pb_buffer_fenced.c
@@ -34,12 +34,10 @@
  */
 
 
-#include <assert.h>
-#include <stdlib.h>
-
 #include "linked_list.h"
 
 #include "p_compiler.h"
+#include "p_debug.h"
 #include "p_winsys.h"
 #include "p_thread.h"
 #include "p_util.h"
@@ -145,7 +143,7 @@ _fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list,
       /* Do the delayed destroy:
        */
       pb_reference(&fenced_buf->buffer, NULL);
-      free(fenced_buf);
+      FREE(fenced_buf);
    }
 }
 
@@ -162,7 +160,7 @@ fenced_buffer_destroy(struct pb_buffer *buf)
    }
    else {
       pb_reference(&fenced_buf->buffer, NULL);
-      free(fenced_buf);
+      FREE(fenced_buf);
    }
    
    if ((fenced_list->numDelayed % fenced_list->checkDelayed) == 0)
diff --git a/src/mesa/pipe/pipebuffer/pb_buffer_fenced.h b/src/mesa/pipe/pipebuffer/pb_buffer_fenced.h
index 09082a5390..c40b9c75e1 100644
--- a/src/mesa/pipe/pipebuffer/pb_buffer_fenced.h
+++ b/src/mesa/pipe/pipebuffer/pb_buffer_fenced.h
@@ -51,7 +51,7 @@
 #define PB_BUFFER_FENCED_H_
 
 
-#include <assert.h>
+#include "pipe/p_debug.h"
 
 
 struct pipe_winsys;
diff --git a/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c b/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c
index fc83a00f36..c1b7759874 100644
--- a/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c
+++ b/src/mesa/pipe/pipebuffer/pb_buffer_malloc.c
@@ -34,9 +34,7 @@
  */
 
 
-#include <assert.h>
-#include <stdlib.h>
-
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pb_buffer.h"
 
@@ -107,10 +105,9 @@ pb_malloc_buffer_create(size_t size,
 {
    struct malloc_buffer *buf;
    
-   /* TODO: accept an alignment parameter */
    /* TODO: do a single allocation */
    
-   buf = (struct malloc_buffer *)MALLOC(sizeof(struct malloc_buffer));
+   buf = CALLOC_STRUCT(malloc_buffer);
    if(!buf)
       return NULL;
    
diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_fenced.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_fenced.c
index 3b341c64c2..c535d3276c 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_fenced.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_fenced.c
@@ -34,9 +34,7 @@
  */
 
 
-#include <assert.h>
-#include <stdlib.h>
-
+#include "p_debug.h"
 #include "p_util.h"
 
 #include "pb_buffer.h"
diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
index 2694f57bca..8b1b51c0e2 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_mm.c
@@ -34,11 +34,10 @@
  */
 
 
-#include <assert.h>
-
 #include "linked_list.h"
 
 #include "p_defines.h"
+#include "p_debug.h"
 #include "p_thread.h"
 #include "p_util.h"
 #include "pb_buffer.h"
@@ -69,28 +68,28 @@ struct mem_block
 static void
 mmDumpMemInfo(const struct mem_block *heap)
 {
-   fprintf(stderr, "Memory heap %p:\n", (void *)heap);
+   debug_printf("Memory heap %p:\n", (void *)heap);
    if (heap == 0) {
-      fprintf(stderr, "  heap == 0\n");
+      debug_printf("  heap == 0\n");
    } else {
       const struct mem_block *p;
 
       for(p = heap->next; p != heap; p = p->next) {
-	 fprintf(stderr, "  Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
+	 debug_printf("  Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
 		 p->free ? 'F':'.',
 		 p->reserved ? 'R':'.');
       }
 
-      fprintf(stderr, "\nFree list:\n");
+      debug_printf("\nFree list:\n");
 
       for(p = heap->next_free; p != heap; p = p->next_free) {
-	 fprintf(stderr, " FREE Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
+	 debug_printf(" FREE Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
 		 p->free ? 'F':'.',
 		 p->reserved ? 'R':'.');
       }
 
    }
-   fprintf(stderr, "End of memory blocks\n");
+   debug_printf("End of memory blocks\n");
 }
 #endif
 
@@ -308,11 +307,11 @@ mmFreeMem(struct mem_block *b)
       return 0;
 
    if (b->free) {
-      fprintf(stderr, "block already free\n");
+      debug_printf("block already free\n");
       return -1;
    }
    if (b->reserved) {
-      fprintf(stderr, "block is reserved\n");
+      debug_printf("block is reserved\n");
       return -1;
    }
 
@@ -367,7 +366,7 @@ struct mm_pb_manager
 };
 
 
-static inline struct mm_pb_manager *
+static INLINE struct mm_pb_manager *
 mm_pb_manager(struct pb_manager *mgr)
 {
    assert(mgr);
@@ -385,7 +384,7 @@ struct mm_buffer
 };
 
 
-static inline struct mm_buffer *
+static INLINE struct mm_buffer *
 mm_buffer(struct pb_buffer *buf)
 {
    assert(buf);
@@ -399,6 +398,8 @@ mm_buffer_destroy(struct pb_buffer *buf)
    struct mm_buffer *mm_buf = mm_buffer(buf);
    struct mm_pb_manager *mm = mm_buf->mgr;
    
+   assert(buf->base.refcount == 0);
+   
    _glthread_LOCK_MUTEX(mm->mutex);
    mmFreeMem(mm_buf->block);
    FREE(buf);
@@ -477,7 +478,7 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
    
    mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
    if(!mm_buf->block) {
-      fprintf(stderr, "warning: heap full\n");
+      debug_printf("warning: heap full\n");
 #if 0
       mmDumpMemInfo(mm->heap);
 #endif
diff --git a/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c b/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
index 7c29954112..04477a865a 100644
--- a/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
+++ b/src/mesa/pipe/pipebuffer/pb_bufmgr_pool.c
@@ -35,12 +35,10 @@
  */
 
 
-#include <assert.h>
-#include <stdlib.h>
-
 #include "linked_list.h"
 
 #include "p_compiler.h"
+#include "p_debug.h"
 #include "p_thread.h"
 #include "p_defines.h"
 #include "p_util.h"
@@ -172,13 +170,13 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr,
    struct list_head *item;
 
    assert(size == pool->bufSize);
-   assert(desc->alignment % pool->bufAlign == 0);
+   assert(pool->bufAlign % desc->alignment == 0);
    
    _glthread_LOCK_MUTEX(pool->mutex);
 
    if (pool->numFree == 0) {
       _glthread_UNLOCK_MUTEX(pool->mutex);
-      fprintf(stderr, "warning: out of fixed size buffer objects\n");
+      debug_printf("warning: out of fixed size buffer objects\n");
       return NULL;
    }
 
@@ -186,7 +184,7 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr,
 
    if (item == &pool->free) {
       _glthread_UNLOCK_MUTEX(pool->mutex);
-      fprintf(stderr, "error: fixed size buffer pool corruption\n");
+      debug_printf("error: fixed size buffer pool corruption\n");
       return NULL;
    }
 
@@ -258,7 +256,7 @@ pool_bufmgr_create(struct pb_manager *provider,
    if(!pool->map)
       goto failure;
 
-   pool->bufs = (struct pool_buffer *) MALLOC(numBufs * sizeof(*pool->bufs));
+   pool->bufs = (struct pool_buffer *)CALLOC(numBufs, sizeof(*pool->bufs));
    if (!pool->bufs)
       goto failure;
 
diff --git a/src/mesa/pipe/softpipe/SConscript b/src/mesa/pipe/softpipe/SConscript
new file mode 100644
index 0000000000..d581ee8d3c
--- /dev/null
+++ b/src/mesa/pipe/softpipe/SConscript
@@ -0,0 +1,42 @@
+Import('*')
+
+env = env.Clone()
+
+softpipe = env.ConvenienceLibrary(
+	target = 'softpipe',
+	source = [
+		'sp_clear.c',
+		'sp_context.c',
+		'sp_draw_arrays.c',
+		'sp_flush.c',
+		'sp_prim_setup.c',
+		'sp_prim_vbuf.c',
+		'sp_quad_alpha_test.c',
+		'sp_quad_blend.c',
+		'sp_quad_bufloop.c',
+		'sp_quad.c',
+		'sp_quad_colormask.c',
+		'sp_quad_coverage.c',
+		'sp_quad_depth_test.c',
+		'sp_quad_earlyz.c',
+		'sp_quad_fs.c',
+		'sp_quad_occlusion.c',
+		'sp_quad_output.c',
+		'sp_quad_stencil.c',
+		'sp_quad_stipple.c',
+		'sp_query.c',
+		'sp_state_blend.c',
+		'sp_state_clip.c',
+		'sp_state_derived.c',
+		'sp_state_fs.c',
+		'sp_state_rasterizer.c',
+		'sp_state_sampler.c',
+		'sp_state_surface.c',
+		'sp_state_vertex.c',
+		'sp_surface.c',
+		'sp_tex_sample.c',
+		'sp_texture.c',
+		'sp_tile_cache.c',
+	])
+
+Export('softpipe')
+\ No newline at end of file
diff --git a/src/mesa/pipe/softpipe/sp_clear.c b/src/mesa/pipe/softpipe/sp_clear.c
index 571f64b38d..8d295a30ca 100644
--- a/src/mesa/pipe/softpipe/sp_clear.c
+++ b/src/mesa/pipe/softpipe/sp_clear.c
@@ -55,7 +55,9 @@ softpipe_clear(struct pipe_context *pipe, struct pipe_surface *ps,
 
    if (ps == sp_tile_cache_get_surface(softpipe->zsbuf_cache)) {
       sp_tile_cache_clear(softpipe->zsbuf_cache, clearValue);
+#if TILE_CLEAR_OPTIMIZATION
       return;
+#endif
    }
 
    for (i = 0; i < softpipe->framebuffer.num_cbufs; i++) {
diff --git a/src/mesa/pipe/softpipe/sp_prim_setup.c b/src/mesa/pipe/softpipe/sp_prim_setup.c
index b17801d13d..7478b2336b 100644
--- a/src/mesa/pipe/softpipe/sp_prim_setup.c
+++ b/src/mesa/pipe/softpipe/sp_prim_setup.c
@@ -251,9 +251,9 @@ static void print_vertex(const struct setup_stage *setup,
                          const struct vertex_header *v)
 {
    int i;
-   fprintf(stderr, "Vertex: (%p)\n", v);
+   debug_printf("Vertex: (%p)\n", v);
    for (i = 0; i < setup->quad.nr_attrs; i++) {
-      fprintf(stderr, "  %d: %f %f %f %f\n",  i, 
+      debug_printf("  %d: %f %f %f %f\n",  i, 
               v->data[i][0], v->data[i][1], v->data[i][2], v->data[i][3]);
    }
 }
@@ -267,7 +267,7 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
    const struct vertex_header *v2 = prim->v[2];
 
 #if DEBUG_VERTS
-   fprintf(stderr, "Triangle:\n");
+   debug_printf("Triangle:\n");
    print_vertex(setup, v0);
    print_vertex(setup, v1);
    print_vertex(setup, v2);
@@ -345,7 +345,7 @@ static boolean setup_sort_vertices( struct setup_stage *setup,
 
       setup->oneoverarea = 1.0f / area;
       /*
-      _mesa_printf("%s one-over-area %f  area %f  det %f\n",
+      debug_printf("%s one-over-area %f  area %f  det %f\n",
                    __FUNCTION__, setup->oneoverarea, area, prim->det );
       */
    }
@@ -419,7 +419,7 @@ static void tri_linear_coeff( struct setup_stage *setup,
                    dady * (setup->vmin->data[0][1] - 0.5f)));
 
    /*
-   _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
+   debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
 		slot, "xyzw"[i], 
 		setup->coef[slot].a0[i],
 		setup->coef[slot].dadx[i],
@@ -453,10 +453,10 @@ static void tri_persp_coeff( struct setup_stage *setup,
    float dady = b * setup->oneoverarea;
       
    /*
-   printf("tri persp %d,%d: %f %f %f\n", vertSlot, i,
-          setup->vmin->data[vertSlot][i],
-          setup->vmid->data[vertSlot][i],
-          setup->vmax->data[vertSlot][i]
+   debug_printf("tri persp %d,%d: %f %f %f\n", vertSlot, i,
+          	setup->vmin->data[vertSlot][i],
+          	setup->vmid->data[vertSlot][i],
+       		setup->vmax->data[vertSlot][i]
           );
    */
    assert(i <= 3);
@@ -619,7 +619,7 @@ static void subtriangle( struct setup_stage *setup,
    finish_y -= sy;
 
    /*
-   _mesa_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);  
+   debug_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);  
    */
 
    for (y = start_y; y < finish_y; y++) {
@@ -671,7 +671,7 @@ static void setup_tri( struct draw_stage *stage,
    struct setup_stage *setup = setup_stage( stage );
 
    /*
-   _mesa_printf("%s\n", __FUNCTION__ );
+   debug_printf("%s\n", __FUNCTION__ );
    */
 
    setup_sort_vertices( setup, prim );
@@ -1124,7 +1124,7 @@ setup_point(struct draw_stage *stage, struct prim_header *prim)
          int ix, iy;
 
          /*
-         printf("(%f, %f) -> X:%d..%d Y:%d..%d\n", x, y, xmin, xmax,ymin,ymax);
+         debug_printf("(%f, %f) -> X:%d..%d Y:%d..%d\n", x, y, xmin, xmax,ymin,ymax);
          */
          for (iy = iymin; iy <= iymax; iy += 2) {
             uint rowMask = 0xf;
diff --git a/src/mesa/pipe/softpipe/sp_quad_fs.c b/src/mesa/pipe/softpipe/sp_quad_fs.c
index c9cc8afa0c..b5d7dfca1c 100644
--- a/src/mesa/pipe/softpipe/sp_quad_fs.c
+++ b/src/mesa/pipe/softpipe/sp_quad_fs.c
@@ -168,6 +168,11 @@ shade_quad(
              sizeof( quad->outputs.color ) );
    }
 
+   /*
+    * XXX the following code for updating quad->outputs.depth
+    * isn't really needed if we did early z testing.
+    */
+
    /* store result Z */
    if (qss->depthOutSlot >= 0) {
       /* output[slot] is new Z */
@@ -181,6 +186,10 @@ shade_quad(
       uint i;
       for (i = 0; i < 4; i++) {
          quad->outputs.depth[i] = machine->Inputs[0].xyzw[2].f[i];
+         /* XXX not sure the above line is always correct.  The following
+          * might be better:
+         quad->outputs.depth[i] = machine->QuadPos.xyzw[2].f[i];
+          */
       }
    }
 
@@ -214,13 +223,13 @@ shade_quad_llvm(struct quad_stage *qs,
    inputs[2][0][1] = fy + 1.0f;
    inputs[3][0][1] = fy + 1.0f;
 #if DLLVM
-   printf("MASK = %d\n", quad->mask);
+   debug_printf("MASK = %d\n", quad->mask);
 #endif
    gallivm_prog_inputs_interpolate(llvm, inputs, quad->coef);
 #if DLLVM
    for (int i = 0; i < 4; ++i) {
       for (int j = 0; j < 2; ++j) {
-         printf("IN(%d,%d) [%f %f %f %f]\n", i, j, 
+         debug_printf("IN(%d,%d) [%f %f %f %f]\n", i, j, 
                 inputs[i][j][0], inputs[i][j][1], inputs[i][j][2], inputs[i][j][3]);
       }
    }
@@ -231,7 +240,7 @@ shade_quad_llvm(struct quad_stage *qs,
                                    softpipe->mapped_constants[PIPE_SHADER_FRAGMENT],
                                    qss->samplers);
 #if DLLVM
-   printf("OUT LLVM = 1[%f %f %f %f], 2[%f %f %f %f]\n",
+   debug_printf("OUT LLVM = 1[%f %f %f %f], 2[%f %f %f %f]\n",
           dests[0][0][0], dests[0][0][1], dests[0][0][2], dests[0][0][3], 
           dests[0][1][0], dests[0][1][1], dests[0][1][2], dests[0][1][3]);
 #endif
@@ -251,7 +260,7 @@ shade_quad_llvm(struct quad_stage *qs,
    }
 #if DLLVM
    for (int i = 0; i < QUAD_SIZE; ++i) {
-      printf("QLLVM%d(%d) [%f, %f, %f, %f]\n", i, qss->colorOutSlot,
+      debug_printf("QLLVM%d(%d) [%f, %f, %f, %f]\n", i, qss->colorOutSlot,
              quad->outputs.color[0][i],
              quad->outputs.color[1][i],
              quad->outputs.color[2][i],
@@ -275,7 +284,7 @@ shade_quad_llvm(struct quad_stage *qs,
       }
    }
 #if DLLVM
-   printf("D [%f, %f, %f, %f] mask = %d\n",
+   debug_printf("D [%f, %f, %f, %f] mask = %d\n",
              quad->outputs.depth[0],
              quad->outputs.depth[1],
              quad->outputs.depth[2],
diff --git a/src/mesa/pipe/softpipe/sp_state_blend.c b/src/mesa/pipe/softpipe/sp_state_blend.c
index 160ca5cbc0..2d40d6bd8f 100644
--- a/src/mesa/pipe/softpipe/sp_state_blend.c
+++ b/src/mesa/pipe/softpipe/sp_state_blend.c
@@ -32,13 +32,12 @@
 #include "sp_context.h"
 #include "sp_state.h"
 
+
 void *
 softpipe_create_blend_state(struct pipe_context *pipe,
                             const struct pipe_blend_state *blend)
 {
-   struct pipe_blend_state *state = MALLOC( sizeof(struct pipe_blend_state) );
-   memcpy(state, blend, sizeof(struct pipe_blend_state));
-   return state;
+   return mem_dup(blend, sizeof(*blend));
 }
 
 void softpipe_bind_blend_state( struct pipe_context *pipe,
@@ -78,10 +77,7 @@ void *
 softpipe_create_depth_stencil_state(struct pipe_context *pipe,
 				    const struct pipe_depth_stencil_alpha_state *depth_stencil)
 {
-   struct pipe_depth_stencil_alpha_state *state =
-      MALLOC( sizeof(struct pipe_depth_stencil_alpha_state) );
-   memcpy(state, depth_stencil, sizeof(struct pipe_depth_stencil_alpha_state));
-   return state;
+   return mem_dup(depth_stencil, sizeof(*depth_stencil));
 }
 
 void
diff --git a/src/mesa/pipe/softpipe/sp_state_rasterizer.c b/src/mesa/pipe/softpipe/sp_state_rasterizer.c
index ce8fa4f2b8..53755099dd 100644
--- a/src/mesa/pipe/softpipe/sp_state_rasterizer.c
+++ b/src/mesa/pipe/softpipe/sp_state_rasterizer.c
@@ -35,12 +35,9 @@
 
 void *
 softpipe_create_rasterizer_state(struct pipe_context *pipe,
-                                 const struct pipe_rasterizer_state *setup)
+                                 const struct pipe_rasterizer_state *rast)
 {
-   struct pipe_rasterizer_state *state =
-      MALLOC( sizeof(struct pipe_rasterizer_state) );
-   memcpy(state, setup, sizeof(struct pipe_rasterizer_state));
-   return state;
+   return mem_dup(rast, sizeof(*rast));
 }
 
 void softpipe_bind_rasterizer_state(struct pipe_context *pipe,
diff --git a/src/mesa/pipe/softpipe/sp_state_sampler.c b/src/mesa/pipe/softpipe/sp_state_sampler.c
index 3842e71503..291bbc40ad 100644
--- a/src/mesa/pipe/softpipe/sp_state_sampler.c
+++ b/src/mesa/pipe/softpipe/sp_state_sampler.c
@@ -40,9 +40,7 @@ void *
 softpipe_create_sampler_state(struct pipe_context *pipe,
                               const struct pipe_sampler_state *sampler)
 {
-   struct pipe_sampler_state *state = MALLOC( sizeof(struct pipe_sampler_state) );
-   memcpy(state, sampler, sizeof(struct pipe_sampler_state));
-   return state;
+   return mem_dup(sampler, sizeof(*sampler));
 }
 
 void
@@ -51,6 +49,8 @@ softpipe_bind_sampler_state(struct pipe_context *pipe,
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
+   draw_flush(softpipe->draw);
+
    assert(unit < PIPE_MAX_SAMPLERS);
    softpipe->sampler[unit] = (struct pipe_sampler_state *)sampler;
 
diff --git a/src/mesa/pipe/softpipe/sp_texture.c b/src/mesa/pipe/softpipe/sp_texture.c
index 172234843d..fd2cc3dbbb 100644
--- a/src/mesa/pipe/softpipe/sp_texture.c
+++ b/src/mesa/pipe/softpipe/sp_texture.c
@@ -79,31 +79,30 @@ softpipe_texture_layout(struct softpipe_texture * spt)
 }
 
 
-void
-softpipe_texture_create(struct pipe_context *pipe, struct pipe_texture **pt)
+struct pipe_texture *
+softpipe_texture_create(struct pipe_context *pipe,
+                        const struct pipe_texture *templat)
 {
-   struct softpipe_texture *spt = REALLOC(*pt, sizeof(struct pipe_texture),
-					  sizeof(struct softpipe_texture));
-
-   if (spt) {
-      memset(&spt->base + 1, 0,
-	     sizeof(struct softpipe_texture) - sizeof(struct pipe_texture));
+   struct softpipe_texture *spt = CALLOC_STRUCT(softpipe_texture);
+   if (!spt)
+      return NULL;
 
-      softpipe_texture_layout(spt);
+   spt->base = *templat;
 
-      spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32,
-                                                PIPE_BUFFER_USAGE_PIXEL,
-                                                spt->buffer_size);
+   softpipe_texture_layout(spt);
 
-      if (!spt->buffer) {
-	 FREE(spt);
-	 spt = NULL;
-      }
+   spt->buffer = pipe->winsys->buffer_create(pipe->winsys, 32,
+                                             PIPE_BUFFER_USAGE_PIXEL,
+                                             spt->buffer_size);
+   if (!spt->buffer) {
+      FREE(spt);
+      return NULL;
    }
 
-   *pt = &spt->base;
+   return &spt->base;
 }
 
+
 void
 softpipe_texture_release(struct pipe_context *pipe, struct pipe_texture **pt)
 {
diff --git a/src/mesa/pipe/softpipe/sp_texture.h b/src/mesa/pipe/softpipe/sp_texture.h
index c6cf370351..fa646c0de9 100644
--- a/src/mesa/pipe/softpipe/sp_texture.h
+++ b/src/mesa/pipe/softpipe/sp_texture.h
@@ -55,8 +55,9 @@ softpipe_texture(struct pipe_texture *pt)
 
 
 
-extern void
-softpipe_texture_create(struct pipe_context *pipe, struct pipe_texture **pt);
+extern struct pipe_texture *
+softpipe_texture_create(struct pipe_context *pipe,
+                        const struct pipe_texture *templat);
 
 extern void
 softpipe_texture_release(struct pipe_context *pipe, struct pipe_texture **pt);
diff --git a/src/mesa/pipe/softpipe/sp_tile_cache.c b/src/mesa/pipe/softpipe/sp_tile_cache.c
index 451e157abf..1597361b82 100644
--- a/src/mesa/pipe/softpipe/sp_tile_cache.c
+++ b/src/mesa/pipe/softpipe/sp_tile_cache.c
@@ -341,7 +341,7 @@ sp_tile_cache_flush_clear(struct pipe_context *pipe,
       }
    }
 #if 0
-   printf("num cleared: %u\n", numCleared);
+   debug_printf("num cleared: %u\n", numCleared);
 #endif
 }
 
@@ -384,7 +384,7 @@ sp_flush_tile_cache(struct softpipe_context *softpipe,
 #endif
 
 #if 0
-   printf("flushed tiles in use: %d\n", inuse);
+   debug_printf("flushed tiles in use: %d\n", inuse);
 #endif
 }
 
@@ -415,8 +415,8 @@ sp_get_cached_tile(struct softpipe_context *softpipe,
          /* put dirty tile back in framebuffer */
          if (tc->depth_stencil) {
             pipe_put_tile_raw(pipe, ps,
-                           tile->x, tile->y, TILE_SIZE, TILE_SIZE,
-                           tile->data.depth32, 0/*STRIDE*/);
+                              tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                              tile->data.depth32, 0/*STRIDE*/);
          }
          else {
             pipe_put_tile_rgba(pipe, ps,
@@ -441,9 +441,9 @@ sp_get_cached_tile(struct softpipe_context *softpipe,
       else {
          /* get new tile data from surface */
          if (tc->depth_stencil) {
-            pipe_put_tile_raw(pipe, ps,
-                           tile->x, tile->y, TILE_SIZE, TILE_SIZE,
-                           tile->data.depth32, 0/*STRIDE*/);
+            pipe_get_tile_raw(pipe, ps,
+                              tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                              tile->data.depth32, 0/*STRIDE*/);
          }
          else {
             pipe_get_tile_rgba(pipe, ps,
diff --git a/src/mesa/pipe/tgsi/exec/tgsi_exec.c b/src/mesa/pipe/tgsi/exec/tgsi_exec.c
index dcc39362a9..336ae1c8b6 100644
--- a/src/mesa/pipe/tgsi/exec/tgsi_exec.c
+++ b/src/mesa/pipe/tgsi/exec/tgsi_exec.c
@@ -143,7 +143,7 @@ tgsi_exec_prepare( struct tgsi_exec_machine *mach )
 
    k = tgsi_parse_init( &parse, mach->Tokens );
    if (k != TGSI_PARSE_OK) {
-      fprintf(stderr, "Problem parsing!\n");
+      debug_printf("Problem parsing!\n");
       return;
    }
 
@@ -249,7 +249,7 @@ tgsi_exec_machine_init(
 
    k = tgsi_parse_init (&parse, mach->Tokens);
    if (k != TGSI_PARSE_OK) {
-      fprintf( stderr, "Problem parsing!\n" );
+      debug_printf( "Problem parsing!\n" );
       return;
    }
 
@@ -1236,7 +1236,7 @@ exec_tex(struct tgsi_exec_machine *mach,
    uint chan_index;
    float lodBias;
 
-   /*   printf("Sampler %u unit %u\n", sampler, unit); */
+   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
 
    switch (inst->InstructionExtTexture.Texture) {
    case TGSI_TEXTURE_1D:
@@ -2010,7 +2010,7 @@ exec_instruction(
 
    case TGSI_OPCODE_TXB:
       /* Texture lookup with lod bias */
-      /* src[0] = texcoord (src[0].w = load bias) */
+      /* src[0] = texcoord (src[0].w = LOD bias) */
       /* src[1] = sampler unit */
       exec_tex(mach, inst, TRUE);
       break;
@@ -2026,7 +2026,7 @@ exec_instruction(
 
    case TGSI_OPCODE_TXL:
       /* Texture lookup with explit LOD */
-      /* src[0] = texcoord (src[0].w = load bias) */
+      /* src[0] = texcoord (src[0].w = LOD) */
       /* src[1] = sampler unit */
       exec_tex(mach, inst, TRUE);
       break;
diff --git a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
index f8660e7ad1..40bacf8552 100755
--- a/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
+++ b/src/mesa/pipe/tgsi/exec/tgsi_sse2.c
@@ -48,28 +48,28 @@ _print_reg(
    case file_REG32:
       switch( reg.idx ) {
       case reg_AX:
-         printf( "EAX" );
+         debug_printf( "EAX" );
          break;
       case reg_CX:
-         printf( "ECX" );
+         debug_printf( "ECX" );
          break;
       case reg_DX:
-         printf( "EDX" );
+         debug_printf( "EDX" );
          break;
       case reg_BX:
-         printf( "EBX" );
+         debug_printf( "EBX" );
          break;
       case reg_SP:
-         printf( "ESP" );
+         debug_printf( "ESP" );
          break;
       case reg_BP:
-         printf( "EBP" );
+         debug_printf( "EBP" );
          break;
       case reg_SI:
-         printf( "ESI" );
+         debug_printf( "ESI" );
          break;
       case reg_DI:
-         printf( "EDI" );
+         debug_printf( "EDI" );
          break;
       }
       break;
@@ -77,7 +77,7 @@ _print_reg(
       assert( 0 );
       break;
    case file_XMM:
-      printf( "XMM%u", reg.idx );
+      debug_printf( "XMM%u", reg.idx );
       break;
    case file_x87:
       assert( 0 );
@@ -92,35 +92,35 @@ _fill(
    unsigned count = 10 - strlen( op );
 
    while( count-- ) {
-      printf( " " );
+      debug_printf( " " );
    }
 }
 
-#define DUMP_START() printf( "\nsse-dump start ----------------" )
-#define DUMP_END() printf( "\nsse-dump end ----------------\n" )
-#define DUMP( OP ) printf( "\n%s", OP )
+#define DUMP_START() debug_printf( "\nsse-dump start ----------------" )
+#define DUMP_END() debug_printf( "\nsse-dump end ----------------\n" )
+#define DUMP( OP ) debug_printf( "\n%s", OP )
 #define DUMP_I( OP, I ) do {\
-   printf( "\n%s", OP );\
+   debug_printf( "\n%s", OP );\
    _fill( OP );\
-   printf( "%u", I ); } while( 0 )
+   debug_printf( "%u", I ); } while( 0 )
 #define DUMP_R( OP, R0 ) do {\
-   printf( "\n%s", OP );\
+   debug_printf( "\n%s", OP );\
    _fill( OP );\
    _print_reg( R0 ); } while( 0 )
 #define DUMP_RR( OP, R0, R1 ) do {\
-   printf( "\n%s", OP );\
+   debug_printf( "\n%s", OP );\
    _fill( OP );\
    _print_reg( R0 );\
-   printf( ", " );\
+   debug_printf( ", " );\
    _print_reg( R1 ); } while( 0 )
 #define DUMP_RRI( OP, R0, R1, I ) do {\
-   printf( "\n%s", OP );\
+   debug_printf( "\n%s", OP );\
    _fill( OP );\
    _print_reg( R0 );\
-   printf( ", " );\
+   debug_printf( ", " );\
    _print_reg( R1 );\
-   printf( ", " );\
-   printf( "%u", I ); } while( 0 )
+   debug_printf( ", " );\
+   debug_printf( "%u", I ); } while( 0 )
 
 #else
 
@@ -198,9 +198,15 @@ get_output_base( void )
 static struct x86_reg
 get_temp_base( void )
 {
+#ifdef WIN32
    return x86_make_reg(
       file_REG32,
       reg_BX );
+#else
+   return x86_make_reg(
+      file_REG32,
+      reg_SI );
+#endif
 }
 
 static struct x86_reg
@@ -2248,8 +2254,7 @@ tgsi_emit_sse2(
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
          /* XXX implement this */
-         assert(0);
-         break;
+         return 0;
 
       default:
          assert( 0 );
diff --git a/src/mesa/pipe/tgsi/util/tgsi_build.c b/src/mesa/pipe/tgsi/util/tgsi_build.c
index 67f7d2c2c2..a00ff1c2a5 100644
--- a/src/mesa/pipe/tgsi/util/tgsi_build.c
+++ b/src/mesa/pipe/tgsi/util/tgsi_build.c
@@ -1,3 +1,4 @@
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_build.h"
diff --git a/src/mesa/pipe/tgsi/util/tgsi_dump.c b/src/mesa/pipe/tgsi/util/tgsi_dump.c
index cdbc0dbc9c..b5c54847e0 100644
--- a/src/mesa/pipe/tgsi/util/tgsi_dump.c
+++ b/src/mesa/pipe/tgsi/util/tgsi_dump.c
@@ -25,6 +25,9 @@
  * 
  **************************************************************************/
 
+#include <stdio.h> 
+
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_dump.h"
diff --git a/src/mesa/pipe/tgsi/util/tgsi_parse.c b/src/mesa/pipe/tgsi/util/tgsi_parse.c
index f0f8d44ac2..bf6b89ce56 100644
--- a/src/mesa/pipe/tgsi/util/tgsi_parse.c
+++ b/src/mesa/pipe/tgsi/util/tgsi_parse.c
@@ -25,6 +25,7 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_parse.h"
diff --git a/src/mesa/pipe/tgsi/util/tgsi_util.c b/src/mesa/pipe/tgsi/util/tgsi_util.c
index 1e76b0f133..4cdd89182a 100644
--- a/src/mesa/pipe/tgsi/util/tgsi_util.c
+++ b/src/mesa/pipe/tgsi/util/tgsi_util.c
@@ -1,3 +1,4 @@
+#include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 #include "tgsi_parse.h"
diff --git a/src/mesa/pipe/util/p_debug.c b/src/mesa/pipe/util/p_debug.c
new file mode 100644
index 0000000000..b9607a6ba7
--- /dev/null
+++ b/src/mesa/pipe/util/p_debug.c
@@ -0,0 +1,76 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include <stdarg.h>
+
+#ifdef WIN32
+#include <windows.h>
+#include <winddi.h>
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#endif
+
+#include "pipe/p_debug.h" 
+#include "pipe/p_compiler.h" 
+
+
+void debug_vprintf(const char *format, va_list ap)
+{
+#ifdef WIN32
+   EngDebugPrint("Gallium3D: ", (PCHAR)format, ap);
+#else
+   vfprintf(stderr, format, ap);
+#endif
+}
+
+
+void debug_printf(const char *format, ...)
+{
+   va_list ap;
+   va_start(ap, format);
+   debug_vprintf(format, ap);
+   va_end(ap);
+}
+
+
+static INLINE void debug_abort(void) 
+{
+#ifdef WIN32
+   EngDebugBreak();
+#else
+   abort();
+#endif
+}
+
+
+void debug_assert_fail(const char *expr, const char *file, unsigned line) 
+{
+   debug_printf("%s:%i: Assertion `%s' failed.\n", file, line, expr);
+   debug_abort();
+}
diff --git a/src/mesa/sources b/src/mesa/sources
index 97ef7e1936..84492c91ac 100644
--- a/src/mesa/sources
+++ b/src/mesa/sources
@@ -175,6 +175,9 @@ DRAW_SOURCES = \
 	pipe/draw/draw_vertex_fetch.c \
 	pipe/draw/draw_vertex_shader.c \
 	pipe/draw/draw_vertex_shader_llvm.c \
+	pipe/draw/draw_vf.c \
+	pipe/draw/draw_vf_generic.c \
+	pipe/draw/draw_vf_sse.c \
 	pipe/draw/draw_wide_prims.c
 
 TGSIEXEC_SOURCES = \
@@ -192,6 +195,7 @@ STATECACHE_SOURCES = \
 	pipe/cso_cache/cso_cache.c
 
 PIPEUTIL_SOURCES = \
+	pipe/util/p_debug.c \
 	pipe/util/p_tile.c \
 	pipe/util/p_util.c
 
@@ -230,6 +234,7 @@ STATETRACKER_SOURCES = \
 	state_tracker/st_extensions.c \
 	state_tracker/st_format.c \
 	state_tracker/st_framebuffer.c \
+	state_tracker/st_gen_mipmap.c \
 	state_tracker/st_mesa_to_tgsi.c \
 	state_tracker/st_program.c \
 	state_tracker/st_texture.c
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 1ed9333556..9196918509 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -226,9 +226,11 @@ find_translated_vp(struct st_context *st,
             GLint fpInAttrib = vp_out_to_fp_in(outAttr);
             if (fpInAttrib >= 0) {
                GLuint fpInSlot = stfp->input_to_slot[fpInAttrib];
-               GLuint vpOutSlot = stfp->fs->state.input_map[fpInSlot];
-               xvp->output_to_slot[outAttr] = vpOutSlot;
-               numVpOuts++;
+               if (fpInSlot != ~0) {
+                  GLuint vpOutSlot = stfp->fs->state.input_map[fpInSlot];
+                  xvp->output_to_slot[outAttr] = vpOutSlot;
+                  numVpOuts++;
+               }
             }
             else if (outAttr == VERT_RESULT_PSIZ ||
                      outAttr == VERT_RESULT_BFC0 ||
@@ -247,7 +249,7 @@ find_translated_vp(struct st_context *st,
        * We could use this info to do dead code elimination in the
        * vertex program.
        */
-      dummySlot = stfp->num_input_slots;
+      dummySlot = numVpOuts;
 
       /* Map vert program outputs that aren't used to the dummy slot */
       for (outAttr = 0; outAttr < VERT_RESULT_MAX; outAttr++) {
diff --git a/src/mesa/state_tracker/st_atom_texture.c b/src/mesa/state_tracker/st_atom_texture.c
index fb21d29c40..2a836d630b 100644
--- a/src/mesa/state_tracker/st_atom_texture.c
+++ b/src/mesa/state_tracker/st_atom_texture.c
@@ -36,7 +36,6 @@
 #include "st_atom.h"
 #include "st_cb_texture.h"
 #include "pipe/p_context.h"
-#include "pipe/p_defines.h"
 
 
 /**
@@ -46,24 +45,21 @@
 static void 
 update_textures(struct st_context *st)
 {
-   GLuint s;
-
    /* ST_NEW_FRAGMENT_PROGRAM
     */
    struct gl_fragment_program *fprog = st->ctx->FragmentProgram._Current;
+   GLuint unit;
 
-   for (s = 0; s < st->ctx->Const.MaxTextureCoordUnits; s++) {
-      GLuint su = fprog->Base.SamplerUnits[s];
-      
-      struct gl_texture_object *texObj
-         = st->ctx->Texture.Unit[su]._Current;
-
+   for (unit = 0; unit < st->ctx->Const.MaxTextureCoordUnits; unit++) {
+      const GLuint su = fprog->Base.SamplerUnits[unit];
+      struct gl_texture_object *texObj = st->ctx->Texture.Unit[su]._Current;
       struct pipe_texture *pt;
 
       if (texObj) {
          GLboolean flush, retval;
 
          retval = st_finalize_texture(st->ctx, st->pipe, texObj, &flush);
+         /* XXX retval indicates whether there's a texture border */
 
          pt = st_get_texobj_texture(texObj);
       }
@@ -75,9 +71,9 @@ update_textures(struct st_context *st)
        * this table before being deleted, otherwise the pointer
        * comparison below could fail.
        */
-      if (st->state.sampler_texture[s] != pt) {
-	 st->state.sampler_texture[s] = pt;
-	 st->pipe->set_sampler_texture(st->pipe, s, pt);
+      if (st->state.sampler_texture[unit] != pt) {
+	 st->state.sampler_texture[unit] = pt;
+	 st->pipe->set_sampler_texture(st->pipe, unit, pt);
       }
    }
 }
diff --git a/src/mesa/state_tracker/st_cb_clear.c b/src/mesa/state_tracker/st_cb_clear.c
index 758d4a4086..ab98b54bab 100644
--- a/src/mesa/state_tracker/st_cb_clear.c
+++ b/src/mesa/state_tracker/st_cb_clear.c
@@ -251,7 +251,7 @@ draw_quad(GLcontext *ctx,
       verts[i][1][3] = color[3];
    }
 
-   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 2);
+   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 2, GL_FALSE);
 }
 
 
@@ -408,7 +408,9 @@ check_clear_depth_with_quad(GLcontext *ctx, struct gl_renderbuffer *rb)
    const struct st_renderbuffer *strb = st_renderbuffer(rb);
    const GLboolean isDS = is_depth_stencil_format(strb->surface->format);
    return  ctx->Scissor.Enabled
-      || (isDS && ctx->DrawBuffer->Visual.stencilBits > 0);
+      || (isDS && 
+	  strb->surface->status == PIPE_SURFACE_STATUS_DEFINED &&
+	  ctx->DrawBuffer->Visual.stencilBits > 0);
 }
 
 
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 6b44cba2e4..07886e7982 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -355,8 +355,8 @@ make_fragment_shader_z(struct st_context *st)
  * Create a simple vertex shader that just passes through the
  * vertex position and texcoord (and optionally, color).
  */
-static struct st_vertex_program *
-make_vertex_shader(struct st_context *st, GLboolean passColor)
+struct st_vertex_program *
+st_make_passthrough_vertex_shader(struct st_context *st, GLboolean passColor)
 {
    /* only make programs once and re-use */
    static struct st_vertex_program *progs[2] = { NULL, NULL };
@@ -572,7 +572,7 @@ draw_quad(GLcontext *ctx, GLfloat x0, GLfloat y0, GLfloat z,
       verts[i][1][3] = 1.0; /*Q*/
    }
 
-   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 2);
+   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 2, GL_FALSE);
 }
 
 
@@ -581,10 +581,13 @@ draw_quad_colored(GLcontext *ctx, GLfloat x0, GLfloat y0, GLfloat z,
                   GLfloat x1, GLfloat y1, const GLfloat *color,
                   GLboolean invertTex)
 {
+   GLfloat bias = ctx->st->bitmap_texcoord_bias;
    GLfloat verts[4][3][4]; /* four verts, three attribs, XYZW */
    GLuint i;
-   GLfloat sLeft = 0.0, sRight = 1.0;
-   GLfloat tTop = invertTex, tBot = 1.0 - tTop;
+   GLfloat xBias = bias / (x1-x0);
+   GLfloat yBias = bias / (y1-y0);
+   GLfloat sLeft = 0.0 + xBias, sRight = 1.0 + xBias;
+   GLfloat tTop = invertTex - yBias, tBot = 1.0 - tTop - yBias;
 
    /* upper-left */
    verts[0][0][0] = x0;    /* attr[0].x */
@@ -622,7 +625,7 @@ draw_quad_colored(GLcontext *ctx, GLfloat x0, GLfloat y0, GLfloat z,
       verts[i][2][3] = 1.0; /*Q*/
    }
 
-   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 3);
+   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 3, GL_FALSE);
 }
 
 
@@ -942,7 +945,7 @@ st_DrawPixels(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei height,
    if (format == GL_DEPTH_COMPONENT) {
       ps = st->state.framebuffer.zsbuf;
       stfp = make_fragment_shader_z(ctx->st);
-      stvp = make_vertex_shader(ctx->st, GL_TRUE);
+      stvp = st_make_passthrough_vertex_shader(ctx->st, GL_TRUE);
       color = ctx->Current.RasterColor;
    }
    else if (format == GL_STENCIL_INDEX) {
@@ -953,7 +956,7 @@ st_DrawPixels(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei height,
    else {
       ps = st->state.framebuffer.cbufs[0];
       stfp = combined_drawpix_fragment_program(ctx);
-      stvp = make_vertex_shader(ctx->st, GL_FALSE);
+      stvp = st_make_passthrough_vertex_shader(ctx->st, GL_FALSE);
       color = NULL;
    }
 
@@ -1108,7 +1111,7 @@ st_Bitmap(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei height,
    struct st_context *st = ctx->st;
    struct pipe_texture *pt;
 
-   stvp = make_vertex_shader(ctx->st, GL_TRUE);
+   stvp = st_make_passthrough_vertex_shader(ctx->st, GL_TRUE);
    stfp = combined_bitmap_fragment_program(ctx);
 
    st_validate_state(st);
@@ -1226,13 +1229,13 @@ st_CopyPixels(GLcontext *ctx, GLint srcx, GLint srcy,
       rbRead = st_renderbuffer(ctx->ReadBuffer->_ColorReadBuffer);
       color = NULL;
       stfp = combined_drawpix_fragment_program(ctx);
-      stvp = make_vertex_shader(ctx->st, GL_FALSE);
+      stvp = st_make_passthrough_vertex_shader(ctx->st, GL_FALSE);
    }
    else {
       rbRead = st_renderbuffer(ctx->ReadBuffer->_DepthBuffer);
       color = ctx->Current.Attrib[VERT_ATTRIB_COLOR0];
       stfp = make_fragment_shader_z(ctx->st);
-      stvp = make_vertex_shader(ctx->st, GL_TRUE);
+      stvp = st_make_passthrough_vertex_shader(ctx->st, GL_TRUE);
    }
 
    psRead = rbRead->surface;
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.h b/src/mesa/state_tracker/st_cb_drawpixels.h
index 71ba487020..b8b906f06b 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.h
+++ b/src/mesa/state_tracker/st_cb_drawpixels.h
@@ -30,6 +30,10 @@
 #define ST_CB_DRAWPIXELS_H
 
 
+extern struct st_vertex_program *
+st_make_passthrough_vertex_shader(struct st_context *st, GLboolean passColor);
+
+
 extern void st_init_drawpixels_functions(struct dd_function_table *functions);
 
 
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index ba0950e295..3350254654 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -30,6 +30,7 @@
 #include "main/enums.h"
 #include "main/image.h"
 #include "main/macros.h"
+#include "main/mipmap.h"
 #include "main/texcompress.h"
 #include "main/texformat.h"
 #include "main/teximage.h"
@@ -41,6 +42,7 @@
 #include "state_tracker/st_cb_texture.h"
 #include "state_tracker/st_format.h"
 #include "state_tracker/st_texture.h"
+#include "state_tracker/st_gen_mipmap.h"
 
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
@@ -76,13 +78,13 @@ struct st_texture_object
 
 
 
-
 static INLINE struct st_texture_object *
 st_texture_object(struct gl_texture_object *obj)
 {
    return (struct st_texture_object *) obj;
 }
 
+
 static INLINE struct st_texture_image *
 st_texture_image(struct gl_texture_image *img)
 {
@@ -122,32 +124,28 @@ gl_target_to_pipe(GLenum target)
 }
 
 
+/**
+ * Return nominal bytes per texel for a compressed format, 0 for non-compressed
+ * format.
+ */
 static int
 compressed_num_bytes(GLuint mesaFormat)
 {
-   int bytes = 0;
    switch(mesaFormat) {
-     
    case MESA_FORMAT_RGB_FXT1:
    case MESA_FORMAT_RGBA_FXT1:
    case MESA_FORMAT_RGB_DXT1:
    case MESA_FORMAT_RGBA_DXT1:
-     bytes = 2;
-     break;
-     
+      return 2;
    case MESA_FORMAT_RGBA_DXT3:
    case MESA_FORMAT_RGBA_DXT5:
-     bytes = 4;
+      return 4;
    default:
-     break;
+      return 0;
    }
-   
-   return bytes;
 }
 
 
-
-
 static GLboolean
 st_IsTextureResident(GLcontext * ctx, struct gl_texture_object *texObj)
 {
@@ -164,7 +162,6 @@ st_IsTextureResident(GLcontext * ctx, struct gl_texture_object *texObj)
 }
 
 
-
 static struct gl_texture_image *
 st_NewTextureImage(GLcontext * ctx)
 {
@@ -216,8 +213,6 @@ st_FreeTextureImageData(GLcontext * ctx, struct gl_texture_image *texImage)
 }
 
 
-
-
 /* ================================================================
  * From linux kernel i386 header files, copes with odd sizes better
  * than COPY_DWORDS would:
@@ -290,7 +285,12 @@ logbase2(int n)
 }
 
 
-/* Otherwise, store it in memory if (Border != 0) or (any dimension ==
+/**
+ * Allocate a pipe_texture object for the given st_texture_object using
+ * the given st_texture_image to guess the mipmap size/levels.
+ *
+ * [comments...]
+ * Otherwise, store it in memory if (Border != 0) or (any dimension ==
  * 1).
  *    
  * Otherwise, if max_level >= level >= min_level, create texture with
@@ -302,18 +302,19 @@ logbase2(int n)
 static void
 guess_and_alloc_texture(struct st_context *st,
 			struct st_texture_object *stObj,
-			struct st_texture_image *stImage)
+			const struct st_texture_image *stImage)
 {
    GLuint firstLevel;
    GLuint lastLevel;
    GLuint width = stImage->base.Width;
    GLuint height = stImage->base.Height;
    GLuint depth = stImage->base.Depth;
-   GLuint l2width, l2height, l2depth;
    GLuint i, comp_byte = 0;
 
    DBG("%s\n", __FUNCTION__);
 
+   assert(!stObj->pt);
+
    if (stImage->base.Border)
       return;
 
@@ -355,15 +356,15 @@ guess_and_alloc_texture(struct st_context *st,
       lastLevel = firstLevel;
    }
    else {
-      l2width = logbase2(width);
-      l2height = logbase2(height);
-      l2depth = logbase2(depth);
+      GLuint l2width = logbase2(width);
+      GLuint l2height = logbase2(height);
+      GLuint l2depth = logbase2(depth);
       lastLevel = firstLevel + MAX2(MAX2(l2width, l2height), l2depth);
    }
 
-   assert(!stObj->pt);
    if (stImage->base.IsCompressed)
       comp_byte = compressed_num_bytes(stImage->base.TexFormat->MesaFormat);
+
    stObj->pt = st_texture_create(st,
                                  gl_target_to_pipe(stObj->base.Target),
                                  st_mesa_format_to_pipe_format(stImage->base.TexFormat->MesaFormat),
@@ -487,21 +488,18 @@ try_pbo_upload(GLcontext *ctx,
 
 
 
-
-
-
-
 static void
 st_TexImage(GLcontext * ctx,
-              GLint dims,
-              GLenum target, GLint level,
-              GLint internalFormat,
-              GLint width, GLint height, GLint depth,
-              GLint border,
-              GLenum format, GLenum type, const void *pixels,
-              const struct gl_pixelstore_attrib *unpack,
-              struct gl_texture_object *texObj,
-              struct gl_texture_image *texImage, GLsizei imageSize, int compressed)
+            GLint dims,
+            GLenum target, GLint level,
+            GLint internalFormat,
+            GLint width, GLint height, GLint depth,
+            GLint border,
+            GLenum format, GLenum type, const void *pixels,
+            const struct gl_pixelstore_attrib *unpack,
+            struct gl_texture_object *texObj,
+            struct gl_texture_image *texImage,
+            GLsizei imageSize, int compressed)
 {
    struct st_texture_object *stObj = st_texture_object(texObj);
    struct st_texture_image *stImage = st_texture_image(texImage);
@@ -524,7 +522,7 @@ st_TexImage(GLcontext * ctx,
 
    /* choose the texture format */
    texImage->TexFormat = st_ChooseTextureFormat(ctx, internalFormat,
-                                                  format, type);
+                                                format, type);
 
    _mesa_set_fetch_functions(texImage, dims);
 
@@ -536,7 +534,8 @@ st_TexImage(GLcontext * ctx,
 	 ctx->Driver.CompressedTextureSize(ctx, texImage->Width,
 					   texImage->Height, texImage->Depth,
 					   texImage->TexFormat->MesaFormat);
-   } else {
+   }
+   else {
       texelBytes = texImage->TexFormat->TexelBytes;
       
       /* Minimum pitch of 32 bytes */
@@ -669,7 +668,7 @@ st_TexImage(GLcontext * ctx,
     * conversion and copy:
     */
    if (compressed) {
-     memcpy(texImage->Data, pixels, imageSize);
+      memcpy(texImage->Data, pixels, imageSize);
    }
    else {
       GLuint srcImageStride = _mesa_image_image_stride(unpack, width, height,
@@ -705,13 +704,9 @@ st_TexImage(GLcontext * ctx,
       texImage->Data = NULL;
    }
 
-#if 0
-   /* GL_SGIS_generate_mipmap -- this can be accelerated now.
-    */
+#if 01
    if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      intel_generate_mipmap(ctx, target,
-                            &ctx->Texture.Unit[ctx->Texture.CurrentUnit],
-                            texObj);
+      ctx->Driver.GenerateMipmap(ctx, target, texObj);
    }
 #endif
 }
@@ -1401,7 +1396,10 @@ copy_image_data_to_texture(struct st_context *st,
 }
 
 
-/*  
+/**
+ * Called during state validation.  When this function is finished,
+ * the texture object should be ready for rendering.
+ * \return GL_FALSE if a texture border is present, GL_TRUE otherwise
  */
 GLboolean
 st_finalize_texture(GLcontext *ctx,
@@ -1410,11 +1408,10 @@ st_finalize_texture(GLcontext *ctx,
 		    GLboolean *needFlush)
 {
    struct st_texture_object *stObj = st_texture_object(tObj);
+   const GLuint nr_faces = (stObj->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
    int comp_byte = 0;
    int cpp;
-
    GLuint face, i;
-   GLuint nr_faces = 0;
    struct st_texture_image *firstImage;
 
    *needFlush = GL_FALSE;
@@ -1426,8 +1423,7 @@ st_finalize_texture(GLcontext *ctx,
    /* What levels must the texture include at a minimum?
     */
    calculate_first_last_level(stObj);
-   firstImage =
-      st_texture_image(stObj->base.Image[0][stObj->firstLevel]);
+   firstImage = st_texture_image(stObj->base.Image[0][stObj->firstLevel]);
 
    /* Fallback case:
     */
@@ -1503,7 +1499,6 @@ st_finalize_texture(GLcontext *ctx,
 
    /* Pull in any images not in the object's texture:
     */
-   nr_faces = (stObj->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
    for (face = 0; face < nr_faces; face++) {
       for (i = stObj->firstLevel; i <= stObj->lastLevel; i++) {
          struct st_texture_image *stImage =
@@ -1540,6 +1535,7 @@ st_init_texture_functions(struct dd_function_table *functions)
    functions->CopyTexSubImage1D = st_CopyTexSubImage1D;
    functions->CopyTexSubImage2D = st_CopyTexSubImage2D;
    functions->CopyTexSubImage3D = st_CopyTexSubImage3D;
+   functions->GenerateMipmap = st_generate_mipmap;
 
    functions->GetTexImage = st_GetTexImage;
 
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 9c206c057a..bf4618bed8 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -49,6 +49,7 @@
 #include "st_atom.h"
 #include "st_draw.h"
 #include "st_extensions.h"
+#include "st_gen_mipmap.h"
 #include "st_program.h"
 #include "pipe/p_context.h"
 #include "pipe/p_winsys.h"
@@ -96,6 +97,7 @@ st_create_context_priv( GLcontext *ctx, struct pipe_context *pipe )
 
    st_init_atoms( st );
    st_init_draw( st );
+   st_init_generate_mipmap(st);
 
    /* we want all vertex data to be placed in buffer objects */
    vbo_use_buffer_objects(ctx);
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 2b6f8743f3..a756055898 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -143,6 +143,8 @@ struct st_context
 
    GLfloat polygon_offset_scale; /* ?? */
 
+   GLfloat bitmap_texcoord_bias;
+
    /** Mapping from VERT_RESULT_x to post-transformed vertex slot */
    const GLuint *vertex_result_to_slot;
 
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index 8ef50ee768..ae9f5c8b11 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -298,6 +298,7 @@ st_draw_vbo(GLcontext *ctx,
          break;
       default:
          assert(0);
+	 return;
       }
 
       /* get/create the index buffer object */
@@ -353,7 +354,8 @@ st_draw_vbo(GLcontext *ctx,
 void 
 st_draw_vertices(GLcontext *ctx, unsigned prim,
                  unsigned numVertex, float *verts,
-                 unsigned numAttribs)
+                 unsigned numAttribs,
+                 GLboolean inClipCoords)
 {
    const float width = ctx->DrawBuffer->Width;
    const float height = ctx->DrawBuffer->Height;
@@ -366,14 +368,16 @@ st_draw_vertices(GLcontext *ctx, unsigned prim,
 
    assert(numAttribs > 0);
 
-   /* convert to clip coords */
-   for (i = 0; i < numVertex; i++) {
-      float x = verts[i * numAttribs * 4 + 0];
-      float y = verts[i * numAttribs * 4 + 1];
-      x = x / width * 2.0 - 1.0;
-      y = y / height * 2.0 - 1.0;
-      verts[i * numAttribs * 4 + 0] = x;
-      verts[i * numAttribs * 4 + 1] = y;
+   if (!inClipCoords) {
+      /* convert to clip coords */
+      for (i = 0; i < numVertex; i++) {
+         float x = verts[i * numAttribs * 4 + 0];
+         float y = verts[i * numAttribs * 4 + 1];
+         x = x / width * 2.0 - 1.0;
+         y = y / height * 2.0 - 1.0;
+         verts[i * numAttribs * 4 + 0] = x;
+         verts[i * numAttribs * 4 + 1] = y;
+      }
    }
 
    /* XXX create one-time */
@@ -570,6 +574,7 @@ st_feedback_draw_vbo(GLcontext *ctx,
          break;
       default:
          assert(0);
+	 return;
       }
 
       map = pipe->winsys->buffer_map(pipe->winsys,
diff --git a/src/mesa/state_tracker/st_draw.h b/src/mesa/state_tracker/st_draw.h
index 89ee790c57..171bde57e5 100644
--- a/src/mesa/state_tracker/st_draw.h
+++ b/src/mesa/state_tracker/st_draw.h
@@ -62,7 +62,8 @@ st_feedback_draw_vbo(GLcontext *ctx,
 void 
 st_draw_vertices(GLcontext *ctx, unsigned prim,
                  unsigned numVertex, float *verts,
-                 unsigned numAttribs);
+                 unsigned numAttribs,
+                 GLboolean inClipCoords);
 
 
 #endif
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 0157bdd6b3..97d28d77c4 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -106,6 +106,9 @@ void st_init_limits(struct st_context *st)
 
    c->MaxTextureLodBias
       = pipe->get_paramf(pipe, PIPE_CAP_MAX_TEXTURE_LOD_BIAS);
+
+   st->bitmap_texcoord_bias
+      = pipe->get_paramf(pipe, PIPE_CAP_BITMAP_TEXCOORD_BIAS);
 }
 
 
diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
new file mode 100644
index 0000000000..a6ac9a55fb
--- /dev/null
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -0,0 +1,363 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "main/imports.h"
+#include "main/mipmap.h"
+#include "main/teximage.h"
+
+#include "shader/prog_instruction.h"
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/cso_cache/cso_cache.h"
+
+#include "st_context.h"
+#include "st_draw.h"
+#include "st_gen_mipmap.h"
+#include "st_program.h"
+#include "st_cb_drawpixels.h"
+#include "st_cb_texture.h"
+
+
+
+static void *blend_cso = NULL;
+static void *depthstencil_cso = NULL;
+static void *rasterizer_cso = NULL;
+static void *sampler_cso = NULL;
+
+static struct st_fragment_program *stfp = NULL;
+static struct st_vertex_program *stvp = NULL;
+
+
+
+static struct st_fragment_program *
+make_tex_fragment_program(GLcontext *ctx)
+{
+   struct st_fragment_program *stfp;
+   struct gl_program *p;
+   GLuint ic = 0;
+
+   p = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, 0);
+   if (!p)
+      return NULL;
+
+   p->NumInstructions = 2;
+
+   p->Instructions = _mesa_alloc_instructions(p->NumInstructions);
+   if (!p->Instructions) {
+      ctx->Driver.DeleteProgram(ctx, p);
+      return NULL;
+   }
+   _mesa_init_instructions(p->Instructions, p->NumInstructions);
+
+   /* TEX result.color, fragment.texcoord[0], texture[0], 2D; */
+   p->Instructions[ic].Opcode = OPCODE_TEX;
+   p->Instructions[ic].DstReg.File = PROGRAM_OUTPUT;
+   p->Instructions[ic].DstReg.Index = FRAG_RESULT_COLR;
+   p->Instructions[ic].SrcReg[0].File = PROGRAM_INPUT;
+   p->Instructions[ic].SrcReg[0].Index = FRAG_ATTRIB_TEX0;
+   p->Instructions[ic].TexSrcUnit = 0;
+   p->Instructions[ic].TexSrcTarget = TEXTURE_2D_INDEX;
+   ic++;
+
+   /* END; */
+   p->Instructions[ic++].Opcode = OPCODE_END;
+
+   assert(ic == p->NumInstructions);
+
+   p->InputsRead = FRAG_BIT_TEX0;
+   p->OutputsWritten = (1 << FRAG_RESULT_COLR);
+
+   stfp = (struct st_fragment_program *) p;
+
+   st_translate_fragment_program(ctx->st, stfp, NULL,
+                                 stfp->tokens, ST_MAX_SHADER_TOKENS);
+
+   return stfp;
+}
+
+
+
+
+/**
+ * one-time init for generate mipmap
+ * XXX Note: there may be other times we need no-op/simple state like this.
+ * In that case, some code refactoring would be good.
+ */
+void
+st_init_generate_mipmap(struct st_context *st)
+{
+   struct pipe_context *pipe = st->pipe;
+   struct pipe_blend_state blend;
+   struct pipe_rasterizer_state rasterizer;
+   struct pipe_sampler_state sampler;
+   struct pipe_depth_stencil_alpha_state depthstencil;
+
+   assert(!blend_cso);
+
+   memset(&blend, 0, sizeof(blend));
+   blend.colormask = PIPE_MASK_RGBA;
+   blend_cso = pipe->create_blend_state(pipe, &blend);
+
+   memset(&depthstencil, 0, sizeof(depthstencil));
+   depthstencil_cso = pipe->create_depth_stencil_alpha_state(pipe, &depthstencil);
+
+   memset(&rasterizer, 0, sizeof(rasterizer));
+   rasterizer_cso = pipe->create_rasterizer_state(pipe, &rasterizer);
+
+   memset(&sampler, 0, sizeof(sampler));
+   sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
+   sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
+   sampler.min_img_filter = PIPE_TEX_FILTER_LINEAR;
+   sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
+   sampler.normalized_coords = 1;
+   sampler_cso = pipe->create_sampler_state(pipe, &sampler);
+
+   stfp = make_tex_fragment_program(st->ctx);
+   stvp = st_make_passthrough_vertex_shader(st, GL_FALSE);
+}
+
+
+void
+st_destroy_generate_mipmpap(struct st_context *st)
+{
+   struct pipe_context *pipe = st->pipe;
+
+   pipe->delete_blend_state(pipe, blend_cso);
+   pipe->delete_depth_stencil_alpha_state(pipe, depthstencil_cso);
+   pipe->delete_rasterizer_state(pipe, rasterizer_cso);
+   pipe->delete_sampler_state(pipe, sampler_cso);
+
+   /* XXX free stfp, stvp */
+
+   blend_cso = NULL;
+   depthstencil_cso = NULL;
+   rasterizer_cso = NULL;
+   sampler_cso = NULL;
+}
+
+
+static void
+simple_viewport(struct pipe_context *pipe, uint width, uint height)
+{
+   struct pipe_viewport_state vp;
+
+   vp.scale[0] =  0.5 * width;
+   vp.scale[1] = -0.5 * height;
+   vp.scale[2] = 1.0;
+   vp.scale[3] = 1.0;
+   vp.translate[0] = 0.5 * width;
+   vp.translate[1] = 0.5 * height;
+   vp.translate[2] = 0.0;
+   vp.translate[3] = 0.0;
+
+   pipe->set_viewport_state(pipe, &vp);
+}
+
+
+
+/*
+ * Draw simple [-1,1]x[-1,1] quad
+ */
+static void
+draw_quad(GLcontext *ctx)
+{
+   GLfloat verts[4][2][4]; /* four verts, two attribs, XYZW */
+   GLuint i;
+   GLfloat sLeft = 0.0, sRight = 1.0;
+   GLfloat tTop = 1.0, tBot = 0.0;
+   GLfloat x0 = -1.0, x1 = 1.0;
+   GLfloat y0 = -1.0, y1 = 1.0;
+
+   /* upper-left */
+   verts[0][0][0] = x0;    /* attr[0].x */
+   verts[0][0][1] = y0;    /* attr[0].y */
+   verts[0][1][0] = sLeft; /* attr[1].s */
+   verts[0][1][1] = tTop;  /* attr[1].t */
+
+   /* upper-right */
+   verts[1][0][0] = x1;
+   verts[1][0][1] = y0;
+   verts[1][1][0] = sRight;
+   verts[1][1][1] = tTop;
+
+   /* lower-right */
+   verts[2][0][0] = x1;
+   verts[2][0][1] = y1;
+   verts[2][1][0] = sRight;
+   verts[2][1][1] = tBot;
+
+   /* lower-left */
+   verts[3][0][0] = x0;
+   verts[3][0][1] = y1;
+   verts[3][1][0] = sLeft;
+   verts[3][1][1] = tBot;
+
+   /* same for all verts: */
+   for (i = 0; i < 4; i++) {
+      verts[i][0][2] = 0.0; /*Z*/
+      verts[i][0][3] = 1.0; /*W*/
+      verts[i][1][2] = 0.0; /*R*/
+      verts[i][1][3] = 1.0; /*Q*/
+   }
+
+   st_draw_vertices(ctx, PIPE_PRIM_QUADS, 4, (float *) verts, 2, GL_TRUE);
+}
+
+
+
+/**
+ * Generate mipmap levels using hardware rendering.
+ * \return TRUE if successful, FALSE if not possible
+ */
+static boolean
+st_render_mipmap(struct st_context *st,
+                 struct pipe_texture *pt,
+                 uint baseLevel, uint lastLevel)
+{
+   struct pipe_context *pipe = st->pipe;
+   struct pipe_framebuffer_state fb;
+   const uint face = 0, zslice = 0;
+   const uint first_level_save = pt->first_level;
+   uint dstLevel;
+
+   /* check if we can render in the texture's format */
+   if (!pipe->is_format_supported(pipe, pt->format, PIPE_SURFACE)) {
+      return FALSE;
+   }
+
+   /* init framebuffer state */
+   memset(&fb, 0, sizeof(fb));
+   fb.num_cbufs = 1;
+
+   /* bind CSOs */
+   pipe->bind_blend_state(pipe, blend_cso);
+   pipe->bind_depth_stencil_alpha_state(pipe, depthstencil_cso);
+   pipe->bind_rasterizer_state(pipe, rasterizer_cso);
+   pipe->bind_sampler_state(pipe, 0, sampler_cso);
+
+   /* bind shaders */
+   pipe->bind_fs_state(pipe, stfp->fs->data);
+   pipe->bind_vs_state(pipe, stvp->cso->data);
+
+   for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) {
+      const uint srcLevel = dstLevel - 1;
+
+      /*
+       * Setup framebuffer / dest surface
+       */
+      fb.cbufs[0] = pipe->get_tex_surface(pipe, pt, face, dstLevel, zslice);
+      pipe->set_framebuffer_state(pipe, &fb);
+
+      simple_viewport(pipe, pt->width[dstLevel], pt->height[dstLevel]);
+
+      /*
+       * Setup src texture, override pt->first_level so we sample from
+       * the right mipmap level.
+       */
+      pt->first_level = srcLevel;
+      pipe->set_sampler_texture(pipe, 0, pt);
+
+      draw_quad(st->ctx);
+   }
+
+   /* restore first_level */
+   pt->first_level = first_level_save;
+
+   /* restore pipe state */
+   if (st->state.rasterizer)
+      pipe->bind_rasterizer_state(pipe, st->state.rasterizer->data);
+   if (st->state.fs)
+      pipe->bind_fs_state(pipe, st->state.fs->data);
+   if (st->state.vs)
+      pipe->bind_vs_state(pipe, st->state.vs->cso->data);
+   if (st->state.sampler[0])
+      pipe->bind_sampler_state(pipe, 0, st->state.sampler[0]->data);
+   pipe->set_sampler_texture(pipe, 0, st->state.sampler_texture[0]);
+   pipe->set_viewport_state(pipe, &st->state.viewport);
+
+   return TRUE;
+}
+
+
+
+void
+st_generate_mipmap(GLcontext *ctx, GLenum target,
+                   struct gl_texture_object *texObj)
+{
+   struct st_context *st = ctx->st;
+   struct pipe_texture *pt = st_get_texobj_texture(texObj);
+   const uint baseLevel = texObj->BaseLevel;
+   const uint lastLevel = pt->last_level;
+   uint dstLevel;
+
+   if (!st_render_mipmap(st, pt, baseLevel, lastLevel)) {
+      abort();
+      /* XXX the following won't really work at this time */
+      _mesa_generate_mipmap(ctx, target, texObj);
+      return;
+   }
+
+   for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) {
+      const uint srcLevel = dstLevel - 1;
+      const struct gl_texture_image *srcImage
+         = _mesa_get_tex_image(ctx, texObj, target, srcLevel);
+      struct gl_texture_image *dstImage;
+      struct st_texture_image *stImage;
+      uint dstWidth = pt->width[dstLevel];
+      uint dstHeight = pt->height[dstLevel];
+      uint dstDepth = pt->depth[dstLevel];
+      uint border = srcImage->Border;
+
+
+      dstImage = _mesa_get_tex_image(ctx, texObj, target, dstLevel);
+      if (!dstImage) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "generating mipmaps");
+         return;
+      }
+
+      if (dstImage->ImageOffsets)
+         _mesa_free(dstImage->ImageOffsets);
+
+      /* Free old image data */
+      if (dstImage->Data)
+         ctx->Driver.FreeTexImageData(ctx, dstImage);
+
+      /* initialize new image */
+      _mesa_init_teximage_fields(ctx, target, dstImage, dstWidth, dstHeight,
+                                 dstDepth, border, srcImage->InternalFormat);
+
+      dstImage->TexFormat = srcImage->TexFormat;
+
+      stImage = (struct st_texture_image *) dstImage;
+      stImage->pt = pt;
+   }
+
+}
diff --git a/src/mesa/state_tracker/st_gen_mipmap.h b/src/mesa/state_tracker/st_gen_mipmap.h
new file mode 100644
index 0000000000..7668c1e44e
--- /dev/null
+++ b/src/mesa/state_tracker/st_gen_mipmap.h
@@ -0,0 +1,46 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#ifndef ST_GEN_MIPMAP_H
+#define ST_GEN_MIPMAP_H
+
+
+extern void
+st_init_generate_mipmap(struct st_context *st);
+
+
+extern void
+st_destroy_generate_mipmpap(struct st_context *st);
+
+
+extern void
+st_generate_mipmap(GLcontext *ctx, GLenum target,
+                   struct gl_texture_object *texObj);
+
+
+#endif /* ST_GEN_MIPMAP_H */
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 1f1e6500e0..84a9094001 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -33,6 +33,7 @@
 
 #include "main/imports.h"
 #include "main/mtypes.h"
+#include "shader/prog_print.h"
 
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
@@ -68,6 +69,7 @@ st_translate_vertex_program(struct st_context *st,
    struct pipe_shader_state vs;
    const struct cso_vertex_shader *cso;
    GLuint attr, i;
+   GLuint num_generic = 0;
 
    memset(&vs, 0, sizeof(vs));
 
@@ -117,7 +119,7 @@ st_translate_vertex_program(struct st_context *st,
          case VERT_ATTRIB_TEX6:
          case VERT_ATTRIB_TEX7:
             vs.input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            vs.input_semantic_index[slot] = attr - VERT_ATTRIB_TEX0;
+            vs.input_semantic_index[slot] = num_generic++;
             break;
          case VERT_ATTRIB_GENERIC0:
          case VERT_ATTRIB_GENERIC1:
@@ -129,7 +131,7 @@ st_translate_vertex_program(struct st_context *st,
          case VERT_ATTRIB_GENERIC7:
             assert(attr < VERT_ATTRIB_MAX);
             vs.input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            vs.input_semantic_index[slot] = attr - VERT_ATTRIB_GENERIC0;
+            vs.input_semantic_index[slot] = num_generic++;
             break;
          default:
             assert(0);
@@ -143,6 +145,7 @@ st_translate_vertex_program(struct st_context *st,
       vs.output_semantic_index[i] = 0;
    }
 
+   num_generic = 0;
    /*
     * Determine number of outputs, the (default) output register
     * mapping and the semantic information for each output.
@@ -207,14 +210,14 @@ st_translate_vertex_program(struct st_context *st,
          case VERT_RESULT_TEX6:
          case VERT_RESULT_TEX7:
             vs.output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            vs.output_semantic_index[slot] = attr - VERT_RESULT_TEX0;
+            vs.output_semantic_index[slot] = num_generic++;
             break;
          case VERT_RESULT_VAR0:
             /* fall-through */
          default:
             assert(attr - VERT_RESULT_VAR0 < MAX_VARYING);
             vs.output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            vs.output_semantic_index[slot] = attr - VERT_RESULT_VAR0;
+            vs.output_semantic_index[slot] = num_generic++;
          }
       }
    }
@@ -258,6 +261,9 @@ st_translate_vertex_program(struct st_context *st,
    cso = st_cached_vs_state(st, &vs);
    stvp->cso = cso;
 
+   if (0)
+      _mesa_print_program(&stvp->Base.Base);
+
    if (TGSI_DEBUG)
       tgsi_dump( tokensOut, 0 );
 }
@@ -286,6 +292,7 @@ st_translate_fragment_program(struct st_context *st,
    GLuint attr;
    const GLbitfield inputsRead = stfp->Base.Base.InputsRead;
    GLuint vslot = 0;
+   GLuint num_generic = 0;
 
    memset(&fs, 0, sizeof(fs));
 
@@ -338,14 +345,14 @@ st_translate_fragment_program(struct st_context *st,
          case FRAG_ATTRIB_TEX6:
          case FRAG_ATTRIB_TEX7:
             fs.input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            fs.input_semantic_index[slot] = attr - FRAG_ATTRIB_TEX0;
+            fs.input_semantic_index[slot] = num_generic++;
             interpMode[slot] = TGSI_INTERPOLATE_PERSPECTIVE;
             break;
          case FRAG_ATTRIB_VAR0:
             /* fall-through */
          default:
             fs.input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
-            fs.input_semantic_index[slot] = attr - FRAG_ATTRIB_VAR0;
+            fs.input_semantic_index[slot] = num_generic++;
             interpMode[slot] = TGSI_INTERPOLATE_PERSPECTIVE;
          }
       }
@@ -415,6 +422,9 @@ st_translate_fragment_program(struct st_context *st,
    cso = st_cached_fs_state(st, &fs);
    stfp->fs = cso;
 
+   if (0)
+      _mesa_print_program(&stfp->Base.Base);
+
    if (TGSI_DEBUG)
       tgsi_dump( tokensOut, 0/*TGSI_DUMP_VERBOSE*/ );
 
diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
index 15cc458be8..844a9f80d8 100644
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -59,6 +59,10 @@ target_to_target(GLenum target)
 }
 #endif
 
+
+/**
+ * Allocate a new pipe_texture object
+ */
 struct pipe_texture *
 st_texture_create(struct st_context *st,
                   enum pipe_texture_target target,
@@ -70,7 +74,7 @@ st_texture_create(struct st_context *st,
 		  GLuint depth0,
 		  GLuint compress_byte)
 {
-   struct pipe_texture *pt = CALLOC_STRUCT(pipe_texture);
+   struct pipe_texture pt;
 
    assert(target <= PIPE_TEXTURE_CUBE);
 
@@ -78,39 +82,33 @@ st_texture_create(struct st_context *st,
        _mesa_lookup_enum_by_nr(target),
        _mesa_lookup_enum_by_nr(format), first_level, last_level);
 
-   if (!pt)
-      return NULL;
-
    assert(format);
 
-   pt->target = target;
-   pt->format = format;
-   pt->first_level = first_level;
-   pt->last_level = last_level;
-   pt->width[0] = width0;
-   pt->height[0] = height0;
-   pt->depth[0] = depth0;
-   pt->compressed = compress_byte ? 1 : 0;
-   pt->cpp = pt->compressed ? compress_byte : st_sizeof_format(format);
-   pt->refcount = 1; 
+   pt.target = target;
+   pt.format = format;
+   pt.first_level = first_level;
+   pt.last_level = last_level;
+   pt.width[0] = width0;
+   pt.height[0] = height0;
+   pt.depth[0] = depth0;
+   pt.compressed = compress_byte ? 1 : 0;
+   pt.cpp = pt.compressed ? compress_byte : st_sizeof_format(format);
+   pt.refcount = 1; 
 
-   st->pipe->texture_create(st->pipe, &pt);
-
-   return pt;
+   return st->pipe->texture_create(st->pipe, &pt);
 }
 
 
-
-
-/* Can the image be pulled into a unified mipmap texture.  This mirrors
- * the completeness test in a lot of ways.
+/**
+ * Check if a texture image be pulled into a unified mipmap texture.
+ * This mirrors the completeness test in a lot of ways.
  *
  * Not sure whether I want to pass gl_texture_image here.
  */
 GLboolean
-st_texture_match_image(struct pipe_texture *pt,
-                          struct gl_texture_image *image,
-                          GLuint face, GLuint level)
+st_texture_match_image(const struct pipe_texture *pt,
+                       const struct gl_texture_image *image,
+                       GLuint face, GLuint level)
 {
    /* Images with borders are never pulled into mipmap textures. 
     */
@@ -189,6 +187,7 @@ st_texture_image_map(struct st_context *st, struct st_texture_image *stImage,
    return pipe_surface_map(stImage->surface);
 }
 
+
 void
 st_texture_image_unmap(struct st_texture_image *stImage)
 {
@@ -201,7 +200,8 @@ st_texture_image_unmap(struct st_texture_image *stImage)
 
 
 
-/* Upload data to a rectangular sub-region.  Lots of choices how to do this:
+/**
+ * Upload data to a rectangular sub-region.  Lots of choices how to do this:
  *
  * - memcpy by span to current destination
  * - upload data as new buffer and blit
@@ -261,13 +261,14 @@ st_texture_image_data(struct pipe_context *pipe,
    }
 }
 
+
 /* Copy mipmap image between textures
  */
 void
 st_texture_image_copy(struct pipe_context *pipe,
-                         struct pipe_texture *dst,
-                         GLuint face, GLuint level,
-                         struct pipe_texture *src)
+                      struct pipe_texture *dst,
+                      GLuint face, GLuint level,
+                      struct pipe_texture *src)
 {
    GLuint width = src->width[level];
    GLuint height = src->height[level];
@@ -278,6 +279,7 @@ st_texture_image_copy(struct pipe_context *pipe,
 
    if (dst->compressed)
       height /= 4;
+
    for (i = 0; i < depth; i++) {
       dst_surface = pipe->get_tex_surface(pipe, dst, face, level, i);
       src_surface = pipe->get_tex_surface(pipe, src, face, level, i);
@@ -292,5 +294,4 @@ st_texture_image_copy(struct pipe_context *pipe,
       pipe_surface_reference(&dst_surface, NULL);
       pipe_surface_reference(&src_surface, NULL);
    }
-
 }
diff --git a/src/mesa/state_tracker/st_texture.h b/src/mesa/state_tracker/st_texture.h
index d8b1bcad9d..0b87a494c3 100644
--- a/src/mesa/state_tracker/st_texture.h
+++ b/src/mesa/state_tracker/st_texture.h
@@ -47,11 +47,11 @@ st_texture_create(struct st_context *st,
                   GLuint compress_byte);
 
 
-/* Check if an image fits an existing texture
+/* Check if an image fits into an existing texture object.
  */
 extern GLboolean
-st_texture_match_image(struct pipe_texture *pt,
-                       struct gl_texture_image *image,
+st_texture_match_image(const struct pipe_texture *pt,
+                       const struct gl_texture_image *image,
                        GLuint face, GLuint level);
 
 /* Return a pointer to an image within a texture.  Return image stride as
@@ -73,7 +73,7 @@ extern const GLuint *
 st_texture_depth_offsets(struct pipe_texture *pt, GLuint level);
 
 
-/* Return the linear offset of an image relative to the start of its region:
+/* Return the linear offset of an image relative to the start of its region.
  */
 extern GLuint
 st_texture_image_offset(const struct pipe_texture *pt,
diff --git a/src/mesa/x86/rtasm/x86sse.c b/src/mesa/x86/rtasm/x86sse.c
index 56c211eee0..f8da6e405f 100644
--- a/src/mesa/x86/rtasm/x86sse.c
+++ b/src/mesa/x86/rtasm/x86sse.c
@@ -1137,6 +1137,7 @@ void x86_init_func_size( struct x86_function *p, unsigned code_size )
 void x86_release_func( struct x86_function *p )
 {
    _mesa_exec_free(p->store);
+   p->store = NULL;
 }